Qwen3-0.6B-h256-l8-a8-ctx256-pred64-vocab4096-NonUniform-lr1.0e-04-bs128-steps40000 / training_config.json
{
  "pretrained_model_name_or_path": "Qwen/Qwen3-0.6B",
  "output_dir": "./outputs",
  "seed": 42,
  "tf32": true,
  "model_type": "causal",
  "vocab_size": 4096,
  "hidden_size": 256,
  "num_hidden_layers": 8,
  "num_attention_heads": 8,
  "context_length": 256,
  "prediction_length": 64,
  "tokenizer_class": "NonUniformBins",
  "binning_power": 2.0,
  "exponential_base": 1.01,
  "n_special_tokens": 2,
  "pad_token_id": 0,
  "eos_token_id": 1,
  "use_eos_token": true,
  "min_past": 64,
  "drop_prob": 0.1,
  "shuffle_buffer_length": 20000,
  "per_device_train_batch_size": 32,
  "learning_rate": 0.0001,
  "max_steps": 40000,
  "warmup_ratio": 0.1,
  "lr_scheduler_type": "cosine",
  "optim": "adamw_torch",
  "gradient_accumulation_steps": 4,
  "log_steps": 20,
  "save_steps": 100,
  "dataloader_num_workers": 8,
  "torch_compile": true,
  "logger": "<Logger __main__ (INFO)>",
  "total_train_batch_size": 128,
  "short_model_name": "Qwen3-0.6B",
  "run_name": "Qwen3-0.6B-h256-l8-a8-ctx256-pred64-vocab4096-NonUniform-lr1.0e-04-bs128-steps40000"
}
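
The config stores both raw hyperparameters and values derived from them: total_train_batch_size is per_device_train_batch_size x gradient_accumulation_steps (32 x 4 = 128, assuming a single device, which the file does not record), and run_name is assembled from the architecture and training fields. A minimal Python sketch that loads the file and re-checks both derived values; the file path and the exact name-assembly scheme are inferred from the run_name string itself, not taken from the training code:

import json

# Load the saved training config (path is an assumption; adjust to your checkout).
with open("training_config.json") as f:
    cfg = json.load(f)

# Sanity-check the derived batch size: per-device batch x grad-accum steps
# (x world size, assumed 1 here since the config does not record it).
expected_total = cfg["per_device_train_batch_size"] * cfg["gradient_accumulation_steps"]
assert cfg["total_train_batch_size"] == expected_total

# Rebuild the run name from the fields it appears to be derived from and
# compare against the stored value.
short_tok = cfg["tokenizer_class"].removesuffix("Bins")  # "NonUniformBins" -> "NonUniform"
run_name = (
    f"{cfg['short_model_name']}"
    f"-h{cfg['hidden_size']}-l{cfg['num_hidden_layers']}-a{cfg['num_attention_heads']}"
    f"-ctx{cfg['context_length']}-pred{cfg['prediction_length']}"
    f"-vocab{cfg['vocab_size']}-{short_tok}"
    f"-lr{cfg['learning_rate']:.1e}-bs{cfg['total_train_batch_size']}"
    f"-steps{cfg['max_steps']}"
)
assert run_name == cfg["run_name"], (run_name, cfg["run_name"])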
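
tokenizer_class "NonUniformBins" with binning_power 2.0 suggests power-spaced value bins that are denser near zero than equal-width uniform bins, with vocab_size - n_special_tokens = 4094 bins left for values after the pad and EOS tokens. The actual implementation is not shown on this page, so the sketch below is hypothetical: the edge formula, the +/-15 clipping limit, and all names are assumptions, and exponential_base presumably configures an alternative spacing that is ignored here.

import numpy as np

# Hypothetical sketch of power-law bin centers implied by binning_power=2.0:
# dense near 0, sparse toward the limits. Not the repo's code.
def make_nonuniform_bin_centers(vocab_size: int, n_special_tokens: int,
                                binning_power: float, limit: float = 15.0) -> np.ndarray:
    """Return bin centers for mean-scaled values, denser around zero."""
    n_bins = vocab_size - n_special_tokens          # 4096 - 2 = 4094 value bins
    u = np.linspace(-1.0, 1.0, n_bins)              # uniform grid on [-1, 1]
    # Sign-preserving power map: |u|^p concentrates resolution near zero,
    # where mean-scaled series values typically cluster.
    return np.sign(u) * np.abs(u) ** binning_power * limit

centers = make_nonuniform_bin_centers(4096, 2, 2.0)
# Tokenize one (already mean-scaled) value by nearest bin center,
# offsetting past the two special tokens (pad=0, eos=1).
token_id = int(np.argmin(np.abs(centers - 0.37))) + 2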