Qwen3-0.6B-h256-l8-a8-ctx256-pred64-vocab4096-NonUniform-lr1.0e-04-bs128-steps40000 / training_config.json
{
  "pretrained_model_name_or_path": "Qwen/Qwen3-0.6B",
  "output_dir": "./outputs",
  "seed": 42,
  "tf32": true,
  "model_type": "causal",
  "vocab_size": 4096,
  "hidden_size": 256,
  "num_hidden_layers": 8,
  "num_attention_heads": 8,
  "context_length": 256,
  "prediction_length": 64,
  "tokenizer_class": "NonUniformBins",
  "binning_power": 2.0,
  "exponential_base": 1.01,
  "n_special_tokens": 2,
  "pad_token_id": 0,
  "eos_token_id": 1,
  "use_eos_token": true,
  "min_past": 64,
  "drop_prob": 0.1,
  "shuffle_buffer_length": 20000,
  "per_device_train_batch_size": 32,
  "learning_rate": 0.0001,
  "max_steps": 40000,
  "warmup_ratio": 0.1,
  "lr_scheduler_type": "cosine",
  "optim": "adamw_torch",
  "gradient_accumulation_steps": 4,
  "log_steps": 20,
  "save_steps": 100,
  "dataloader_num_workers": 8,
  "torch_compile": true,
  "logger": "<Logger __main__ (INFO)>",
  "total_train_batch_size": 128,
  "short_model_name": "Qwen3-0.6B",
  "run_name": "Qwen3-0.6B-h256-l8-a8-ctx256-pred64-vocab4096-NonUniform-lr1.0e-04-bs128-steps40000"
}
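
The config stores both raw hyperparameters and values derived from them: total_train_batch_size is per_device_train_batch_size x gradient_accumulation_steps (32 x 4 = 128, assuming a single device, which the file does not record), and run_name is assembled from the architecture and training fields. A minimal Python sketch that loads the file and re-checks both derived values; the file path and the exact name-assembly scheme are inferred from the run_name string itself, not taken from the training code:

import json

# Load the saved training config (path is an assumption; adjust to your checkout).
with open("training_config.json") as f:
    cfg = json.load(f)

# Sanity-check the derived batch size: per-device batch x grad-accum steps
# (x world size, assumed 1 here since the config does not record it).
expected_total = cfg["per_device_train_batch_size"] * cfg["gradient_accumulation_steps"]
assert cfg["total_train_batch_size"] == expected_total

# Rebuild the run name from the fields it appears to be derived from and
# compare against the stored value.
short_tok = cfg["tokenizer_class"].removesuffix("Bins")  # "NonUniformBins" -> "NonUniform"
run_name = (
    f"{cfg['short_model_name']}"
    f"-h{cfg['hidden_size']}-l{cfg['num_hidden_layers']}-a{cfg['num_attention_heads']}"
    f"-ctx{cfg['context_length']}-pred{cfg['prediction_length']}"
    f"-vocab{cfg['vocab_size']}-{short_tok}"
    f"-lr{cfg['learning_rate']:.1e}-bs{cfg['total_train_batch_size']}"
    f"-steps{cfg['max_steps']}"
)
assert run_name == cfg["run_name"], (run_name, cfg["run_name"])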
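
tokenizer_class "NonUniformBins" with binning_power 2.0 suggests power-spaced value bins that are denser near zero than equal-width uniform bins, with vocab_size - n_special_tokens = 4094 bins left for values after the pad and EOS tokens. The actual implementation is not shown on this page, so the sketch below is hypothetical: the edge formula, the +/-15 clipping limit, and all names are assumptions, and exponential_base presumably configures an alternative spacing that is ignored here.

import numpy as np

# Hypothetical sketch of power-law bin centers implied by binning_power=2.0:
# dense near 0, sparse toward the limits. Not the repo's code.
def make_nonuniform_bin_centers(vocab_size: int, n_special_tokens: int,
                                binning_power: float, limit: float = 15.0) -> np.ndarray:
    """Return bin centers for mean-scaled values, denser around zero."""
    n_bins = vocab_size - n_special_tokens          # 4096 - 2 = 4094 value bins
    u = np.linspace(-1.0, 1.0, n_bins)              # uniform grid on [-1, 1]
    # Sign-preserving power map: |u|^p concentrates resolution near zero,
    # where mean-scaled series values typically cluster.
    return np.sign(u) * np.abs(u) ** binning_power * limit

centers = make_nonuniform_bin_centers(4096, 2, 2.0)
# Tokenize one (already mean-scaled) value by nearest bin center,
# offsetting past the two special tokens (pad=0, eos=1).
token_id = int(np.argmin(np.abs(centers - 0.37))) + 2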