amd / Zebra-Llama-1B-4MLA-12Mamba-DPO / hybrid_config.json
{
  "hidden_size": 2048,
  "intermediate_size": 8192,
  "hidden_act": "silu",
  "n_layer": 16,
  "mla_layers": [
    0,
    5,
    10,
    14
  ],
  "rms_norm_eps": 1e-05,
  "num_attention_heads": 32,
  "num_key_value_heads": 8,
  "kv_lora_rank": 128,
  "q_lora_rank": 1344,
  "use_lora_layer_norm": false,
  "use_fixed_rank_for_first_and_last_block": true,
  "use_full_kv_head": false,
  "layer_rank_list": {},
  "qk_rope_head_dim": 32,
  "v_head_dim": 64,
  "qk_nope_head_dim": 32,
  "q_energy_ratio": null,
  "kv_energy_ratio": null,
  "qkv_rank_divisor": 8,
  "max_position_embeddings": 131072,
  "rope_theta": 500000.0,
  "rope_scaling": {
    "factor": 32.0,
    "high_freq_factor": 4.0,
    "low_freq_factor": 1.0,
    "original_max_position_embeddings": 8192,
    "rope_type": "llama3"
  },
  "attention_bias": false,
  "attention_dropout": 0.0,
  "rope_type": "yarn",
  "d_model": 2048,
  "ssm_cfg": {
    "expand": 1,
    "ngroups": 32,
    "d_state": 64,
    "repeat_kv_before_conv": false
  },
  "d_inner": 2048,
  "d_xb": 512
}
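
Below is a minimal sketch (not the official Zebra-Llama loading code) of reading this hybrid_config.json and deriving the layer layout implied by the repository name: the indices in "mla_layers" are taken to be MLA attention blocks and every other layer a Mamba block, giving the 4 MLA + 12 Mamba split. The KV-cache width at the end assumes the usual MLA caching scheme (compressed KV latent plus a shared RoPE key) and is an illustration, not taken from the repository.

```python
import json

# Minimal sketch: read hybrid_config.json and work out which of the
# 16 layers use MLA attention and which use Mamba SSM. Assumption:
# layers listed in "mla_layers" are MLA blocks, all remaining layers
# are Mamba blocks (matching the "4MLA-12Mamba" name).
with open("hybrid_config.json") as f:
    cfg = json.load(f)

mla = set(cfg["mla_layers"])  # {0, 5, 10, 14}
layer_types = ["mla" if i in mla else "mamba" for i in range(cfg["n_layer"])]
print(layer_types)  # 4 "mla" entries, 12 "mamba" entries

# Rough per-token cache width for one MLA layer under the usual MLA
# scheme (compressed KV latent plus shared RoPE key) -- an assumption,
# not taken from this repo:
mla_cache_width = cfg["kv_lora_rank"] + cfg["qk_rope_head_dim"]  # 128 + 32 = 160
print(mla_cache_width)
```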