{
  "max_audio_seconds": 1800,
  "stride_size": 2,
  "avg_pooler": 2,
  "d_model": 1280,
  "scale_embedding": false,
  "kernel_size": 3,
  "activation_function": "gelu",
  "encoder_layers": 32,
  "encoder_skip_layer_id": 3,
  "encoder_attention_heads": 20,
  "encoder_ffn_dim": 5120,
  "encoder_causal": false,
  "encoder_attn_window_size": [
    -1,
    -1
  ],
  "decoder_layers": 32,
  "decoder_attention_heads": 20,
  "decoder_ffn_dim": 5120,
  "decoder_kernel_size": 3,
  "decoder_stride_size": 2,
  "decoder_causal": true,
  "decoder_attn_window_size": [
    -1,
    -1
  ],
  "nfft": 960,
  "vocoder_dim": 256,
  "vocoder_intermediate_dim": 1024,
  "vocoder_num_layers": 16,
  "n_mels": 128,
  "sampling_rate": 24000,
  "hop_length": 240,
  "window_size": 960,
  "vocoder_padding": "same",
  "fmin": 0,
  "fmax": null,
  "num_quantizers": 20,
  "codebook_size": [
    1024,
    1024,
    128,
    128,
    128,
    128,
    128,
    128,
    128,
    128,
    128,
    128,
    128,
    128,
    128,
    128,
    128,
    128,
    128,
    128
  ],
  "threshold_ema_dead_code": 2,
  "position_embedding_type": "rope",
  "rope_theta": 10000,
  "rope_type": "default",
  "ln_type": "LayerNorm",
  "vocoder_attention_heads": 16,
  "vocoder_attn_window_size": [
    40,
    10
  ]
}