{ "max_audio_seconds": 1800, "stride_size": 2, "avg_pooler": 2, "d_model": 1280, "scale_embedding": false, "kernel_size": 3, "activation_function": "gelu", "encoder_layers": 32, "encoder_skip_layer_id": 3, "encoder_attention_heads": 20, "encoder_ffn_dim": 5120, "encoder_causal": false, "encoder_attn_window_size": [ -1, -1 ], "decoder_layers": 32, "decoder_attention_heads": 20, "decoder_ffn_dim": 5120, "decoder_kernel_size": 3, "decoder_stride_size": 2, "decoder_causal": true, "decoder_attn_window_size": [ -1, -1 ], "nfft": 960, "vocoder_dim": 256, "vocoder_intermediate_dim": 1024, "vocoder_num_layers": 16, "n_mels": 128, "sampling_rate": 24000, "hop_length": 240, "window_size": 960, "vocoder_padding": "same", "fmin": 0, "fmax": null, "num_quantizers": 20, "codebook_size": [ 1024, 1024, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 ], "threshold_ema_dead_code": 2, "position_embedding_type": "rope", "rope_theta": 10000, "rope_type": "default", "ln_type": "LayerNorm", "vocoder_attention_heads": 16, "vocoder_attn_window_size": [ 40, 10 ] }