{
  "architectures": [
    "VocosEncodecModel"
  ],
  "bandwidths": [
    1.5,
    3.0,
    6.0,
    12.0
  ],
  "codebook_dim": 128,
  "dtype": "float32",
  "hidden_act": "gelu",
  "hidden_size": 384,
  "hop_length": 320,
  "intermediate_size": 1152,
  "istft_padding": "same",
  "kernel_size": 7,
  "layer_norm_eps": 1e-06,
  "layer_scale_init_value": 0.125,
  "model_type": "vocos_encodec",
  "n_fft": 1280,
  "num_layers": 8,
  "num_quantizers": 16384,
  "padding": 3,
  "sample_rate": 24000,
  "transformers_version": "4.57.0.dev0"
}