Upload folder using huggingface_hub

Browse files

Files changed (4) hide show

README.md +48 -3
config.json +79 -48
tokenizer_config.json +11 -6
vocab.json +56 -0

README.md CHANGED Viewed

@@ -1,5 +1,50 @@
-# TTSMMS Model - grn
-Text-to-speech model from the Massively Multilingual Speech (MMS) project
-This model was converted from the original MMS VITS model for use with 🤗 Transformers.

+---
+language: gn
+tags:
+- guarani
+- tts
+- speech
+- tts-mms
+license: mit
+datasets:
+- mozilla-foundation/common_voice_11_0
+---
+# Guarani TTS-MMS Model
+This is a text-to-speech model for the Guarani language, based on Meta's Massively Multilingual Speech (MMS) project.
+## Model Description
+This model is designed for Guarani text-to-speech synthesis, utilizing the TTS-MMS architecture. It can generate natural-sounding speech from Guarani text input.
+## Usage
+```python
+from transformers import AutoProcessor, AutoModel
+processor = AutoProcessor.from_pretrained("joselobenitezg/mms-grn-tts")
+model = AutoModel.from_pretrained("joselobenitezg/mms-grn-tts")
+# Example usage
+text = "Mba'éichapa"
+inputs = processor(text=text, return_tensors="pt")
+speech = model(**inputs).waveform
+```
+## Training Data
+The model was trained using:
+- Guarani Common Voice dataset
+- [Add other data sources if applicable]
+## Model Architecture
+The model uses the TTS-MMS architecture with the following key components:
+- Encoder-decoder architecture
+- Self-attention mechanisms
+- [Add specific architectural details]
+## Limitations
+- [List any known limitations]
+- [Add performance considerations]

config.json CHANGED Viewed

@@ -1,52 +1,83 @@
 {
-  "architectures": [
-    "SynthesizerTrn"
-  ],
-  "model_type": "ttsmms",
-  "vocab_size": 53,
-  "spec_channels": 513,
-  "segment_size": 32,
-  "inter_channels": 192,
-  "hidden_channels": 192,
-  "filter_channels": 768,
-  "n_heads": 2,
-  "n_layers": 6,
-  "kernel_size": 3,
-  "p_dropout": 0.1,
-  "resblock": "1",
-  "resblock_kernel_sizes": [
-    3,
-    7,
-    11
-  ],
-  "resblock_dilation_sizes": [
-    [
-      1,
-      3,
-      5
     ],
-    [
-      1,
-      3,
-      5
     ],
-    [
-      1,
       3,
-      5
-    ]
-  ],
-  "upsample_rates": [
-    8,
-    8,
-    2,
-    2
-  ],
-  "upsample_initial_channel": 512,
-  "upsample_kernel_sizes": [
-    16,
-    16,
-    4,
-    4
-  ]
-}

 {
+    "activation_dropout": 0.1,
+    "architectures": [
+      "VitsModel"
     ],
+    "attention_dropout": 0.1,
+    "depth_separable_channels": 2,
+    "depth_separable_num_layers": 3,
+    "duration_predictor_dropout": 0.5,
+    "duration_predictor_filter_channels": 256,
+    "duration_predictor_flow_bins": 10,
+    "duration_predictor_kernel_size": 3,
+    "duration_predictor_num_flows": 4,
+    "duration_predictor_tail_bound": 5.0,
+    "ffn_dim": 768,
+    "ffn_kernel_size": 3,
+    "flow_size": 192,
+    "hidden_act": "relu",
+    "hidden_dropout": 0.1,
+    "hidden_size": 192,
+    "initializer_range": 0.02,
+    "layer_norm_eps": 1e-05,
+    "layerdrop": 0.1,
+    "leaky_relu_slope": 0.1,
+    "model_type": "vits",
+    "noise_scale": 0.667,
+    "noise_scale_duration": 0.8,
+    "num_attention_heads": 2,
+    "num_hidden_layers": 6,
+    "num_speakers": 1,
+    "posterior_encoder_num_wavenet_layers": 16,
+    "prior_encoder_num_flows": 4,
+    "prior_encoder_num_wavenet_layers": 4,
+    "resblock_dilation_sizes": [
+      [
+        1,
+        3,
+        5
+      ],
+      [
+        1,
+        3,
+        5
+      ],
+      [
+        1,
+        3,
+        5
+      ]
     ],
+    "resblock_kernel_sizes": [
       3,
+      7,
+      11
+    ],
+    "sampling_rate": 16000,
+    "speaker_embedding_size": 0,
+    "speaking_rate": 1.0,
+    "spectrogram_bins": 513,
+    "torch_dtype": "float32",
+    "transformers_version": "4.33.0.dev0",
+    "upsample_initial_channel": 512,
+    "upsample_kernel_sizes": [
+      16,
+      16,
+      4,
+      4
+    ],
+    "upsample_rates": [
+      8,
+      8,
+      2,
+      2
+    ],
+    "use_bias": true,
+    "use_stochastic_duration_prediction": true,
+    "vocab_size": 53,
+    "wavenet_dilation_rate": 1,
+    "wavenet_dropout": 0.0,
+    "wavenet_kernel_size": 5,
+    "window_size": 4
+  }

tokenizer_config.json CHANGED Viewed

@@ -1,7 +1,12 @@
 {
-  "model_type": "ttsmms",
-  "tokenizer_class": "TTSMMSTokenizer",
-  "pad_token": "_",
-  "unk_token": "_",
-  "do_lower_case": true
-}

 {
+    "add_blank": true,
+    "clean_up_tokenization_spaces": true,
+    "is_uroman": false,
+    "language": "grn",
+    "model_max_length": 1000000000000000019884624838656,
+    "normalize": true,
+    "pad_token": "3",
+    "phonemize": false,
+    "tokenizer_class": "VitsTokenizer",
+    "unk_token": "<unk>"
+  }

vocab.json ADDED Viewed

	@@ -0,0 +1,56 @@

+{
+    " ": 11,
+    "'": 46,
+    "-": 37,
+    "0": 24,
+    "1": 23,
+    "2": 42,
+    "3": 0,
+    "4": 21,
+    "5": 41,
+    "6": 8,
+    "7": 16,
+    "8": 9,
+    "9": 26,
+    "_": 35,
+    "a": 10,
+    "b": 22,
+    "c": 29,
+    "d": 15,
+    "e": 38,
+    "f": 27,
+    "g": 45,
+    "h": 6,
+    "i": 43,
+    "j": 30,
+    "k": 7,
+    "l": 3,
+    "m": 2,
+    "n": 18,
+    "o": 50,
+    "p": 17,
+    "q": 19,
+    "r": 36,
+    "s": 52,
+    "t": 40,
+    "u": 39,
+    "v": 13,
+    "x": 51,
+    "y": 31,
+    "z": 47,
+    "á": 44,
+    "ã": 20,
+    "é": 34,
+    "í": 33,
+    "ñ": 1,
+    "ó": 32,
+    "õ": 48,
+    "ú": 25,
+    "ý": 14,
+    "ĩ": 49,
+    "ũ": 5,
+    "ẽ": 12,
+    "ỹ": 4,
+    "—": 28
+  }