Commit 7ca3b80
Parent(s): 0afa761

Update Tacotron2 French model

Files changed:
- README.md +93 -0
- config.yml +86 -0
- model.h5 +3 -0
- processor.json +1 -0
README.md ADDED
@@ -0,0 +1,93 @@
---
tags:
- tensorflowtts
- audio
- text-to-speech
- text-to-mel
language: fr
license: apache-2.0
datasets:
- synpaflex
widget:
- text: "Oh, je voudrais tant que tu te souviennes Des jours heureux quand nous étions amis"
---

# Tacotron 2 with Guided Attention trained on Synpaflex (Fr)
This repository provides a pretrained [Tacotron2](https://arxiv.org/abs/1712.05884) model trained with [Guided Attention](https://arxiv.org/abs/1710.08969) on the Synpaflex dataset (Fr). For details of the model, we encourage you to read more about [TensorFlowTTS](https://github.com/TensorSpeech/TensorFlowTTS).

## Install TensorFlowTTS
First of all, please install TensorFlowTTS with the following command:
```
pip install TensorFlowTTS
```

### Converting your Text to Mel Spectrogram
```python
import numpy as np
import soundfile as sf
import yaml

import tensorflow as tf

from tensorflow_tts.inference import AutoProcessor
from tensorflow_tts.inference import TFAutoModel

# Load the pretrained text processor and Tacotron 2 model from the Hub.
processor = AutoProcessor.from_pretrained("tensorspeech/tts-tacotron2-synpaflex-fr")
tacotron2 = TFAutoModel.from_pretrained("tensorspeech/tts-tacotron2-synpaflex-fr")

text = "Oh, je voudrais tant que tu te souviennes Des jours heureux quand nous étions amis"

# Convert the raw text to a sequence of symbol IDs (see processor.json).
input_ids = processor.text_to_sequence(text)

# Run inference; `mel_outputs` holds the predicted mel spectrogram.
decoder_output, mel_outputs, stop_token_prediction, alignment_history = tacotron2.inference(
    input_ids=tf.expand_dims(tf.convert_to_tensor(input_ids, dtype=tf.int32), 0),
    input_lengths=tf.convert_to_tensor([len(input_ids)], tf.int32),
    speaker_ids=tf.convert_to_tensor([0], dtype=tf.int32),
)
```
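The snippet above stops at the mel spectrogram; turning it into audio requires a separate vocoder, which this repository does not ship. As a minimal sketch, assuming a MelGAN-family vocoder from TensorFlowTTS (the checkpoint name below is an English LJSpeech model and is only an illustration; substitute a vocoder trained on compatible French audio for good quality):

```python
# Sketch: vocode the mel spectrogram from the snippet above. The checkpoint
# name is an assumption for illustration, not part of this repository.
mb_melgan = TFAutoModel.from_pretrained("tensorspeech/tts-mb_melgan-ljspeech-en")

# inference() returns a waveform batch of shape [batch, samples, 1].
audio = mb_melgan.inference(mel_outputs)[0, :, 0]

# Save the waveform; this is what the `soundfile` import above is for.
sf.write("./audio.wav", audio, 22050, "PCM_16")
```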

#### Referencing Tacotron 2
```
@article{DBLP:journals/corr/abs-1712-05884,
  author    = {Jonathan Shen and
               Ruoming Pang and
               Ron J. Weiss and
               Mike Schuster and
               Navdeep Jaitly and
               Zongheng Yang and
               Zhifeng Chen and
               Yu Zhang and
               Yuxuan Wang and
               R. J. Skerry{-}Ryan and
               Rif A. Saurous and
               Yannis Agiomyrgiannakis and
               Yonghui Wu},
  title     = {Natural {TTS} Synthesis by Conditioning WaveNet on Mel Spectrogram
               Predictions},
  journal   = {CoRR},
  volume    = {abs/1712.05884},
  year      = {2017},
  url       = {http://arxiv.org/abs/1712.05884},
  archivePrefix = {arXiv},
  eprint    = {1712.05884},
  timestamp = {Thu, 28 Nov 2019 08:59:52 +0100},
  biburl    = {https://dblp.org/rec/journals/corr/abs-1712-05884.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}
```

#### Referencing TensorFlowTTS
```
@misc{TFTTS,
  author       = {Minh Nguyen and Alejandro Miguel Velasquez and Erogol and Kuan Chen and
                  Dawid Kobus and Takuya Ebata and Trinh Le and Yunchao He},
  title        = {TensorflowTTS},
  year         = {2020},
  publisher    = {GitHub},
  journal      = {GitHub repository},
  howpublished = {\url{https://github.com/TensorSpeech/TensorFlowTTS}},
}
```
config.yml ADDED
@@ -0,0 +1,86 @@
# This is the hyperparameter configuration file for Tacotron2 v1.
# Please make sure this is adjusted for the synpaflex dataset. If you want to
# apply it to another dataset, you might need to carefully change some parameters.
# This configuration performs 200k iters, but 65k iters is enough to get a good model.

###########################################################
#                FEATURE EXTRACTION SETTING               #
###########################################################
hop_size: 256    # Hop size.
format: "npy"


###########################################################
#              NETWORK ARCHITECTURE SETTING               #
###########################################################
model_type: "tacotron2"

tacotron2_params:
    dataset: synpaflex
    embedding_hidden_size: 512
    initializer_range: 0.02
    embedding_dropout_prob: 0.1
    n_speakers: 1
    n_conv_encoder: 5
    encoder_conv_filters: 512
    encoder_conv_kernel_sizes: 5
    encoder_conv_activation: 'relu'
    encoder_conv_dropout_rate: 0.5
    encoder_lstm_units: 256
    n_prenet_layers: 2
    prenet_units: 256
    prenet_activation: 'relu'
    prenet_dropout_rate: 0.5
    n_lstm_decoder: 1
    reduction_factor: 1
    decoder_lstm_units: 1024
    attention_dim: 128
    attention_filters: 32
    attention_kernel: 31
    n_mels: 80
    n_conv_postnet: 5
    postnet_conv_filters: 512
    postnet_conv_kernel_sizes: 5
    postnet_dropout_rate: 0.1
    attention_type: "lsa"

###########################################################
#                  DATA LOADER SETTING                    #
###########################################################
batch_size: 32              # Batch size for each GPU, assuming gradient_accumulation_steps == 1.
remove_short_samples: true  # Whether to remove samples whose length is less than batch_max_steps.
allow_cache: true           # Whether to cache the dataset. If true, it requires CPU memory.
mel_length_threshold: 32    # Remove all targets whose mel_length <= 32.
is_shuffle: true            # Shuffle the dataset after each epoch.
use_fixed_shapes: true      # Use fixed shapes for training (2x speed-up);
                            # see https://github.com/dathudeptrai/TensorflowTTS/issues/34#issuecomment-642309118

###########################################################
#             OPTIMIZER & SCHEDULER SETTING               #
###########################################################
optimizer_params:
    initial_learning_rate: 0.001
    end_learning_rate: 0.00001
    decay_steps: 150000        # A value < train_max_steps is recommended.
    warmup_proportion: 0.02
    weight_decay: 0.001

gradient_accumulation_steps: 1
var_train_expr: null  # Trainable-variable expression (e.g. 'embeddings|decoder_cell'),
                      # entries separated by '|'. If null, all variables are trained.
###########################################################
#                    INTERVAL SETTING                     #
###########################################################
train_max_steps: 200000                 # Number of training steps.
save_interval_steps: 2000               # Interval steps to save checkpoint.
eval_interval_steps: 500                # Interval steps to evaluate the network.
log_interval_steps: 200                 # Interval steps to record the training log.
start_schedule_teacher_forcing: 200001  # Set > train_max_steps, so scheduled teacher forcing is never applied.
start_ratio_value: 0.5                  # Start ratio of scheduled teacher forcing.
schedule_decay_steps: 50000             # Decay steps of scheduled teacher forcing.
end_ratio_value: 0.0                    # End ratio of scheduled teacher forcing.
###########################################################
#                     OTHER SETTING                       #
###########################################################
num_save_intermediate_results: 1  # Number of results to be saved as intermediate results.
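For context, a minimal sketch of how such a config file is consumed, assuming the `Tacotron2Config` and `TFTacotron2` classes exported by TensorFlowTTS keep the constructor shapes used in its training examples (treat both names and arguments as assumptions to verify against your installed version):

```python
import yaml

from tensorflow_tts.configs import Tacotron2Config
from tensorflow_tts.models import TFTacotron2

# Parse the hyperparameter file shown above.
with open("config.yml") as f:
    config = yaml.safe_load(f)

# Build an untrained Tacotron 2 with the architecture described
# under `tacotron2_params`.
tacotron2 = TFTacotron2(
    config=Tacotron2Config(**config["tacotron2_params"]),
    name="tacotron2",
)
```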
model.h5 ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7761e61d0dd3bbe9387ff6191d1507d9fd308d6117c8d3ec2f8151c6f9ea4470
size 127842184
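The file committed here is only a Git LFS pointer; `git lfs pull` replaces it with the actual ~128 MB HDF5 checkpoint. A sketch of loading it into the model built from config.yml above — `_build()` is the helper TensorFlowTTS models use to create variables via a dummy forward pass, an assumption to check against your installed version (`TFAutoModel.from_pretrained` in the README does all of this for you):

```python
# Sketch: load the LFS-fetched checkpoint into the model built above.
tacotron2._build()                  # create variables via a dummy forward pass
tacotron2.load_weights("model.h5")  # weights must match config.yml's architecture
```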
processor.json ADDED
@@ -0,0 +1 @@
{"symbol_to_id": {"pad": 0, "!": 1, "/": 2, "'": 3, "(": 4, ")": 5, ",": 6, "-": 7, ".": 8, ":": 9, ";": 10, "?": 11, " ": 12, "A": 13, "B": 14, "C": 15, "D": 16, "E": 17, "F": 18, "G": 19, "H": 20, "I": 21, "J": 22, "K": 23, "L": 24, "M": 25, "N": 26, "O": 27, "P": 28, "Q": 29, "R": 30, "S": 31, "T": 32, "U": 33, "V": 34, "W": 35, "X": 36, "Y": 37, "Z": 38, "a": 39, "b": 40, "c": 41, "d": 42, "e": 43, "f": 44, "g": 45, "h": 46, "i": 47, "j": 48, "k": 49, "l": 50, "m": 51, "n": 52, "o": 53, "p": 54, "q": 55, "r": 56, "s": 57, "t": 58, "u": 59, "v": 60, "w": 61, "x": 62, "y": 63, "z": 64, "\u00e9": 65, "\u00e8": 66, "\u00e0": 67, "\u00f9": 68, "\u00e2": 69, "\u00ea": 70, "\u00ee": 71, "\u00f4": 72, "\u00fb": 73, "\u00e7": 74, "\u00e4": 75, "\u00eb": 76, "\u00ef": 77, "\u00f6": 78, "\u00fc": 79, "\u00ff": 80, "\u0153": 81, "\u00e6": 82, "eos": 83}, "id_to_symbol": {"0": "pad", "1": "!", "2": "/", "3": "'", "4": "(", "5": ")", "6": ",", "7": "-", "8": ".", "9": ":", "10": ";", "11": "?", "12": " ", "13": "A", "14": "B", "15": "C", "16": "D", "17": "E", "18": "F", "19": "G", "20": "H", "21": "I", "22": "J", "23": "K", "24": "L", "25": "M", "26": "N", "27": "O", "28": "P", "29": "Q", "30": "R", "31": "S", "32": "T", "33": "U", "34": "V", "35": "W", "36": "X", "37": "Y", "38": "Z", "39": "a", "40": "b", "41": "c", "42": "d", "43": "e", "44": "f", "45": "g", "46": "h", "47": "i", "48": "j", "49": "k", "50": "l", "51": "m", "52": "n", "53": "o", "54": "p", "55": "q", "56": "r", "57": "s", "58": "t", "59": "u", "60": "v", "61": "w", "62": "x", "63": "y", "64": "z", "65": "\u00e9", "66": "\u00e8", "67": "\u00e0", "68": "\u00f9", "69": "\u00e2", "70": "\u00ea", "71": "\u00ee", "72": "\u00f4", "73": "\u00fb", "74": "\u00e7", "75": "\u00e4", "76": "\u00eb", "77": "\u00ef", "78": "\u00f6", "79": "\u00fc", "80": "\u00ff", "81": "\u0153", "82": "\u00e6", "83": "eos"}, "speakers_map": {"synpaflex": 0}, "processor_name": "SynpaflexProcessor"}
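The processor is a flat character-to-ID table covering ASCII plus the accented characters and ligatures French needs (é, è, à, ç, œ, æ, ...), a reverse map, a single-speaker map, and the processor class name. A quick sketch of inspecting it directly with the standard library:

```python
import json

# Peek at the symbol table that AutoProcessor loads for this model.
with open("processor.json", encoding="utf-8") as f:
    proc = json.load(f)

print(proc["processor_name"])     # SynpaflexProcessor
print(proc["symbol_to_id"]["é"])  # 65
print(proc["speakers_map"])       # {'synpaflex': 0}
```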