Spaces: Running on Zero
| import spaces | |
| import torch | |
| from TTS.tts.configs.xtts_config import XttsConfig | |
| from TTS.tts.models.xtts import Xtts | |
| from pathlib import Path | |
| import gradio as gr | |
# Remote assets for the Darija XTTS model, hosted on the Hugging Face Hub.
CONFIG_URL = 'https://huggingface.co/medmac01/darija_xtt_2.0/resolve/main/config.json'
VOCAB_URL = 'https://huggingface.co/medmac01/darija_xtt_2.0/resolve/main/vocab.json'
MODEL_URL = 'https://huggingface.co/medmac01/darija_xtt_2.0/resolve/main/model_2.1.pth'
SPEAKER_AUDIO_URL = 'https://huggingface.co/medmac01/xtt2_darija_v0.1/resolve/main/speaker_reference.wav'

# All assets are cached next to this script.
base_path = Path(__file__).parent


def _ensure_file(url: str, dest: Path) -> str:
    """Download *url* to *dest* unless it already exists; return the path as str.

    The str conversion is done here once, since the TTS loaders below take
    plain string paths.
    """
    if not dest.exists():
        torch.hub.download_url_to_file(url, str(dest))
    return str(dest)


config_path = _ensure_file(CONFIG_URL, base_path / 'config.json')
vocab_path = _ensure_file(VOCAB_URL, base_path / 'vocab.json')
# NOTE(review): the remote file is model_2.1.pth but it is cached as model.pth;
# kept as-is to preserve the existing cache-file name.
model_path = _ensure_file(MODEL_URL, base_path / 'model.pth')
speaker_audio_path = _ensure_file(SPEAKER_AUDIO_URL, base_path / 'speaker_reference.wav')

# Build the XTTS model from its JSON config and load the fine-tuned checkpoint.
config = XttsConfig()
config.load_json(config_path)

print("Loading model...")
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)
model = Xtts.init_from_config(config)
model.load_checkpoint(config, checkpoint_path=model_path, use_deepspeed=False, vocab_path=vocab_path, eval=True)
model.to(device)
# NOTE(review): the file imports `spaces` but never uses it; on a ZeroGPU Space
# this function would normally be decorated with @spaces.GPU — confirm and add.
def infer_EGTTS(text: str, speaker_audio_path: str, temperature: float = 0.75, language: str = "ar"):
    """Synthesize speech for *text* in the voice of a reference speaker.

    Args:
        text: Text to synthesize.
        speaker_audio_path: Path to a short (~4-5 s) reference clip of the
            target speaker, used to compute conditioning latents.
        temperature: Sampling temperature for generation (higher = more varied).
        language: XTTS language code. Defaults to "ar", which is what the
            original hard-coded for this Darija model.

    Returns:
        A ``(sample_rate, waveform)`` tuple suitable for a gradio Audio output.
    """
    print("Computing speaker latents...")
    gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=[speaker_audio_path])
    print("Inference...")
    out = model.inference(
        text,
        language,
        gpt_cond_latent,
        speaker_embedding,
        temperature=temperature,
    )
    # 24000 matches XTTS v2's output rate; presumably equals
    # config.audio.output_sample_rate — TODO confirm and read from config.
    return 24000, out["wav"]
markdown_description = """## Instructions:
1. Enter the text you want to synthesize.
2. Upload a 4-5 seconds audio file of the speaker you want to clone.
3. Click on the "Generate" button.
"""

# Gradio UI: text + speaker reference + temperature in, synthesized audio out.
# Title fixed from "EGTTS" (leftover from another project) to match the
# app's own heading.
with gr.Blocks(title="Moroccan-Darija-TTS") as app:
    gr.HTML("<center><h1>Moroccan-Darija-TTS </h1></center>")
    gr.Markdown(markdown_description)
    with gr.Row():
        with gr.Column():
            # Default text fixed from mojibake ("ุงูุณูุงู...") back to the
            # intended Arabic greeting; rtl=True / text_align="right" confirm
            # Arabic was intended.
            text = gr.Textbox(label="Text to synthesize", value="السلام عليكم ورحمة الله", rtl=True, text_align="right", lines=3)
            speaker_reference = gr.Audio(label="Speaker reference", value=speaker_audio_path, type="filepath")
            temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=1.0, value=0.75, step=0.05)
            generate_btn = gr.Button(value="Generate", variant="primary")
        # assumes the output sits beside the input column inside the Row —
        # the original indentation was lost; TODO confirm intended layout.
        output = gr.Audio(label="Synthesized audio")
    generate_btn.click(infer_EGTTS, inputs=[text, speaker_reference, temperature], outputs=output)

app.launch()