speech-to-speech-translation

Runtime error

App Files Community

magnustragardh committed on Jul 17, 2023

Commit

c399ca6

1 Parent(s): 3be9059

Attempt to use German.

Browse files

Files changed (1) hide show

app.py +22 -6

app.py CHANGED Viewed

@@ -8,11 +8,26 @@ from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Proce
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
 # load speech translation checkpoint
 asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-base", device=device)
 # load text-to-speech checkpoint and speaker embeddings
-tts_model = "sanchit-gandhi/speecht5_tts_vox_nl"
 processor = SpeechT5Processor.from_pretrained(tts_model)
 model = SpeechT5ForTextToSpeech.from_pretrained(tts_model).to(device)
@@ -23,13 +38,14 @@ speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze
 def translate(audio):
-    outputs = asr_pipe(audio, max_new_tokens=256, generate_kwargs={"task": "transcribe", "language": "nl"})
     return outputs["text"]
 def synthesise(text):
     inputs = processor(text=text, return_tensors="pt")
-    speech = model.generate_speech(inputs["input_ids"].to(device), speaker_embeddings.to(device), vocoder=vocoder)
     return speech.cpu()
@@ -41,8 +57,8 @@ def speech_to_speech_translation(audio):
 title = "Cascaded STST"
-description = """
-Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in English. Demo uses OpenAI's [Whisper Base](https://huggingface.co/openai/whisper-base) model for speech translation, and Microsoft's
 [SpeechT5 TTS](https://huggingface.co/microsoft/speecht5_tts) model for text-to-speech:
 ![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
@@ -70,4 +86,4 @@ file_translate = gr.Interface(
 with demo:
     gr.TabbedInterface([mic_translate, file_translate], ["Microphone", "Audio File"])
-demo.launch()

 device = "cuda:0" if torch.cuda.is_available() else "cpu"
+language = "de"
 # load speech translation checkpoint
 asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-base", device=device)
 # load text-to-speech checkpoint and speaker embeddings
+if language == "nl":
+    tts_model = "sanchit-gandhi/speecht5_tts_vox_nl"
+    language_name = "Dutch"
+elif language == "fi":
+    tts_model = "crcdng/speecht5_finetuned_voxpopuli_fi"
+    language_name = "Finnish"
+elif language == "fr":
+    tts_model = "Sandiago21/speecht5_finetuned_facebook_voxpopuli_french"
+    language_name = "French"
+elif language == "de":
+    tts_model = "Salama1429/TTS_German_Speecht5_finetuned_voxpopuli_nl"
+    language_name = "German"
+else:
+    raise NotImplementedError(f"No support for language {language}")
 processor = SpeechT5Processor.from_pretrained(tts_model)
 model = SpeechT5ForTextToSpeech.from_pretrained(tts_model).to(device)
 def translate(audio):
+    outputs = asr_pipe(audio, max_new_tokens=256, generate_kwargs={"task": "transcribe", "language": language})
     return outputs["text"]
 def synthesise(text):
     inputs = processor(text=text, return_tensors="pt")
+    max_length = processor.tokenizer.model_max_length
+    speech = model.generate_speech(inputs["input_ids"][:, :max_length].to(device), speaker_embeddings.to(device), vocoder=vocoder)
     return speech.cpu()
 title = "Cascaded STST"
+description = f"""
+Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in {language_name}. Demo uses OpenAI's [Whisper Base](https://huggingface.co/openai/whisper-base) model for speech translation, and Microsoft's
 [SpeechT5 TTS](https://huggingface.co/microsoft/speecht5_tts) model for text-to-speech:
 ![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
 with demo:
     gr.TabbedInterface([mic_translate, file_translate], ["Microphone", "Audio File"])
+demo.launch(debug=True)