| """ | |
| Description: | |
| This script presents a Gradio demo for the ASR/ST FAMA models developed at FBK | |
| Dependencies: | |
| all the necessary dependencies are listed in requirements.txt | |
| Usage: | |
| The demo can be runned locally by installing all necessary dependencies in a python virtual env or it can be run in an HuggingFace Space | |
| Author: Lorenzo Concina | |
| Date: 4/6/2025 | |
| """ | |
import os

import torch
import librosa as lb
import gradio as gr
from transformers import AutoProcessor, pipeline


def load_fama(model_id, output_lang):
    """Build an ASR/ST pipeline for the given FAMA model and target language."""
    processor = AutoProcessor.from_pretrained(model_id)
    device = "cuda:0" if torch.cuda.is_available() else "cpu"

    # Force the model to start generation with the target language tag
    lang_tag = "<lang:{}>".format(output_lang)
    lang_tag_id = processor.tokenizer.convert_tokens_to_ids(lang_tag)

    generate_kwargs = {
        "num_beams": 5,
        "no_repeat_ngram_size": 5,
        "forced_bos_token_id": lang_tag_id,
    }

    pipe = pipeline(
        "automatic-speech-recognition",
        model=model_id,
        trust_remote_code=True,
        torch_dtype=torch.float32,
        device=device,
        return_timestamps=False,
        generate_kwargs=generate_kwargs,
    )
    return pipe
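
# Example (sketch): build a pipeline directly and reuse it. "sample.wav" is a
# placeholder path, not a file shipped with this demo.
#   pipe = load_fama("FBK-MT/fama-small", "en")
#   text = pipe("sample.wav")["text"]
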
def load_audio_file(audio_path):
    """Load an audio file with librosa, resampled to 16 kHz mono."""
    y, _ = lb.load(audio_path, sr=16000, mono=True)
    return y

def transcribe(audio, task_type, model_id, output_lang):
    """
    Function called by the Gradio interface. It runs model inference on an audio sample.
    """
    cache_key = (model_id, output_lang)
    if cache_key not in model_cache:
        model_cache[cache_key] = load_fama(model_id, output_lang)
    # Use a distinct name so we do not shadow transformers.pipeline
    pipe = model_cache[cache_key]

    if isinstance(audio, str) and os.path.isfile(audio):
        # Load the audio file with librosa
        utterance = load_audio_file(audio)
        result = pipe(utterance)
    else:
        # The user recorded audio with the microphone
        result = pipe(audio)
    return result["text"]
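
# Example (sketch): calling transcribe() outside Gradio. "sample.wav" is again a
# placeholder; task_type only drives the UI choices and is unused here.
#   print(transcribe("sample.wav", "ASR", "FBK-MT/fama-small-asr", "en"))
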
def update_model_options(task_type):
    """Restrict the available models to the ones supporting the selected task."""
    if task_type == "ST":
        # The *-asr checkpoints are ASR-only and cannot translate
        return gr.update(choices=["FBK-MT/fama-small", "FBK-MT/fama-medium"], value="FBK-MT/fama-small")
    else:
        return gr.update(choices=[
            "FBK-MT/fama-small",
            "FBK-MT/fama-medium",
            "FBK-MT/fama-small-asr",
            "FBK-MT/fama-medium-asr"
        ], value="FBK-MT/fama-small")

# Language options (languages supported by the FAMA models)
language_choices = ["en", "it"]

# Cache loaded pipelines to avoid reloading a model on every request
model_cache = {}

if __name__ == "__main__":
    with gr.Blocks() as iface:
        gr.Markdown(
            """## FAMA ASR and ST
Simple Automatic Speech Recognition and Speech Translation demo powered by the FAMA models developed at FBK.
More information about the FAMA models can be found here: https://huggingface.co/collections/FBK-MT/fama-683425df3fb2b3171e0cdc9e"""
        )

        with gr.Row():
            audio_input = gr.Audio(type="filepath", label="Upload or record audio")
            task_type_input = gr.Radio(choices=["ASR", "ST"], value="ASR", label="Select task type")
            model_input = gr.Radio(choices=[
                "FBK-MT/fama-small",
                "FBK-MT/fama-medium",
                "FBK-MT/fama-small-asr",
                "FBK-MT/fama-medium-asr"
            ], value="FBK-MT/fama-small", label="Select a FAMA model")
            lang_input = gr.Dropdown(choices=language_choices, value="it", label="Transcription language")

        output = gr.Textbox(label="Transcription")

        # Refresh the model choices whenever the task type changes
        task_type_input.change(fn=update_model_options, inputs=task_type_input, outputs=model_input)

        transcribe_btn = gr.Button("Transcribe")
        transcribe_btn.click(fn=transcribe, inputs=[audio_input, task_type_input, model_input, lang_input], outputs=output)

    iface.launch()