Ethiopia-ASR

Running on Zero

App Files Files Community

badrex committed on Oct 21

Commit

bfe1e2c

verified ·

1 Parent(s): aca2627

Update app.py

Browse files

Files changed (1) hide show

app.py +76 -48

app.py CHANGED Viewed

@@ -1,54 +1,68 @@
-import gradio as gr
-from transformers import pipeline
-import numpy as np
 import os
-from huggingface_hub import login
-import librosa
 import spaces
-HF_TOKEN = os.environ.get("HF_TOKEN")
-if HF_TOKEN:
-    login(token=HF_TOKEN)
-MODEL_ID = "badrex/w2v-bert-2.0-kinyarwanda-asr-1000h"
-transcriber = pipeline("automatic-speech-recognition", model=MODEL_ID)
-@spaces.GPU
-def transcribe(audio):
-    sr, y = audio
-    # convert to mono if stereo
-    if y.ndim > 1:
-        y = y.mean(axis=1)
-    # resample to 16kHz if needed
-    #if sr != 16000:
-    #    y = librosa.resample(y, orig_sr=sr, target_sr=16000)
-    y = y.astype(np.float32)
-    y /= np.max(np.abs(y))
-    return transcriber({"sampling_rate": sr, "raw": y})["text"]
 examples = []
 examples_dir = "examples"
 if os.path.exists(examples_dir):
     for filename in os.listdir(examples_dir):
         if filename.endswith((".wav", ".mp3", ".ogg")):
             examples.append([os.path.join(examples_dir, filename)])
-    print(f"Found {len(examples)} example files")
-else:
-    print("Examples directory not found")
-demo = gr.Interface(
-    fn=transcribe,
-    inputs=gr.Audio(),
-    outputs="text",
-    title="<div>ASRwanda 🎙️ <br>Speech Recognition for Kinyarwanda</div>",
-    description="""
         <div class="centered-content">
             <div>
                 <p>
@@ -57,23 +71,37 @@ demo = gr.Interface(
                 <br>
                 <p style="font-size: 15px; line-height: 1.8;">
                  Muraho 👋🏼
-                <br>
-                <br>
                  This is a demo for ASRwanda, a Transformer-based automatic speech recognition (ASR) system for Kinyarwanda language.
                  The underlying ASR model was trained on 1000 hours of transcribed speech provided by
                  <a href="https://digitalumuganda.com/" style="color: #2563eb;">Digital Umuganda</a> as part of the Kinyarwanda
                  <a href="https://www.kaggle.com/competitions/kinyarwanda-automatic-speech-recognition-track-b" style="color: #2563eb;"> ASR hackathon</a> on Kaggle.
-                <br>
-                <p style="font-size: 15px; line-height: 1.8;">
                 Simply <strong>upload an audio file</strong> 📤 or <strong>record yourself speaking</strong> 🎙️⏺️ to try out the model!
                 </p>
             </div>
         </div>
-        """,
-    examples=examples if examples else None,
-    cache_examples=False,
-    flagging_mode=None,
-)
 if __name__ == "__main__":
-    demo.launch()

 import os
+import torchaudio
+import gradio as gr
 import spaces
+import torch
+from transformers import AutoProcessor, AutoModelForCTC
+device = "cuda" if torch.cuda.is_available() else "cpu"
+print(f"Using device: {device}")
+# collect example audio files (.wav/.mp3/.ogg) from the local "examples" directory, if it exists
 examples = []
 examples_dir = "examples"
 if os.path.exists(examples_dir):
     for filename in os.listdir(examples_dir):
         if filename.endswith((".wav", ".mp3", ".ogg")):
             examples.append([os.path.join(examples_dir, filename)])
+# load the CTC model and its matching processor from MODEL_PATH
+MODEL_PATH = "badrex/w2v-bert-2.0-kinyarwanda-asr"
+processor = AutoProcessor.from_pretrained(MODEL_PATH)
+model = AutoModelForCTC.from_pretrained(MODEL_PATH)
+# move the model to the selected device (only the model is moved; the processor is left as loaded)
+model = model.to(device)
+@spaces.GPU()
+def process_audio(audio_path):
+    """Transcribe the audio file at ``audio_path`` and return the text.
+    Args:
+        audio_path: Path to the audio file to be transcribed.
+    Returns:
+        The stripped transcription of the first decoded sequence, or a message
+        asking for an upload when ``audio_path`` is empty or None.
+    """
+    if not audio_path:
+        return "Please upload an audio file."
+    target_rate = 16000
+    waveform, source_rate = torchaudio.load(audio_path)
+    # the processor is always called with sampling_rate=16000, so convert other rates first
+    if source_rate != target_rate:
+        resampler = torchaudio.transforms.Resample(orig_freq=source_rate, new_freq=target_rate)
+        waveform = resampler(waveform)
+    features = processor(waveform, sampling_rate=target_rate, return_tensors="pt")
+    model_inputs = {name: tensor.to(device) for name, tensor in features.items()}
+    # greedy CTC decoding: take the highest-scoring token id at every frame
+    with torch.no_grad():
+        predicted_ids = model(**model_inputs).logits.argmax(dim=-1)
+    transcripts = processor.batch_decode(predicted_ids, skip_special_tokens=True)
+    return transcripts[0].strip()
+# Define Gradio interface
+# `title` is the browser-tab title, not page content, so it is plain text rather than HTML markup
+with gr.Blocks(title="ASRwanda 🎙️ Speech Recognition for Kinyarwanda") as demo:
+    gr.Markdown("""
         <div class="centered-content">
             <div>
                 <p>
                 <br>
                 <p style="font-size: 15px; line-height: 1.8;">
                  Muraho 👋🏼
+                <br><br>
                  This is a demo for ASRwanda, a Transformer-based automatic speech recognition (ASR) system for Kinyarwanda language.
                  The underlying ASR model was trained on 1000 hours of transcribed speech provided by
                  <a href="https://digitalumuganda.com/" style="color: #2563eb;">Digital Umuganda</a> as part of the Kinyarwanda
                  <a href="https://www.kaggle.com/competitions/kinyarwanda-automatic-speech-recognition-track-b" style="color: #2563eb;"> ASR hackathon</a> on Kaggle.
+                <br><br>
                 Simply <strong>upload an audio file</strong> 📤 or <strong>record yourself speaking</strong> 🎙️⏺️ to try out the model!
                 </p>
             </div>
         </div>
+    """)
+    with gr.Row():
+        with gr.Column():
+            # type="filepath" passes process_audio a path on disk, which is what it loads with torchaudio
+            audio_input = gr.Audio(type="filepath", label="Upload Audio")
+            submit_btn = gr.Button("Transcribe Audio", variant="primary")
+        with gr.Column():
+            output_text = gr.Textbox(label="Text Transcription", lines=10)
+    submit_btn.click(
+        fn=process_audio,
+        inputs=[audio_input],
+        outputs=output_text
+    )
+    # gr.Examples rejects examples=None, so only add the examples panel when
+    # example files were actually found; otherwise the app would fail at startup
+    if examples:
+        gr.Examples(
+            examples=examples,
+            inputs=[audio_input],
+        )
+# Launch the app
 if __name__ == "__main__":
+    demo.queue().launch()