Spaces:

Hatman
/

Audio-Emotion-Recognition

Running on Zero

App Files Community

omsandeeppatil committed on Jan 16

Commit

9c11a0a

verified ·

1 Parent(s): 8581f9c

Update app.py

Browse files

Files changed (1) hide show

app.py +67 -43

app.py CHANGED Viewed

@@ -1,61 +1,85 @@
 import gradio as gr
-import spaces ## For ZeroGPU
 import torch
-import torchaudio
 from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2ForSequenceClassification
 device = "cuda" if torch.cuda.is_available() else "cpu"
 model_name = "Hatman/audio-emotion-detection"
 feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)
 model = Wav2Vec2ForSequenceClassification.from_pretrained(model_name)
-def preprocess_audio(audio):
-    waveform, sampling_rate = torchaudio.load(audio)
-    resampled_waveform = torchaudio.transforms.Resample(orig_freq=sampling_rate, new_freq=16000)(waveform)
-    return {'speech': resampled_waveform.numpy().flatten(), 'sampling_rate': 16000}
-@spaces.GPU ## For ZeroGPU
-def inference(audio):
-    example = preprocess_audio(audio)
-    inputs = feature_extractor(example['speech'], sampling_rate=16000, return_tensors="pt", padding=True)
-    inputs = {k: v.to('cpu') for k, v in inputs.items()} # Not necessary on ZeroGPU
-    with torch.no_grad():
-        logits = model(**inputs).logits
-    predicted_ids = torch.argmax(logits, dim=-1)
-    return model.config.id2label[predicted_ids.item()], logits, predicted_ids
-@spaces.GPU ## For ZeroGPU
-def inference_label(audio):
-    example = preprocess_audio(audio)
-    inputs = feature_extractor(example['speech'], sampling_rate=16000, return_tensors="pt", padding=True)
-    inputs = {k: v.to('cpu') for k, v in inputs.items()} # Not necessary on ZeroGPU
-    with torch.no_grad():
-        logits = model(**inputs).logits
-    predicted_ids = torch.argmax(logits, dim=-1)
-    return model.config.id2label[predicted_ids.item()]
-with gr.Blocks() as demo:
-    gr.Markdown("# Audio Sentiment Analysis")
-    with gr.Tab("Label Only Inference"):
-        gr.Interface(
-            fn=inference_label,
-            inputs=gr.Audio(type="filepath"),
-            outputs=gr.Label(label="Predicted Sentiment"),
-            title="Audio Sentiment Analysis",
-            description="Upload an audio file or record one to get the predicted sentiment label."
         )
-    with gr.Tab("Full Inference"):
-        gr.Interface(
-            fn=inference,
-            inputs=gr.Audio(type="filepath"),
-            outputs=[gr.Label(label="Predicted Sentiment"), gr.Textbox(label="Logits"), gr.Textbox(label="Predicted IDs")],
-            title="Audio Sentiment Analysis (Full)",
-            description="Upload an audio file or record one to analyze sentiment and get detailed results."
         )
-demo.launch(share=True)

 import gradio as gr
 import torch
+import numpy as np
 from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2ForSequenceClassification
+# Initialize model and processor
 device = "cuda" if torch.cuda.is_available() else "cpu"
 model_name = "Hatman/audio-emotion-detection"
 feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)
 model = Wav2Vec2ForSequenceClassification.from_pretrained(model_name)
+model.to(device)
+# Define emotion labels
+EMOTION_LABELS = {
+    0: "angry",
+    1: "disgust",
+    2: "fear",
+    3: "happy",
+    4: "neutral",
+    5: "sad",
+    6: "surprise"
+}
+def process_audio(audio):
+    """Process audio chunk and return emotion"""
+    if audio is None:
+        return ""
+    # Get the audio data
+    if isinstance(audio, tuple):
+        audio = audio[1]
+    # Convert to numpy array if needed
+    audio = np.array(audio)
+    # Ensure we have mono audio
+    if len(audio.shape) > 1:
+        audio = audio.mean(axis=1)
+    try:
+        # Prepare input for the model
+        inputs = feature_extractor(
+            audio,
+            sampling_rate=16000,
+            return_tensors="pt",
+            padding=True
         )
+        # Move to appropriate device
+        inputs = {k: v.to(device) for k, v in inputs.items()}
+        # Get prediction
+        with torch.no_grad():
+            outputs = model(**inputs)
+            logits = outputs.logits
+            predicted_id = torch.argmax(logits, dim=-1).item()
+        emotion = EMOTION_LABELS[predicted_id]
+        return emotion
+    except Exception as e:
+        print(f"Error processing audio: {e}")
+        return "Error processing audio"
+# Create Gradio interface
+demo = gr.Interface(
+    fn=process_audio,
+    inputs=[
+        gr.Audio(
+            sources=["microphone"],
+            type="numpy",
+            streaming=True,
+            label="Speak into your microphone",
+            show_label=True
         )
+    ],
+    outputs=gr.Textbox(label="Detected Emotion"),
+    title="Live Emotion Detection",
+    description="Speak into your microphone to detect emotions in real-time.",
+    live=True,
+    allow_flagging=False
+)
+# Launch with a small queue for better real-time performance
+demo.queue(max_size=1).launch(share=True)