"""Streamlit speech-analysis app.

Pipeline for an uploaded MP3/WAV file:
  1. speech-to-text via a locally downloaded Vosk model,
  2. noise reduction via ``noisereduce``,
  3. emotion classification via a Hugging Face Wav2Vec2 pipeline,
with waveform plotting and audio playback in the Streamlit UI.
"""

import json
import os
import subprocess
import wave

import librosa
import librosa.display
import matplotlib.pyplot as plt
import noisereduce as nr
import numpy as np
import soundfile as sf
import streamlit as st
from huggingface_hub import snapshot_download
from pydub import AudioSegment
from transformers import pipeline
from vosk import KaldiRecognizer, Model

# 🎨 Apply Custom CSS Styling
# NOTE(review): the style-sheet body is empty in this revision; the call is
# kept so custom CSS can be re-added without touching the layout code below.
st.markdown(
    """
    """,
    unsafe_allow_html=True,
)

# ✅ Auto-Download Vosk Model (Speech-to-Text)
VOSK_MODEL = "vosk-model-small-en-us-0.15"
if not os.path.exists(VOSK_MODEL):
    st.write("📥 Downloading Vosk Model...")
    # check=True so a failed download/unzip aborts here instead of letting
    # Model() fail later with a confusing "model not found" error.
    subprocess.run(
        [
            "wget",
            "-O",
            "vosk.zip",
            "https://alphacephei.com/vosk/models/vosk-model-small-en-us-0.15.zip",
        ],
        check=True,
    )
    subprocess.run(["unzip", "vosk.zip"], check=True)
    subprocess.run(["rm", "vosk.zip"], check=True)

# Load Vosk model (shared by transcribe_audio below)
model = Model(VOSK_MODEL)

# ✅ Auto-Download Wav2Vec2 Model (Emotion Detection)
WAV2VEC_MODEL = "facebook/wav2vec2-large-xlsr-53"
if not os.path.exists(WAV2VEC_MODEL):
    st.write(f"📥 Downloading {WAV2VEC_MODEL}...")
    snapshot_download(repo_id=WAV2VEC_MODEL, local_dir=WAV2VEC_MODEL)

# Load emotion detection model
emotion_model = pipeline("audio-classification", model=WAV2VEC_MODEL)


def transcribe_audio(audio_path: str) -> str:
    """Transcribe a WAV file with Vosk and return the full text.

    Fixes over the previous version: the old code returned after the FIRST
    chunk that ``AcceptWaveform`` accepted (discarding the rest of the
    audio), returned ``None`` when no chunk was accepted, never flushed the
    recognizer with ``FinalResult()``, and leaked the wave file handle.

    :param audio_path: path to a WAV file (Vosk expects 16-bit mono PCM —
        TODO confirm the upload/conversion path guarantees this).
    :returns: the concatenated transcription; empty string if nothing was
        recognized.
    """
    pieces = []
    with wave.open(audio_path, "rb") as wf:
        rec = KaldiRecognizer(model, wf.getframerate())
        while True:
            data = wf.readframes(4000)
            if not data:
                break
            if rec.AcceptWaveform(data):
                # Collect each completed utterance instead of returning early.
                text = json.loads(rec.Result()).get("text", "")
                if text:
                    pieces.append(text)
        # Flush whatever is still buffered in the recognizer.
        final = json.loads(rec.FinalResult()).get("text", "")
        if final:
            pieces.append(final)
    return " ".join(pieces)


# ✅ Streamlit UI
st.markdown(
    """
🎙️ Speech Detection System
""",
    unsafe_allow_html=True,
)
st.markdown(
    """
🔍 Upload an audio file for speech-to-text, noise filtering, and emotion analysis.
""",
    unsafe_allow_html=True,
)

uploaded_file = st.file_uploader("Upload an MP3/WAV file", type=["mp3", "wav"])

if uploaded_file:
    # Persist the upload to disk (librosa / pydub / vosk all want a path).
    file_path = f"temp/{uploaded_file.name}"
    os.makedirs("temp", exist_ok=True)
    with open(file_path, "wb") as f:
        f.write(uploaded_file.getbuffer())

    # Convert MP3 to WAV if needed (Vosk only reads WAV).
    if file_path.endswith(".mp3"):
        wav_path = file_path.replace(".mp3", ".wav")
        audio = AudioSegment.from_mp3(file_path)
        audio.export(wav_path, format="wav")
        file_path = wav_path

    # Load audio resampled to 16 kHz (the rate the models expect).
    y, sr = librosa.load(file_path, sr=16000)

    # 🎵 Display waveform
    st.markdown(
        """
🎼 Audio Waveform:
""",
        unsafe_allow_html=True,
    )
    fig, ax = plt.subplots(figsize=(10, 4))
    librosa.display.waveshow(y, sr=sr, ax=ax)
    st.pyplot(fig)

    # ✅ Noise Reduction
    st.markdown(
        """
🔇 Applying Noise Reduction...
""",
        unsafe_allow_html=True,
    )
    y_denoised = nr.reduce_noise(y=y, sr=sr)
    denoised_path = file_path.replace(".wav", "_denoised.wav")
    sf.write(denoised_path, y_denoised, sr)

    # ✅ Speech-to-Text using Vosk
    # NOTE(review): transcription runs on the ORIGINAL audio, not the
    # denoised copy — confirm this is intentional.
    transcription = transcribe_audio(file_path)
    st.markdown(
        """
📝 Transcribed Text:
""",
        unsafe_allow_html=True,
    )
    st.markdown(
        f"""
{transcription}
""",
        unsafe_allow_html=True,
    )

    # ✅ Emotion Detection
    st.markdown(
        """
😊 Emotion Analysis:
""",
        unsafe_allow_html=True,
    )
    emotion_result = emotion_model(file_path)
    st.write(emotion_result)

    # ✅ Play Original & Denoised Audio
    st.markdown(
        """
🔊 Play Audio:
""",
        unsafe_allow_html=True,
    )
    st.audio(file_path, format="audio/wav", start_time=0)
    st.markdown(
        """
🔇 Denoised Audio:
""",
        unsafe_allow_html=True,
    )
    st.audio(denoised_path, format="audio/wav", start_time=0)