Spaces:

MCP-1st-Birthday
/

MedLLM-Agent

Running on Zero

App Files Files Community

Y Phung Nguyen committed 21 days ago

Commit

4592ab0

1 Parent(s): 2fffb9d

Fix torchcodec error

Browse files

Files changed (1) hide show

voice.py +35 -10

voice.py CHANGED Viewed

@@ -4,6 +4,7 @@ import asyncio
 import tempfile
 import soundfile as sf
 import torch
 from logger import logger
 from client import MCP_AVAILABLE, call_agent, get_mcp_session, get_cached_mcp_tools
 import config
@@ -104,12 +105,8 @@ def transcribe_audio_whisper(audio_path: str) -> str:
         model = config.global_whisper_model["model"]
         logger.info("[ASR] Loading audio file...")
-        # Load audio using torchaudio (imported from models)
-        from models import torchaudio
         import torch
-        if torchaudio is None:
-            logger.error("[ASR] torchaudio not available")
-            return ""
         # Check if audio file exists
         if not os.path.exists(audio_path):
@@ -117,8 +114,20 @@ def transcribe_audio_whisper(audio_path: str) -> str:
             return ""
         try:
-            waveform, sample_rate = torchaudio.load(audio_path)
-            logger.info(f"[ASR] Loaded audio: shape={waveform.shape}, sample_rate={sample_rate}")
             # Ensure audio is mono (single channel)
             if waveform.shape[0] > 1:
@@ -128,9 +137,25 @@ def transcribe_audio_whisper(audio_path: str) -> str:
             # Resample to 16kHz if needed (Whisper expects 16kHz)
             if sample_rate != 16000:
                 logger.info(f"[ASR] Resampling from {sample_rate}Hz to 16000Hz")
-                resampler = torchaudio.transforms.Resample(sample_rate, 16000)
-                waveform = resampler(waveform)
-                sample_rate = 16000
             logger.info(f"[ASR] Audio ready: shape={waveform.shape}, sample_rate={sample_rate}")

 import tempfile
 import soundfile as sf
 import torch
+import numpy as np
 from logger import logger
 from client import MCP_AVAILABLE, call_agent, get_mcp_session, get_cached_mcp_tools
 import config
         model = config.global_whisper_model["model"]
         logger.info("[ASR] Loading audio file...")
         import torch
+        import numpy as np
         # Check if audio file exists
         if not os.path.exists(audio_path):
             return ""
         try:
+            # Use soundfile to load audio (more reliable, doesn't require torchcodec)
+            logger.info(f"[ASR] Loading audio with soundfile: {audio_path}")
+            audio_data, sample_rate = sf.read(audio_path, dtype='float32')
+            logger.info(f"[ASR] Loaded audio with soundfile: shape={audio_data.shape}, sample_rate={sample_rate}, dtype={audio_data.dtype}")
+            # Convert to torch tensor and ensure it's 2D (channels, samples)
+            if len(audio_data.shape) == 1:
+                # Mono audio - add channel dimension
+                waveform = torch.from_numpy(audio_data).unsqueeze(0)
+            else:
+                # Multi-channel - transpose to (channels, samples)
+                waveform = torch.from_numpy(audio_data).T
+            logger.info(f"[ASR] Converted to tensor: shape={waveform.shape}, dtype={waveform.dtype}")
             # Ensure audio is mono (single channel)
             if waveform.shape[0] > 1:
             # Resample to 16kHz if needed (Whisper expects 16kHz)
             if sample_rate != 16000:
                 logger.info(f"[ASR] Resampling from {sample_rate}Hz to 16000Hz")
+                # Use scipy or librosa for resampling if available, otherwise use simple interpolation
+                try:
+                    from scipy import signal
+                    # Resample using scipy
+                    num_samples = int(len(waveform[0]) * 16000 / sample_rate)
+                    resampled = signal.resample(waveform[0].numpy(), num_samples)
+                    waveform = torch.from_numpy(resampled).unsqueeze(0)
+                    sample_rate = 16000
+                    logger.info(f"[ASR] Resampled using scipy: new shape={waveform.shape}")
+                except ImportError:
+                    # Fallback: simple linear interpolation (scipy not available)
+                    logger.info("[ASR] scipy not available, using simple linear interpolation for resampling")
+                    num_samples = int(len(waveform[0]) * 16000 / sample_rate)
+                    waveform_1d = waveform[0].numpy()
+                    indices = np.linspace(0, len(waveform_1d) - 1, num_samples)
+                    resampled = np.interp(indices, np.arange(len(waveform_1d)), waveform_1d)
+                    waveform = torch.from_numpy(resampled).unsqueeze(0)
+                    sample_rate = 16000
+                    logger.info(f"[ASR] Resampled using simple interpolation: new shape={waveform.shape}")
             logger.info(f"[ASR] Audio ready: shape={waveform.shape}, sample_rate={sample_rate}")