Spaces:
Running
on
Zero
Running
on
Zero
Y Phung Nguyen
committed on
Commit
·
4592ab0
1
Parent(s):
2fffb9d
Fix torchcodec error
Browse files
voice.py
CHANGED
|
@@ -4,6 +4,7 @@ import asyncio
|
|
| 4 |
import tempfile
|
| 5 |
import soundfile as sf
|
| 6 |
import torch
|
|
|
|
| 7 |
from logger import logger
|
| 8 |
from client import MCP_AVAILABLE, call_agent, get_mcp_session, get_cached_mcp_tools
|
| 9 |
import config
|
|
@@ -104,12 +105,8 @@ def transcribe_audio_whisper(audio_path: str) -> str:
|
|
| 104 |
model = config.global_whisper_model["model"]
|
| 105 |
|
| 106 |
logger.info("[ASR] Loading audio file...")
|
| 107 |
-
# Load audio using torchaudio (imported from models)
|
| 108 |
-
from models import torchaudio
|
| 109 |
import torch
|
| 110 |
-
|
| 111 |
-
logger.error("[ASR] torchaudio not available")
|
| 112 |
-
return ""
|
| 113 |
|
| 114 |
# Check if audio file exists
|
| 115 |
if not os.path.exists(audio_path):
|
|
@@ -117,8 +114,20 @@ def transcribe_audio_whisper(audio_path: str) -> str:
|
|
| 117 |
return ""
|
| 118 |
|
| 119 |
try:
|
| 120 |
-
|
| 121 |
-
logger.info(f"[ASR]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 122 |
|
| 123 |
# Ensure audio is mono (single channel)
|
| 124 |
if waveform.shape[0] > 1:
|
|
@@ -128,9 +137,25 @@ def transcribe_audio_whisper(audio_path: str) -> str:
|
|
| 128 |
# Resample to 16kHz if needed (Whisper expects 16kHz)
|
| 129 |
if sample_rate != 16000:
|
| 130 |
logger.info(f"[ASR] Resampling from {sample_rate}Hz to 16000Hz")
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 134 |
|
| 135 |
logger.info(f"[ASR] Audio ready: shape={waveform.shape}, sample_rate={sample_rate}")
|
| 136 |
|
|
|
|
| 4 |
import tempfile
|
| 5 |
import soundfile as sf
|
| 6 |
import torch
|
| 7 |
+
import numpy as np
|
| 8 |
from logger import logger
|
| 9 |
from client import MCP_AVAILABLE, call_agent, get_mcp_session, get_cached_mcp_tools
|
| 10 |
import config
|
|
|
|
| 105 |
model = config.global_whisper_model["model"]
|
| 106 |
|
| 107 |
logger.info("[ASR] Loading audio file...")
|
|
|
|
|
|
|
| 108 |
import torch
|
| 109 |
+
import numpy as np
|
|
|
|
|
|
|
| 110 |
|
| 111 |
# Check if audio file exists
|
| 112 |
if not os.path.exists(audio_path):
|
|
|
|
| 114 |
return ""
|
| 115 |
|
| 116 |
try:
|
| 117 |
+
# Use soundfile to load audio (more reliable, doesn't require torchcodec)
|
| 118 |
+
logger.info(f"[ASR] Loading audio with soundfile: {audio_path}")
|
| 119 |
+
audio_data, sample_rate = sf.read(audio_path, dtype='float32')
|
| 120 |
+
logger.info(f"[ASR] Loaded audio with soundfile: shape={audio_data.shape}, sample_rate={sample_rate}, dtype={audio_data.dtype}")
|
| 121 |
+
|
| 122 |
+
# Convert to torch tensor and ensure it's 2D (channels, samples)
|
| 123 |
+
if len(audio_data.shape) == 1:
|
| 124 |
+
# Mono audio - add channel dimension
|
| 125 |
+
waveform = torch.from_numpy(audio_data).unsqueeze(0)
|
| 126 |
+
else:
|
| 127 |
+
# Multi-channel - transpose to (channels, samples)
|
| 128 |
+
waveform = torch.from_numpy(audio_data).T
|
| 129 |
+
|
| 130 |
+
logger.info(f"[ASR] Converted to tensor: shape={waveform.shape}, dtype={waveform.dtype}")
|
| 131 |
|
| 132 |
# Ensure audio is mono (single channel)
|
| 133 |
if waveform.shape[0] > 1:
|
|
|
|
| 137 |
# Resample to 16kHz if needed (Whisper expects 16kHz)
|
| 138 |
if sample_rate != 16000:
|
| 139 |
logger.info(f"[ASR] Resampling from {sample_rate}Hz to 16000Hz")
|
| 140 |
+
# Use scipy or librosa for resampling if available, otherwise use simple interpolation
|
| 141 |
+
try:
|
| 142 |
+
from scipy import signal
|
| 143 |
+
# Resample using scipy
|
| 144 |
+
num_samples = int(len(waveform[0]) * 16000 / sample_rate)
|
| 145 |
+
resampled = signal.resample(waveform[0].numpy(), num_samples)
|
| 146 |
+
waveform = torch.from_numpy(resampled).unsqueeze(0)
|
| 147 |
+
sample_rate = 16000
|
| 148 |
+
logger.info(f"[ASR] Resampled using scipy: new shape={waveform.shape}")
|
| 149 |
+
except ImportError:
|
| 150 |
+
# Fallback: simple linear interpolation (scipy not available)
|
| 151 |
+
logger.info("[ASR] scipy not available, using simple linear interpolation for resampling")
|
| 152 |
+
num_samples = int(len(waveform[0]) * 16000 / sample_rate)
|
| 153 |
+
waveform_1d = waveform[0].numpy()
|
| 154 |
+
indices = np.linspace(0, len(waveform_1d) - 1, num_samples)
|
| 155 |
+
resampled = np.interp(indices, np.arange(len(waveform_1d)), waveform_1d)
|
| 156 |
+
waveform = torch.from_numpy(resampled).unsqueeze(0)
|
| 157 |
+
sample_rate = 16000
|
| 158 |
+
logger.info(f"[ASR] Resampled using simple interpolation: new shape={waveform.shape}")
|
| 159 |
|
| 160 |
logger.info(f"[ASR] Audio ready: shape={waveform.shape}, sample_rate={sample_rate}")
|
| 161 |
|