Y Phung Nguyen committed on
Commit
4592ab0
·
1 Parent(s): 2fffb9d

Fix torchcodec error

Browse files
Files changed (1) hide show
  1. voice.py +35 -10
voice.py CHANGED
@@ -4,6 +4,7 @@ import asyncio
4
  import tempfile
5
  import soundfile as sf
6
  import torch
 
7
  from logger import logger
8
  from client import MCP_AVAILABLE, call_agent, get_mcp_session, get_cached_mcp_tools
9
  import config
@@ -104,12 +105,8 @@ def transcribe_audio_whisper(audio_path: str) -> str:
104
  model = config.global_whisper_model["model"]
105
 
106
  logger.info("[ASR] Loading audio file...")
107
- # Load audio using torchaudio (imported from models)
108
- from models import torchaudio
109
  import torch
110
- if torchaudio is None:
111
- logger.error("[ASR] torchaudio not available")
112
- return ""
113
 
114
  # Check if audio file exists
115
  if not os.path.exists(audio_path):
@@ -117,8 +114,20 @@ def transcribe_audio_whisper(audio_path: str) -> str:
117
  return ""
118
 
119
  try:
120
- waveform, sample_rate = torchaudio.load(audio_path)
121
- logger.info(f"[ASR] Loaded audio: shape={waveform.shape}, sample_rate={sample_rate}")
 
 
 
 
 
 
 
 
 
 
 
 
122
 
123
  # Ensure audio is mono (single channel)
124
  if waveform.shape[0] > 1:
@@ -128,9 +137,25 @@ def transcribe_audio_whisper(audio_path: str) -> str:
128
  # Resample to 16kHz if needed (Whisper expects 16kHz)
129
  if sample_rate != 16000:
130
  logger.info(f"[ASR] Resampling from {sample_rate}Hz to 16000Hz")
131
- resampler = torchaudio.transforms.Resample(sample_rate, 16000)
132
- waveform = resampler(waveform)
133
- sample_rate = 16000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
134
 
135
  logger.info(f"[ASR] Audio ready: shape={waveform.shape}, sample_rate={sample_rate}")
136
 
 
4
  import tempfile
5
  import soundfile as sf
6
  import torch
7
+ import numpy as np
8
  from logger import logger
9
  from client import MCP_AVAILABLE, call_agent, get_mcp_session, get_cached_mcp_tools
10
  import config
 
105
  model = config.global_whisper_model["model"]
106
 
107
  logger.info("[ASR] Loading audio file...")
 
 
108
  import torch
109
+ import numpy as np
 
 
110
 
111
  # Check if audio file exists
112
  if not os.path.exists(audio_path):
 
114
  return ""
115
 
116
  try:
117
+ # Use soundfile to load audio (more reliable, doesn't require torchcodec)
118
+ logger.info(f"[ASR] Loading audio with soundfile: {audio_path}")
119
+ audio_data, sample_rate = sf.read(audio_path, dtype='float32')
120
+ logger.info(f"[ASR] Loaded audio with soundfile: shape={audio_data.shape}, sample_rate={sample_rate}, dtype={audio_data.dtype}")
121
+
122
+ # Convert to torch tensor and ensure it's 2D (channels, samples)
123
+ if len(audio_data.shape) == 1:
124
+ # Mono audio - add channel dimension
125
+ waveform = torch.from_numpy(audio_data).unsqueeze(0)
126
+ else:
127
+ # Multi-channel - transpose to (channels, samples)
128
+ waveform = torch.from_numpy(audio_data).T
129
+
130
+ logger.info(f"[ASR] Converted to tensor: shape={waveform.shape}, dtype={waveform.dtype}")
131
 
132
  # Ensure audio is mono (single channel)
133
  if waveform.shape[0] > 1:
 
137
  # Resample to 16kHz if needed (Whisper expects 16kHz)
138
  if sample_rate != 16000:
139
  logger.info(f"[ASR] Resampling from {sample_rate}Hz to 16000Hz")
140
+ # Use scipy for resampling if available; otherwise fall back to simple linear interpolation
141
+ try:
142
+ from scipy import signal
143
+ # Resample using scipy
144
+ num_samples = int(len(waveform[0]) * 16000 / sample_rate)
145
+ resampled = signal.resample(waveform[0].numpy(), num_samples)
146
+ waveform = torch.from_numpy(resampled).unsqueeze(0)
147
+ sample_rate = 16000
148
+ logger.info(f"[ASR] Resampled using scipy: new shape={waveform.shape}")
149
+ except ImportError:
150
+ # Fallback: simple linear interpolation (scipy not available)
151
+ logger.info("[ASR] scipy not available, using simple linear interpolation for resampling")
152
+ num_samples = int(len(waveform[0]) * 16000 / sample_rate)
153
+ waveform_1d = waveform[0].numpy()
154
+ indices = np.linspace(0, len(waveform_1d) - 1, num_samples)
155
+ resampled = np.interp(indices, np.arange(len(waveform_1d)), waveform_1d)
156
+ waveform = torch.from_numpy(resampled).unsqueeze(0)
157
+ sample_rate = 16000
158
+ logger.info(f"[ASR] Resampled using simple interpolation: new shape={waveform.shape}")
159
 
160
  logger.info(f"[ASR] Audio ready: shape={waveform.shape}, sample_rate={sample_rate}")
161