Spaces:
Running
Running
Update speech_edit.py
Browse files
- speech_edit.py +41 -6
speech_edit.py
CHANGED
|
@@ -8,7 +8,8 @@ CosyVoice gRPC back‑end – updated to mirror the FastAPI logic
|
|
| 8 |
* inference_instruct2 ➜ new: prompt‑audio + speed (no speaker‑ID)
|
| 9 |
"""
|
| 10 |
|
| 11 |
-
import io,
|
|
|
|
| 12 |
import sys
|
| 13 |
from concurrent import futures
|
| 14 |
import argparse
|
|
@@ -148,14 +149,48 @@ class CosyVoiceServiceImpl(cosyvoice_pb2_grpc.CosyVoiceServicer):
|
|
| 148 |
if request.HasField("cross_lingual_request"):
|
| 149 |
logging.info("Received cross‑lingual inference request")
|
| 150 |
cr = request.cross_lingual_request
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 156 |
yield from _yield_audio(mo)
|
| 157 |
return
|
| 158 |
|
|
|
|
| 159 |
# 4. Instruction‑TTS (two flavours)
|
| 160 |
if request.HasField("instruct_request"):
|
| 161 |
ir = request.instruct_request
|
|
|
|
| 8 |
* inference_instruct2 ➜ new: prompt‑audio + speed (no speaker‑ID)
|
| 9 |
"""
|
| 10 |
|
| 11 |
+
import io, tempfile, requests, soundfile as sf, torchaudio
|
| 12 |
+
import os
|
| 13 |
import sys
|
| 14 |
from concurrent import futures
|
| 15 |
import argparse
|
|
|
|
| 149 |
if request.HasField("cross_lingual_request"):
|
| 150 |
logging.info("Received cross‑lingual inference request")
|
| 151 |
cr = request.cross_lingual_request
|
| 152 |
+
tmp_path = None
|
| 153 |
+
|
| 154 |
+
try:
|
| 155 |
+
if cr.prompt_audio.startswith(b'http'): # S3 URL case
|
| 156 |
+
url = cr.prompt_audio.decode('utf‑8')
|
| 157 |
+
logging.info("Downloading cross‑lingual prompt from %s", url)
|
| 158 |
+
resp = requests.get(url, timeout=10)
|
| 159 |
+
resp.raise_for_status()
|
| 160 |
+
|
| 161 |
+
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
|
| 162 |
+
f.write(resp.content)
|
| 163 |
+
tmp_path = f.name
|
| 164 |
+
|
| 165 |
+
wav, sr = sf.read(tmp_path, dtype='float32')
|
| 166 |
+
if wav.ndim > 1:
|
| 167 |
+
wav = wav.mean(axis=1)
|
| 168 |
+
if sr != 16_000:
|
| 169 |
+
wav = torchaudio.functional.resample(
|
| 170 |
+
torch.from_numpy(wav).unsqueeze(0), sr, 16_000
|
| 171 |
+
)[0].numpy()
|
| 172 |
+
prompt = torch.from_numpy(wav).unsqueeze(0)
|
| 173 |
+
|
| 174 |
+
else: # legacy raw bytes
|
| 175 |
+
prompt = _bytes_to_tensor(cr.prompt_audio)
|
| 176 |
+
|
| 177 |
+
mo = self.cosyvoice.inference_cross_lingual(
|
| 178 |
+
cr.tts_text,
|
| 179 |
+
prompt
|
| 180 |
+
)
|
| 181 |
+
|
| 182 |
+
finally:
|
| 183 |
+
if tmp_path and os.path.exists(tmp_path):
|
| 184 |
+
try:
|
| 185 |
+
os.remove(tmp_path)
|
| 186 |
+
except Exception as e:
|
| 187 |
+
logging.warning("Could not remove temp file %s: %s",
|
| 188 |
+
tmp_path, e)
|
| 189 |
+
|
| 190 |
yield from _yield_audio(mo)
|
| 191 |
return
|
| 192 |
|
| 193 |
+
|
| 194 |
# 4. Instruction‑TTS (two flavours)
|
| 195 |
if request.HasField("instruct_request"):
|
| 196 |
ir = request.instruct_request
|