E2-F5-TTS

Running

App Files Community

kevinwang676 committed on May 13

Commit

cbe014c

verified ·

1 Parent(s): a35e94c

Update speech_edit.py

Browse files

Files changed (1) hide show

speech_edit.py +41 -6

speech_edit.py CHANGED Viewed

@@ -8,7 +8,8 @@ CosyVoice gRPC back‑end – updated to mirror the FastAPI logic
 *   inference_instruct2  ➜  new:  prompt‑audio + speed (no speaker‑ID)
 """
-import io, os, tempfile, requests, soundfile as sf, torchaudio
 import sys
 from concurrent import futures
 import argparse
@@ -148,14 +149,48 @@ class CosyVoiceServiceImpl(cosyvoice_pb2_grpc.CosyVoiceServicer):
         if request.HasField("cross_lingual_request"):
             logging.info("Received cross‑lingual inference request")
             cr = request.cross_lingual_request
-            prompt = _bytes_to_tensor(cr.prompt_audio)
-            mo = self.cosyvoice.inference_cross_lingual(
-                cr.tts_text,
-                prompt
-            )
             yield from _yield_audio(mo)
             return
         # 4. Instruction‑TTS (two flavours)
         if request.HasField("instruct_request"):
             ir = request.instruct_request

 *   inference_instruct2  ➜  new:  prompt‑audio + speed (no speaker‑ID)
 """
+import io, tempfile, requests, soundfile as sf, torchaudio
+import os
 import sys
 from concurrent import futures
 import argparse
         if request.HasField("cross_lingual_request"):
             logging.info("Received cross‑lingual inference request")
             cr = request.cross_lingual_request
+            tmp_path = None
+            try:
+                if cr.prompt_audio.startswith(b'http'):          # S3 URL case
+                    url = cr.prompt_audio.decode('utf‑8')
+                    logging.info("Downloading cross‑lingual prompt from %s", url)
+                    resp = requests.get(url, timeout=10)
+                    resp.raise_for_status()
+                    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
+                        f.write(resp.content)
+                        tmp_path = f.name
+                    wav, sr = sf.read(tmp_path, dtype='float32')
+                    if wav.ndim > 1:
+                        wav = wav.mean(axis=1)
+                    if sr != 16_000:
+                        wav = torchaudio.functional.resample(
+                            torch.from_numpy(wav).unsqueeze(0), sr, 16_000
+                        )[0].numpy()
+                    prompt = torch.from_numpy(wav).unsqueeze(0)
+                else:                                           # legacy raw bytes
+                    prompt = _bytes_to_tensor(cr.prompt_audio)
+                mo = self.cosyvoice.inference_cross_lingual(
+                    cr.tts_text,
+                    prompt
+                )
+            finally:
+                if tmp_path and os.path.exists(tmp_path):
+                    try:
+                        os.remove(tmp_path)
+                    except Exception as e:
+                        logging.warning("Could not remove temp file %s: %s",
+                                        tmp_path, e)
             yield from _yield_audio(mo)
             return
         # 4. Instruction‑TTS (two flavours)
         if request.HasField("instruct_request"):
             ir = request.instruct_request