Spaces:

Qwen
/

Qwen-TTS-Clone-Demo

Running

App Files Files Community

littlebird13 committed 21 days ago

Commit

04fb7a1

verified ·

1 Parent(s): 3abe8c9

Create app.py

Browse files

Files changed (1) hide show

app.py +286 -0

app.py ADDED Viewed

	@@ -0,0 +1,286 @@

+import gradio as gr
+import os
+import requests
+import base64
+import pathlib
+import threading
+import tempfile
+from dashscope.audio.qwen_tts_realtime import QwenTtsRealtime, QwenTtsRealtimeCallback, AudioFormat
+import dashscope
+import wave
+import numpy as np
+# ======= Constants Configuration =======
+# Realtime TTS model name: sent as "target_model" when enrolling a voice and
+# passed as `model` when opening the QwenTtsRealtime session.
+DEFAULT_TARGET_MODEL = "qwen3-tts-vc-realtime-2025-11-27"
+# Default "preferred_name" sent with the voice-enrollment request.
+DEFAULT_PREFERRED_NAME = "custom_voice"
+# Default MIME type placed in the data URI that carries the reference audio.
+DEFAULT_AUDIO_MIME_TYPE = "audio/wav"
def init_dashscope_api_key():
    """Initialize the API key for dashscope SDK.

    The key is read from the ``API_KEY`` environment variable, falling back to
    ``DASHSCOPE_API_KEY`` (the name used in the user-facing messages).

    Returns:
        The API key that was assigned to ``dashscope.api_key``.

    Raises:
        ValueError: If neither variable is set to a non-blank value.
    """
    # Use .get() so a missing variable reaches the ValueError below instead of
    # escaping as a KeyError that callers do not expect.
    for env_name in ('API_KEY', 'DASHSCOPE_API_KEY'):
        api_key = os.environ.get(env_name, '').strip()
        if api_key:
            dashscope.api_key = api_key
            return api_key
    raise ValueError("Please set the environment variable DASHSCOPE_API_KEY")
def create_voice(file_path: str,
                 target_model: str = DEFAULT_TARGET_MODEL,
                 preferred_name: str = DEFAULT_PREFERRED_NAME,
                 audio_mime_type: str = DEFAULT_AUDIO_MIME_TYPE) -> str:
    """Create voice and return the voice parameter.

    The audio file is base64-encoded into a data URI and posted to the
    voice-enrollment endpoint; the ``output.voice`` field of the JSON reply is
    returned.

    Args:
        file_path: Path to the reference audio file.
        target_model: Value sent as ``target_model`` in the request.
        preferred_name: Value sent as ``preferred_name`` in the request.
        audio_mime_type: MIME type used in the audio data URI.

    Returns:
        The voice identifier from the response.

    Raises:
        ValueError: If no API key is configured in the environment.
        FileNotFoundError: If ``file_path`` does not exist.
        RuntimeError: On SSL errors, timeouts, non-200 responses, or a
            response body that cannot be parsed.
    """
    from requests.adapters import HTTPAdapter
    from urllib3.util.retry import Retry

    # Same lookup order as init_dashscope_api_key; a missing key surfaces as a
    # ValueError rather than a bare KeyError.
    api_key = (os.environ.get('API_KEY', '').strip()
               or os.environ.get('DASHSCOPE_API_KEY', '').strip())
    if not api_key:
        raise ValueError("Please set the environment variable DASHSCOPE_API_KEY")

    file_path_obj = pathlib.Path(file_path)
    if not file_path_obj.exists():
        raise FileNotFoundError(f"Audio file not found: {file_path}")

    base64_str = base64.b64encode(file_path_obj.read_bytes()).decode()
    data_uri = f"data:{audio_mime_type};base64,{base64_str}"
    url = "https://dashscope.aliyuncs.com/api/v1/services/audio/tts/customization"
    payload = {
        "model": "qwen-voice-enrollment",
        "input": {
            "action": "create",
            "target_model": target_model,
            "preferred_name": preferred_name,
            "audio": {"data": data_uri},
        },
    }
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }

    # Retry transient failures (rate limiting / 5xx) with exponential backoff.
    # NOTE(review): retrying a "create" POST could enroll the voice twice if an
    # earlier attempt succeeded server-side — confirm this is acceptable.
    retry_strategy = Retry(
        total=3,
        backoff_factor=1,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["POST"],
    )
    adapter = HTTPAdapter(max_retries=retry_strategy)

    # The context manager guarantees the session is closed on every path.
    with requests.Session() as session:
        # If SSL errors persist, temporarily change to False (for testing only)
        session.verify = True  # Enable SSL verification
        session.mount("https://", adapter)
        session.mount("http://", adapter)
        try:
            resp = session.post(url, json=payload, headers=headers, timeout=60)
        except requests.exceptions.SSLError as e:
            raise RuntimeError(
                f"SSL connection error: {e}. Please check network environment or try using a proxy"
            ) from e
        except requests.exceptions.Timeout as e:
            raise RuntimeError(f"Request timeout: {e}") from e

    if resp.status_code != 200:
        raise RuntimeError(f"Failed to create voice: {resp.status_code}, {resp.text}")
    try:
        return resp.json()["output"]["voice"]
    except (KeyError, TypeError, ValueError) as e:
        # ValueError covers an invalid JSON body; KeyError/TypeError cover a
        # body that does not have the expected {"output": {"voice": ...}} shape.
        raise RuntimeError(f"Failed to parse voice response: {e}") from e
class TTSCallback(QwenTtsRealtimeCallback):
    """TTS streaming callback for collecting audio data.

    Decoded audio chunks are accumulated in ``audio_chunks``. ``complete_event``
    is set when the session finishes, when event handling raises, or when the
    connection closes before the session finished; in the two failure cases
    ``error_msg`` describes what went wrong.
    """

    def __init__(self):
        self.complete_event = threading.Event()
        self.audio_chunks = []
        self.error_msg = None

    def on_open(self) -> None:
        print('[TTS] Connection established')

    def on_close(self, close_status_code, close_msg) -> None:
        print(f'[TTS] Connection closed code={close_status_code}, msg={close_msg}')
        # If the connection drops before 'session.finished' arrives, nothing
        # else would ever set the event and waiters would block forever.
        if not self.complete_event.is_set():
            if self.error_msg is None:
                self.error_msg = (
                    'Connection closed before the session finished '
                    f'(code={close_status_code}, msg={close_msg})'
                )
            self.complete_event.set()

    def on_event(self, response: dict) -> None:
        try:
            event_type = response.get('type', '')
            if event_type == 'session.created':
                print(f'[TTS] Session started: {response["session"]["id"]}')
            elif event_type == 'response.audio.delta':
                # Audio arrives base64-encoded in the 'delta' field.
                audio_data = base64.b64decode(response['delta'])
                self.audio_chunks.append(audio_data)
            elif event_type == 'response.done':
                print('[TTS] Response completed')
            elif event_type == 'session.finished':
                print('[TTS] Session finished')
                self.complete_event.set()
        except Exception as e:
            # Record the failure and release any waiter instead of letting the
            # exception escape into the caller of this callback.
            self.error_msg = str(e)
            print(f'[Error] Exception while processing callback event: {e}')
            self.complete_event.set()

    def wait_for_finished(self, timeout=None):
        """Block until the session completes or fails.

        Args:
            timeout: Optional maximum number of seconds to wait; ``None``
                waits indefinitely.

        Returns:
            True if the completion event was set, False if the timeout expired.
        """
        return self.complete_event.wait(timeout)

    def get_audio_data(self):
        """Return the synthesized audio data"""
        return b''.join(self.audio_chunks)
# Upper bound on how long to wait for the TTS session to report completion.
_SYNTHESIS_TIMEOUT_SECONDS = 300


def _write_pcm_as_wav(pcm_bytes):
    """Write 24 kHz mono 16-bit PCM bytes to a new temporary WAV file and return its path."""
    # delete=False: the file must outlive this function so the caller can serve it.
    with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as tmp_file:
        # Write through the already-open handle; reopening the same path while
        # it is still open fails on platforms with exclusive file locking.
        with wave.open(tmp_file, 'wb') as wav_file:
            wav_file.setnchannels(1)  # Mono
            wav_file.setsampwidth(2)  # 16bit
            wav_file.setframerate(24000)  # 24kHz
            wav_file.writeframes(pcm_bytes)
        return tmp_file.name


def synthesize_speech(audio_file, text_input):
    """
    Main function for speech synthesis
    Args:
        audio_file: Path to the recorded audio file (from Gradio audio component)
        text_input: Text to synthesize
    Returns:
        A tuple ``(audio_path, status_message)``. ``audio_path`` is the path to
        the synthesized WAV file, or None when validation or synthesis failed;
        ``status_message`` describes the outcome. Exceptions are caught and
        reported through ``status_message`` rather than raised.
    """
    tts_client = None
    connected = False
    try:
        if not audio_file:
            return None, "❌ Please record a voice sample first"
        if not text_input or not text_input.strip():
            return None, "❌ Please enter the text to synthesize"
        # Initialize API Key
        init_dashscope_api_key()
        # Create voice clone
        print("🎤 Creating voice clone...")
        voice_id = create_voice(audio_file, audio_mime_type="audio/wav")
        # Initialize TTS
        print("🔊 Synthesizing speech...")
        callback = TTSCallback()
        tts_client = QwenTtsRealtime(
            model=DEFAULT_TARGET_MODEL,
            callback=callback,
            url='wss://dashscope.aliyuncs.com/api-ws/v1/realtime'
        )
        tts_client.connect()
        connected = True
        # Update session configuration; the PCM format here must match the
        # WAV parameters used when the audio is written out.
        tts_client.update_session(
            voice=voice_id,
            response_format=AudioFormat.PCM_24000HZ_MONO_16BIT,
            mode='server_commit'
        )
        # Send text
        tts_client.append_text(text_input)
        tts_client.finish()
        # Wait for completion, but never indefinitely: a stalled session would
        # otherwise hang this request forever.
        if not callback.complete_event.wait(timeout=_SYNTHESIS_TIMEOUT_SECONDS):
            return None, "❌ Synthesis failed: timed out waiting for the TTS session to finish"
        if callback.error_msg:
            return None, f"❌ Synthesis failed: {callback.error_msg}"
        # Get audio data and save as WAV file
        audio_data = callback.get_audio_data()
        if not audio_data:
            return None, "❌ No audio data generated"
        output_path = _write_pcm_as_wav(audio_data)
        success_msg = f"✅ Synthesis successful! Session ID: {tts_client.get_session_id()}"
        print(success_msg)
        return output_path, success_msg
    except Exception as e:
        # Top-level boundary for the UI handler: report instead of raising.
        error_msg = f"❌ An error occurred: {str(e)}"
        print(error_msg)
        return None, error_msg
    finally:
        if connected:
            # Best-effort release of the realtime connection.
            # NOTE(review): assumes QwenTtsRealtime exposes close() — confirm
            # against the SDK; any failure here is logged and ignored.
            try:
                tts_client.close()
            except Exception as close_err:
                print(f'[TTS] Failed to close connection: {close_err}')
+# ======= Gradio Interface =======
def create_gradio_interface():
    """Build the Gradio Blocks UI for the demo and return it (not launched).

    The layout is two columns: the left one holds the microphone recorder, the
    text box and the submit button; the right one shows the status text and
    the synthesized audio. Clicking the button calls ``synthesize_speech``
    with the recording and the text, and routes its two return values to the
    audio player and the status box respectively.
    """
    intro_markdown = """
        # 🎙️ Qwen Voice Cloning and Synthesis
        **Usage Steps:**
        1. Click the microphone icon to record a voice sample (recommended 10-30 seconds, clear and natural)
        2. Enter the text content to synthesize
        3. Click the "Start Synthesis" button
        4. Wait for synthesis to complete, then play or download the result
        **Notes:**
        - Please ensure the environment variable `DASHSCOPE_API_KEY` is set
        - Better recording quality leads to better synthesis results
        """
    footer_markdown = """
        ---
        💡 **Tip:** For better results, please ensure a quiet recording environment and clear, natural pronunciation.
        """
    default_text = "Hello, this is a voice synthesized using voice cloning technology."

    with gr.Blocks(title="Qwen Voice Cloning and Synthesis", theme=gr.themes.Soft()) as demo:
        gr.Markdown(intro_markdown)

        with gr.Row():
            # Left column: inputs.
            with gr.Column(scale=1):
                gr.Markdown("### Step 1: Record Voice Sample")
                voice_recorder = gr.Audio(
                    sources=["microphone"],
                    type="filepath",
                    label="Record Voice",
                    format="wav",
                )
                gr.Markdown("### Step 2: Enter Text to Synthesize")
                script_box = gr.Textbox(
                    label="Text to Synthesize",
                    placeholder="Please enter the text content to synthesize...",
                    lines=5,
                    value=default_text,
                )
                synth_button = gr.Button("🎵 Start Synthesis", variant="primary", size="lg")

            # Right column: outputs.
            with gr.Column(scale=1):
                gr.Markdown("### Synthesis Result")
                status_box = gr.Textbox(
                    label="Status Information",
                    interactive=False,
                    lines=2,
                )
                result_player = gr.Audio(
                    label="Synthesized Voice",
                    type="filepath",
                )

        # Output order matches synthesize_speech's (audio_path, status) return.
        handler_inputs = [voice_recorder, script_box]
        handler_outputs = [result_player, status_box]
        synth_button.click(
            fn=synthesize_speech,
            inputs=handler_inputs,
            outputs=handler_outputs,
        )

        gr.Markdown(footer_markdown)

    return demo
if __name__ == "__main__":
    # Check that an API key is configured. A missing key is only a warning so
    # the UI can still start; synthesis requests will report the error later.
    try:
        init_dashscope_api_key()
        print("✅ API Key verified successfully")
    except KeyError as e:
        # Raised when the environment variable is looked up by subscript and
        # is absent; str(e) is the quoted variable name.
        print(f"⚠️  Warning: environment variable {e} is not set")
        print("Please set the environment variable: export API_KEY='your-api-key'")
    except ValueError as e:
        print(f"⚠️  Warning: {e}")
        print("Please set the environment variable: export API_KEY='your-api-key'")
    demo = create_gradio_interface()
    # Listen on all interfaces at port 7860 without creating a public share link.
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False
    )