import os
import spaces
import gradio as gr
import torch
import tempfile
import requests
import json
import soundfile as sf
import torchaudio

# --- GPU device setup -------------------------------------------------------
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"[device] Using device: {device}")

# --- Hugging Face Inference API credentials ---------------------------------
# HF_TOKEN must be provided as a Space secret / environment variable.
HF_TOKEN = os.getenv("HF_TOKEN")
HEADERS = {"Authorization": f"Bearer {HF_TOKEN}"}
print(f"[auth] HF_TOKEN configured: {'yes' if HF_TOKEN else 'no'}")


def generate_base_speech(text):
    """Generate speech from *text* via the HF Inference API (VITS TTS).

    Returns the path of a temporary WAV file on success, or None when the
    token is missing or the API call fails.
    """
    if not HF_TOKEN:
        print("[error] HF_TOKEN not configured!")
        return None
    # NOTE: the original endpoint used the Coqui-TTS model name
    # "tts_models/en/ljspeech/vits", which is not a valid HF repo id and
    # always 404s on the Inference API. espnet/kan-bayashi_ljspeech_vits is
    # the equivalent hosted VITS/LJSpeech model.
    endpoint = "https://api-inference.huggingface.co/models/espnet/kan-bayashi_ljspeech_vits"
    response = requests.post(endpoint, headers=HEADERS, json={"inputs": text}, timeout=60)
    print(f"[tts] API response: {response.status_code}")
    if response.status_code == 200:
        # Response body is raw audio bytes; write it to a temp WAV file.
        out = tempfile.NamedTemporaryFile(delete=False, suffix=".wav").name
        with open(out, "wb") as f:
            f.write(response.content)
        return out
    # Only dump the body on failure — on success it is binary audio.
    print(f"[tts] failure body: {response.text}")
    return None


@spaces.GPU(duration=120)
def voice_cloning_pipeline(text, reference_audio_path):
    """Voice-cloning pipeline: TTS base speech + SpeechT5 voice conversion.

    Returns the converted audio path on success, the un-converted base
    speech path when conversion fails, or None when TTS itself fails.
    """
    if not HF_TOKEN:
        return None

    print("[vc] Generating base speech...")
    base_speech = generate_base_speech(text)
    if not base_speech:
        print("[vc] Failed to generate base speech")
        return None

    # Trim the reference to at most 10 seconds and resample to 16 kHz
    # (SpeechT5 expects 16 kHz input).
    wav, sr = torchaudio.load(reference_audio_path)
    wav = wav[:, : min(sr * 10, wav.shape[1])]
    if sr != 16000:
        # BUG FIX: the original called torchaudio.save(..., sr=16000), which
        # raises TypeError (no `sr` kwarg) and would have mislabeled the
        # sample rate anyway; resample first, then save at 16 kHz.
        wav = torchaudio.functional.resample(wav, sr, 16000)
    tmp_ref = tempfile.NamedTemporaryFile(delete=False, suffix=".wav").name
    torchaudio.save(tmp_ref, wav, 16000)

    print("[vc] Calling voice-conversion endpoint...")
    with open(base_speech, "rb") as bf, open(tmp_ref, "rb") as rf:
        files = {"source_audio": bf, "target_audio": rf}
        vc_endpoint = "https://api-inference.huggingface.co/models/microsoft/speecht5_vc"
        response = requests.post(vc_endpoint, headers=HEADERS, files=files, timeout=60)

    print(f"[vc] API response: {response.status_code}")
    if response.status_code == 200:
        out = tempfile.NamedTemporaryFile(delete=False, suffix=".wav").name
        with open(out, "wb") as f:
            f.write(response.content)
        return out
    print(f"[vc] failure body: {response.text}")
    # Graceful degradation: fall back to the un-cloned base speech.
    return base_speech


@spaces.GPU(duration=180)
def avatar_video_pipeline(audio_path, reference_video_path):
    """Lip-synced avatar video generation via the SadTalker HF endpoint.

    Returns the generated MP4 path on success; falls back to the original
    reference video when the token is missing or the API call fails.
    """
    if not HF_TOKEN:
        return reference_video_path

    with open(audio_path, "rb") as af, open(reference_video_path, "rb") as vf:
        files = {"audio": af, "video": vf}
        endpoint = "https://api-inference.huggingface.co/models/vinthony/SadTalker"
        response = requests.post(endpoint, headers=HEADERS, files=files, timeout=180)

    print(f"[avatar] API response: {response.status_code}")
    if response.status_code == 200:
        out = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4").name
        with open(out, "wb") as f:
            f.write(response.content)
        return out
    print(f"[avatar] failure body: {response.text}")
    return reference_video_path


@spaces.GPU(duration=300)
def full_ai_avatar_pipeline(reference_audio, text_script, reference_video):
    """Complete AI-avatar pipeline: clone voice, then animate the avatar.

    Returns a (status_message, cloned_audio_path, video_path) tuple suitable
    for the three Gradio outputs; paths are None on failure.
    """
    if not HF_TOKEN:
        return ("CONFIGURATION ERROR!\nHF_TOKEN missing in secrets.", None, None)

    cloned = voice_cloning_pipeline(text_script, reference_audio)
    if not cloned:
        return ("VOICE CLONING FAILED!\nCheck logs for details.", None, None)

    video = avatar_video_pipeline(cloned, reference_video)
    return ("AI AVATAR GENERATION COMPLETE!", cloned, video)


with gr.Blocks(theme=gr.themes.Soft(), title="AI Avatar SaaS - Professional Prototype") as demo:
    # NOTE(review): the original source is truncated here, mid-`gr.HTML("` call.
    # The placeholder below keeps the file syntactically valid; restore the
    # original UI markup and wiring (inputs -> full_ai_avatar_pipeline -> outputs).
    gr.HTML("")