import os
import json
import tempfile

import gradio as gr
import requests
import soundfile as sf
import spaces
import torch
import torchaudio

# GPU device setup (informational only — the pipelines below call hosted
# HF Inference APIs; no model runs locally on this device).
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"πŸš€ Using device: {device}")

# HF Token setup — read from the Space's secrets.
HF_TOKEN = os.getenv("HF_TOKEN")
HEADERS = {"Authorization": f"Bearer {HF_TOKEN}"}
print(f"πŸ”‘ HF_TOKEN configured: {'βœ… Yes' if HF_TOKEN else '❌ No'}")

# Sample rate the SpeechT5 voice-conversion endpoint expects.
VC_SAMPLE_RATE = 16000
# Reference clips are trimmed to at most this many seconds.
MAX_REF_SECONDS = 10


def generate_base_speech(text):
    """Generate speech from *text* via the HF TTS inference API (24 kHz WAV).

    Returns the path of a temporary WAV file on success, or ``None`` when the
    token is missing or the API call fails.
    """
    if not HF_TOKEN:
        print("❌ HF_TOKEN not configured!")
        return None
    # NOTE(review): "tts_models/en/ljspeech/vits" is a Coqui-TTS model name,
    # not a valid HF hub repo id (owner/name) — confirm this endpoint exists.
    endpoint = "https://api-inference.huggingface.co/models/tts_models/en/ljspeech/vits"
    response = requests.post(endpoint, headers=HEADERS, json={"inputs": text}, timeout=60)
    print(f"πŸ“Š TTS API Response: {response.status_code}")
    if response.status_code == 200:
        out = tempfile.NamedTemporaryFile(delete=False, suffix=".wav").name
        with open(out, "wb") as f:
            f.write(response.content)
        return out
    # Only dump the body on failure — a successful body is binary audio.
    print(f"❌ TTS API error body: {response.text}")
    return None


@spaces.GPU(duration=120)
def voice_cloning_pipeline(text, reference_audio_path):
    """Clone the voice from *reference_audio_path* onto spoken *text*.

    Generates base speech with TTS, then runs voice conversion against the
    reference clip.  Falls back to the un-converted base speech when the
    conversion call fails; returns ``None`` when TTS itself fails or the
    token is missing.
    """
    if not HF_TOKEN:
        return None
    print("🎀 Generating base speech…")
    base_speech = generate_base_speech(text)
    if not base_speech:
        print("❌ Failed to generate base speech")
        return None

    # Trim the reference clip and resample it to the rate the VC model
    # expects.  (Bug fix: the original passed the invalid keyword `sr=` to
    # torchaudio.save — the parameter is `sample_rate` — and never resampled,
    # which would mislabel the waveform and change playback speed.)
    wav, sr = torchaudio.load(reference_audio_path)
    wav = wav[:, : min(sr * MAX_REF_SECONDS, wav.shape[1])]
    if sr != VC_SAMPLE_RATE:
        wav = torchaudio.functional.resample(wav, sr, VC_SAMPLE_RATE)
    tmp_ref = tempfile.NamedTemporaryFile(delete=False, suffix=".wav").name
    torchaudio.save(tmp_ref, wav, sample_rate=VC_SAMPLE_RATE)

    print("πŸ”„ Converting for API and calling VC endpoint…")
    vc_endpoint = "https://api-inference.huggingface.co/models/microsoft/speecht5_vc"
    with open(base_speech, "rb") as bf, open(tmp_ref, "rb") as rf:
        files = {"source_audio": bf, "target_audio": rf}
        response = requests.post(vc_endpoint, headers=HEADERS, files=files, timeout=60)
    print(f"πŸ“Š Voice Conversion API Response: {response.status_code}")
    if response.status_code == 200:
        out = tempfile.NamedTemporaryFile(delete=False, suffix=".wav").name
        with open(out, "wb") as f:
            f.write(response.content)
        return out
    print(f"❌ VC API error body: {response.text}")
    # Best-effort fallback: return the plain TTS audio.
    return base_speech


@spaces.GPU(duration=180)
def avatar_video_pipeline(audio_path, reference_video_path):
    """Generate a talking-avatar video driven by *audio_path* via SadTalker.

    Falls back to returning the untouched reference video when the token is
    missing or the API call fails.
    """
    if not HF_TOKEN:
        return reference_video_path
    endpoint = "https://api-inference.huggingface.co/models/vinthony/SadTalker"
    with open(audio_path, "rb") as af, open(reference_video_path, "rb") as vf:
        files = {"audio": af, "video": vf}
        response = requests.post(endpoint, headers=HEADERS, files=files, timeout=180)
    print(f"πŸ“Š Avatar API Response: {response.status_code}")
    if response.status_code == 200:
        out = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4").name
        with open(out, "wb") as f:
            f.write(response.content)
        return out
    print(f"❌ Avatar API error body: {response.text}")
    return reference_video_path


@spaces.GPU(duration=300)
def full_ai_avatar_pipeline(reference_audio, text_script, reference_video):
    """Complete AI Avatar SaaS pipeline: voice cloning, then avatar video.

    Returns ``(status_message, cloned_audio_path, video_path)`` — the latter
    two are ``None`` on failure so the Gradio outputs clear gracefully.
    """
    if not HF_TOKEN:
        return ("❌ CONFIGURATION ERROR!\nπŸ”‘ HF_TOKEN missing in secrets.", None, None)
    cloned = voice_cloning_pipeline(text_script, reference_audio)
    if not cloned:
        return ("❌ VOICE CLONING FAILED!\nCheck logs for details.", None, None)
    video = avatar_video_pipeline(cloned, reference_video)
    return ("βœ… AI AVATAR GENERATION COMPLETE!", cloned, video)


with gr.Blocks(theme=gr.themes.Soft(), title="AI Avatar SaaS - Professional Prototype") as demo:
    # The original markup was lost in extraction; a minimal centered heading
    # preserving the visible title text.
    gr.HTML("<h1 style='text-align: center'>🎭 AI Avatar SaaS - Professional Prototype</h1>")
    with gr.Row():
        with gr.Column():
            reference_audio = gr.Audio(label="🎀 Reference Audio", type="filepath")
            text_input = gr.Textbox(label="πŸ“ Script Text", lines=5, placeholder="Enter script…")
            # Bug fix: gr.Video takes no `type` parameter (it is always
            # filepath-based); passing one raises TypeError at build time.
            reference_video = gr.Video(label="🎬 Reference Video")
            btn = gr.Button("πŸš€ Generate AI Avatar")
        with gr.Column():
            status = gr.Textbox(label="πŸ”„ Status", lines=5, interactive=False)
            cloned_out = gr.Audio(label="πŸ”Š Cloned Voice", type="filepath")
            video_out = gr.Video(label="🎭 Final Avatar Video")
    btn.click(
        full_ai_avatar_pipeline,
        inputs=[reference_audio, text_input, reference_video],
        outputs=[status, cloned_out, video_out],
    )

if __name__ == "__main__":
    # share=True is ignored on HF Spaces but harmless when run locally.
    demo.launch(server_name="0.0.0.0", server_port=7860, share=True, show_error=True)