# Author: SanjayKumartheBuilder
# Commit: "Update app.py" (1ddb6fe, verified)
import os
import spaces
import gradio as gr
import torch
import tempfile
import requests
import json
import soundfile as sf
import torchaudio
# GPU device setup: prefer CUDA when PyTorch can see a GPU, otherwise use CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"🚀 Using device: {device}")

# HF token setup: the token is read once from the environment at import time.
HF_TOKEN = os.getenv("HF_TOKEN")
# Only build an Authorization header when a token exists, so a missing token
# can never be sent to the API as the literal string "Bearer None".
HEADERS = {"Authorization": f"Bearer {HF_TOKEN}"} if HF_TOKEN else {}
print(f"🔑 HF_TOKEN configured: {'✅ Yes' if HF_TOKEN else '❌ No'}")
def generate_base_speech(text):
    """Generate base speech for ``text`` via the hosted TTS inference endpoint.

    Args:
        text: Script to synthesize.

    Returns:
        Path to a temporary ``.wav`` file containing the bytes returned by the
        endpoint, or ``None`` when the token is missing, ``text`` is blank, the
        request itself fails, or the endpoint answers with a non-200 status.
        The file is created with ``delete=False``; the caller owns it.
    """
    if not HF_TOKEN:
        print("❌ HF_TOKEN not configured!")
        return None
    if not text or not text.strip():
        print("❌ No text supplied for speech generation")
        return None

    endpoint = "https://api-inference.huggingface.co/models/tts_models/en/ljspeech/vits"
    try:
        response = requests.post(endpoint, headers=HEADERS, json={"inputs": text}, timeout=60)
    except requests.RequestException as exc:
        # Timeouts and connection errors are reported like any other failure
        # (return None) instead of propagating into the UI handler.
        print(f"❌ TTS request failed: {exc}")
        return None

    print(f"📊 TTS API Response: {response.status_code}")
    if response.status_code != 200:
        # Log the body only on errors, and truncated: a 200 body is saved as
        # audio below, so decoding it as text would just flood the log.
        print(f"❌ TTS API error body: {response.text[:500]}")
        return None

    # Write through the temp-file handle so it is closed deterministically.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as out_file:
        out_file.write(response.content)
    return out_file.name
@spaces.GPU(duration=120)
def voice_cloning_pipeline(text, reference_audio_path):
    """Synthesize ``text`` and convert it towards the reference speaker's voice.

    The text is first turned into base speech with ``generate_base_speech``.
    The reference recording is then trimmed to its first 10 seconds, resampled
    to 16 kHz, and posted together with the base speech to the voice-conversion
    endpoint.

    Args:
        text: Script to speak.
        reference_audio_path: Path to an audio file of the target voice.

    Returns:
        Path to the converted audio on success; the base-speech path when the
        reference audio is missing or unreadable, the request fails, or the
        endpoint returns a non-200 status; ``None`` when the token is missing
        or base speech could not be generated.
    """
    if not HF_TOKEN:
        return None

    print("🎀 Generating base speech…")
    base_speech = generate_base_speech(text)
    if not base_speech:
        print("❌ Failed to generate base speech")
        return None
    if not reference_audio_path:
        print("❌ No reference audio supplied; returning base speech")
        return base_speech

    target_sr = 16000
    max_seconds = 10
    vc_endpoint = "https://api-inference.huggingface.co/models/microsoft/speecht5_vc"
    tmp_ref = None
    try:
        # Trim and convert reference audio
        wav, sr = torchaudio.load(reference_audio_path)
        wav = wav[:, : sr * max_seconds]  # slicing clamps to the clip length
        if sr != target_sr:
            # Resample for real: writing the samples with a 16 kHz header
            # alone would change the pitch and speed of the reference voice.
            wav = torchaudio.functional.resample(wav, sr, target_sr)
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as ref_file:
            tmp_ref = ref_file.name
        # The sample rate is the third positional parameter of
        # torchaudio.save; there is no `sr=` keyword.
        torchaudio.save(tmp_ref, wav, target_sr)

        print("🔄 Converting for API and calling VC endpoint…")
        with open(base_speech, "rb") as bf, open(tmp_ref, "rb") as rf:
            files = {"source_audio": bf, "target_audio": rf}
            # Post while both files are still open so they can be streamed.
            response = requests.post(vc_endpoint, headers=HEADERS, files=files, timeout=60)
    except (OSError, RuntimeError, ValueError, requests.RequestException) as exc:
        # Keep the existing best-effort contract: fall back to base speech.
        print(f"❌ Voice conversion failed: {exc}")
        return base_speech
    finally:
        # The resampled reference clip is only needed for the upload.
        if tmp_ref and os.path.exists(tmp_ref):
            os.remove(tmp_ref)

    print(f"📊 Voice Conversion API Response: {response.status_code}")
    if response.status_code != 200:
        # Truncated error body only; a 200 body is binary audio.
        print(f"❌ Voice Conversion API error body: {response.text[:500]}")
        return base_speech

    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as out_file:
        out_file.write(response.content)
    return out_file.name
@spaces.GPU(duration=180)
def avatar_video_pipeline(audio_path, reference_video_path):
    """Request a talking-avatar video from the hosted avatar endpoint.

    Args:
        audio_path: Path to the speech audio that should drive the avatar.
        reference_video_path: Path to the reference video of the subject.

    Returns:
        Path to a temporary ``.mp4`` file with the endpoint's response on
        success. In every failure case (missing token, missing input, file or
        network error, non-200 status) ``reference_video_path`` is returned
        unchanged, so callers can detect the fallback by comparing paths.
    """
    if not HF_TOKEN:
        return reference_video_path
    if not audio_path or not reference_video_path:
        print("❌ Avatar generation needs both an audio file and a reference video")
        return reference_video_path

    endpoint = "https://api-inference.huggingface.co/models/vinthony/SadTalker"
    try:
        with open(audio_path, "rb") as af, open(reference_video_path, "rb") as vf:
            files = {"audio": af, "video": vf}
            # Post while both files are still open so they can be streamed.
            response = requests.post(endpoint, headers=HEADERS, files=files, timeout=180)
    except (OSError, requests.RequestException) as exc:
        # Unreadable inputs and network errors use the same fallback as an
        # API error instead of crashing the UI handler.
        print(f"❌ Avatar request failed: {exc}")
        return reference_video_path

    print(f"📊 Avatar API Response: {response.status_code}")
    if response.status_code != 200:
        # Truncated error body only; a 200 body is binary video.
        print(f"❌ Avatar API error body: {response.text[:500]}")
        return reference_video_path

    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as out_file:
        out_file.write(response.content)
    return out_file.name
@spaces.GPU(duration=300)
def full_ai_avatar_pipeline(reference_audio, text_script, reference_video):
    """Run voice cloning followed by avatar video generation.

    Args:
        reference_audio: Path to the reference voice recording.
        text_script: Script the avatar should speak.
        reference_video: Path to the reference video of the subject.

    Returns:
        A ``(status_message, cloned_audio_path, video_path)`` tuple. The two
        paths are ``None`` when the run stops before producing them.
    """
    if not HF_TOKEN:
        return ("❌ CONFIGURATION ERROR!\n🔑 HF_TOKEN missing in secrets.", None, None)

    # Reject incomplete submissions before spending API calls on them.
    required = (
        ("reference audio", reference_audio),
        ("script text", text_script and text_script.strip()),
        ("reference video", reference_video),
    )
    missing = [label for label, value in required if not value]
    if missing:
        return (f"❌ MISSING INPUT!\nPlease provide: {', '.join(missing)}.", None, None)

    cloned = voice_cloning_pipeline(text_script, reference_audio)
    if not cloned:
        return ("❌ VOICE CLONING FAILED!\nCheck logs for details.", None, None)

    video = avatar_video_pipeline(cloned, reference_video)
    if video == reference_video:
        # avatar_video_pipeline hands back its input video when generation
        # fails, so an unchanged path means no avatar was produced.
        return (
            "⚠️ AVATAR VIDEO GENERATION FAILED!\n"
            "Showing the original reference video. Check logs for details.",
            cloned,
            video,
        )
    return ("✅ AI AVATAR GENERATION COMPLETE!", cloned, video)
# Gradio UI: inputs in the left column, results in the right column.
with gr.Blocks(theme=gr.themes.Soft(), title="AI Avatar SaaS - Professional Prototype") as demo:
    gr.HTML("<h1>🎭 AI Avatar SaaS - Professional Prototype</h1>")
    with gr.Row():
        with gr.Column():
            reference_audio = gr.Audio(label="🎀 Reference Audio", type="filepath")
            text_input = gr.Textbox(label="📝 Script Text", lines=5, placeholder="Enter script…")
            # gr.Video defines no `type` argument (unlike gr.Audio); per the
            # Gradio docs it passes the handler a file path already.
            reference_video = gr.Video(label="🎬 Reference Video")
            btn = gr.Button("🚀 Generate AI Avatar")
        with gr.Column():
            status = gr.Textbox(label="🔄 Status", lines=5, interactive=False)
            cloned_out = gr.Audio(label="🔊 Cloned Voice", type="filepath")
            video_out = gr.Video(label="🎭 Final Avatar Video")
    # The handler returns (status, cloned audio path, video path), matching
    # the order of `outputs`.
    btn.click(
        full_ai_avatar_pipeline,
        inputs=[reference_audio, text_input, reference_video],
        outputs=[status, cloned_out, video_out],
    )

if __name__ == "__main__":
    # Bind to all interfaces on port 7860 and surface handler errors in the UI.
    demo.launch(server_name="0.0.0.0", server_port=7860, share=True, show_error=True)