|
|
import os |
|
|
import spaces |
|
|
import gradio as gr |
|
|
import torch |
|
|
import tempfile |
|
|
import requests |
|
|
import json |
|
|
import soundfile as sf |
|
|
import torchaudio |
|
|
|
|
|
|
|
|
# Select the compute device: prefer CUDA GPU when available, else CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"🚀 Using device: {device}")

# Hugging Face Inference API credentials, read from the environment
# (configured as a Space secret). Every API call below reuses HEADERS.
# NOTE(review): the original source had this status string split across two
# physical lines (a syntax error from a bad paste) — rejoined on one line.
HF_TOKEN = os.getenv("HF_TOKEN")
HEADERS = {"Authorization": f"Bearer {HF_TOKEN}"}
print(f"🔑 HF_TOKEN configured: {'✅ Yes' if HF_TOKEN else '❌ No'}")
|
|
|
|
|
def generate_base_speech(text):
    """Generate speech from *text* via the HF Inference API TTS endpoint.

    Args:
        text: The script text to synthesize.

    Returns:
        Path to a temporary WAV file containing the synthesized speech,
        or None when the token is missing or the API call fails.
    """
    if not HF_TOKEN:
        print("❌ HF_TOKEN not configured!")
        return None

    # BUG FIX: the original endpoint used "tts_models/en/ljspeech/vits",
    # which is a Coqui-TTS identifier, not a valid Hugging Face repo id,
    # so the request could never succeed. Use a real hosted TTS model.
    endpoint = "https://api-inference.huggingface.co/models/facebook/mms-tts-eng"
    try:
        response = requests.post(endpoint, headers=HEADERS, json={"inputs": text}, timeout=60)
    except requests.RequestException as exc:
        # Network errors previously propagated and crashed the pipeline.
        print(f"❌ TTS request failed: {exc}")
        return None

    # Only log the body on failure — on success it is raw audio bytes.
    body_info = "" if response.status_code == 200 else f", Body: {response.text[:500]}"
    print(f"🎵 TTS API Response: {response.status_code}{body_info}")

    if response.status_code == 200:
        out = tempfile.NamedTemporaryFile(delete=False, suffix=".wav").name
        with open(out, "wb") as f:
            f.write(response.content)
        return out
    return None
|
|
|
|
|
@spaces.GPU(duration=120)
def voice_cloning_pipeline(text, reference_audio_path):
    """Clone the voice from a reference clip onto TTS speech for *text*.

    Pipeline: synthesize base speech for *text*, trim/resample the reference
    audio, then call the SpeechT5 voice-conversion endpoint.

    Args:
        text: Script text to speak.
        reference_audio_path: Path to a WAV/audio file with the target voice.

    Returns:
        Path to the cloned-voice WAV on success, the uncloned base-speech
        WAV when conversion fails, or None when TTS itself fails.
    """
    if not HF_TOKEN:
        return None

    print("🎤 Generating base speech…")
    base_speech = generate_base_speech(text)
    if not base_speech:
        print("❌ Failed to generate base speech")
        return None

    # Load the reference clip and keep at most the first 10 seconds.
    wav, sr = torchaudio.load(reference_audio_path)
    wav = wav[:, : min(sr * 10, wav.shape[1])]

    # BUG FIX: the original called torchaudio.save(..., sr=16000) — `sr` is
    # not a valid keyword (it is `sample_rate`), and even with the right name
    # it would only mislabel the file without resampling. Resample properly.
    if sr != 16000:
        wav = torchaudio.functional.resample(wav, sr, 16000)
    tmp_ref = tempfile.NamedTemporaryFile(delete=False, suffix=".wav").name
    torchaudio.save(tmp_ref, wav, 16000)

    print("🔄 Converting for API and calling VC endpoint…")
    vc_endpoint = "https://api-inference.huggingface.co/models/microsoft/speecht5_vc"
    try:
        with open(base_speech, "rb") as bf, open(tmp_ref, "rb") as rf:
            files = {"source_audio": bf, "target_audio": rf}
            response = requests.post(vc_endpoint, headers=HEADERS, files=files, timeout=60)
    except requests.RequestException as exc:
        print(f"❌ Voice conversion request failed: {exc}")
        return base_speech

    # Only log the body on failure — on success it is raw audio bytes.
    body_info = "" if response.status_code == 200 else f", Body: {response.text[:500]}"
    print(f"🔁 Voice Conversion API Response: {response.status_code}{body_info}")

    if response.status_code == 200:
        out = tempfile.NamedTemporaryFile(delete=False, suffix=".wav").name
        with open(out, "wb") as f:
            f.write(response.content)
        return out
    # Graceful degradation: return the uncloned speech rather than nothing.
    return base_speech
|
|
|
|
|
@spaces.GPU(duration=180)
def avatar_video_pipeline(audio_path, reference_video_path):
    """Generate a talking-avatar video by lip-syncing *audio_path* onto
    *reference_video_path* via the SadTalker inference endpoint.

    Args:
        audio_path: Path to the driving audio (e.g. cloned voice WAV).
        reference_video_path: Path to the reference face video.

    Returns:
        Path to the generated MP4 on success; otherwise the original
        reference video path as a graceful fallback.
    """
    if not HF_TOKEN:
        return reference_video_path

    endpoint = "https://api-inference.huggingface.co/models/vinthony/SadTalker"
    try:
        with open(audio_path, "rb") as af, open(reference_video_path, "rb") as vf:
            files = {"audio": af, "video": vf}
            response = requests.post(endpoint, headers=HEADERS, files=files, timeout=180)
    except requests.RequestException as exc:
        # Network errors previously propagated and crashed the whole pipeline.
        print(f"❌ Avatar request failed: {exc}")
        return reference_video_path

    # Only log the body on failure — on success it is raw video bytes.
    body_info = "" if response.status_code == 200 else f", Body: {response.text[:500]}"
    print(f"🎬 Avatar API Response: {response.status_code}{body_info}")

    if response.status_code == 200:
        out = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4").name
        with open(out, "wb") as f:
            f.write(response.content)
        return out
    return reference_video_path
|
|
|
|
|
@spaces.GPU(duration=300)
def full_ai_avatar_pipeline(reference_audio, text_script, reference_video):
    """Run the complete AI Avatar pipeline: voice cloning then avatar video.

    Args:
        reference_audio: Filepath of the user's reference voice sample.
        text_script: Text to be spoken by the avatar.
        reference_video: Filepath of the user's reference face video.

    Returns:
        Tuple of (status_message, cloned_audio_path_or_None,
        avatar_video_path_or_None) matching the three Gradio outputs.
    """
    if not HF_TOKEN:
        return ("❌ CONFIGURATION ERROR!\n🔑 HF_TOKEN missing in secrets.", None, None)

    cloned = voice_cloning_pipeline(text_script, reference_audio)
    if not cloned:
        return ("❌ VOICE CLONING FAILED!\nCheck logs for details.", None, None)

    video = avatar_video_pipeline(cloned, reference_video)
    # NOTE(review): the original success string was split across two physical
    # lines (a syntax error from a bad paste) — rejoined on one line.
    return ("✅ AI AVATAR GENERATION COMPLETE!", cloned, video)
|
|
|
|
|
# Build the Gradio UI: left column collects inputs, right column shows results.
with gr.Blocks(theme=gr.themes.Soft(), title="AI Avatar SaaS - Professional Prototype") as demo:
    gr.HTML("<h1>🚀 AI Avatar SaaS - Professional Prototype</h1>")
    with gr.Row():
        with gr.Column():
            reference_audio = gr.Audio(label="🎤 Reference Audio", type="filepath")
            text_input = gr.Textbox(label="📝 Script Text", lines=5, placeholder="Enter script…")
            # BUG FIX: gr.Video does not accept a `type` kwarg (only gr.Audio
            # does) — it always yields a filepath, so the kwarg is dropped.
            reference_video = gr.Video(label="🎬 Reference Video")
            btn = gr.Button("🚀 Generate AI Avatar")
        with gr.Column():
            status = gr.Textbox(label="📋 Status", lines=5, interactive=False)
            cloned_out = gr.Audio(label="🔊 Cloned Voice", type="filepath")
            video_out = gr.Video(label="🎥 Final Avatar Video")
    btn.click(full_ai_avatar_pipeline,
              inputs=[reference_audio, text_input, reference_video],
              outputs=[status, cloned_out, video_out])
|
|
|
|
|
if __name__ == "__main__":
    # Bind to all interfaces on the standard Spaces port; surface errors
    # in the UI and request a public share link.
    launch_options = dict(
        server_name="0.0.0.0",
        server_port=7860,
        share=True,
        show_error=True,
    )
    demo.launch(**launch_options)
|
|
|
|
|
|