# LTX-Video - Geração de Vídeo Multi-Scale (FP8)

import torch
import gradio as gr
import torch
import numpy as np
import random
import os
import yaml
from typing import Optional
from pathlib import Path
import imageio
import tempfile
from PIL import Image
from huggingface_hub import snapshot_download
from huggingface_hub import hf_hub_download
import shutil
import sys
from diffusers import LTXImageToVideoPipeline
from diffusers.utils import export_to_video, load_image

# --- MODEL AND SAMPLE INPUTS ---
# Load the LTX image-to-video pipeline in bfloat16 and move it to the GPU.
_MODEL_REPO = "Lightricks/LTX-Video"
pipe = LTXImageToVideoPipeline.from_pretrained(_MODEL_REPO, torch_dtype=torch.bfloat16)
pipe.to("cuda")

# Sample conditioning image, fetched once at import time; referenced by generate().
_SAMPLE_IMAGE_URL = (
    "https://huggingface.co/datasets/a-r-r-o-w/tiny-meme-dataset-captioned"
    "/resolve/main/images/8.png"
)
image = load_image(_SAMPLE_IMAGE_URL)

# Sample prompt. generate() takes its own `prompt` argument, which shadows this
# global inside that function.
prompt = (
    "A young girl stands calmly in the foreground, looking directly at the camera, "
    "as a house fire rages in the background. "
    "Flames engulf the structure, with smoke billowing into the air. "
    "Firefighters in protective gear rush to the scene, "
    "a fire truck labeled '38' visible behind them. "
    "The girl's neutral expression contrasts sharply with the chaos of the fire, "
    "creating a poignant and emotionally charged scene."
)

# Negative prompt read by generate() for every request.
negative_prompt = ", ".join(
    ("worst quality", "inconsistent motion", "blurry", "jittery", "distorted")
)


# --- MAIN GENERATION FUNCTION ---
def generate(
    prompt: str, image_input: Optional[str],
    target_height: int, target_width: int, num_frames: int, seed: int,
    progress=gr.Progress(track_tqdm=True)
):
    """Generate a video with the module-level LTX pipeline and export it to MP4.

    Args:
        prompt: Text description of the video to generate.
        image_input: File path of the conditioning image. When empty or None,
            the module-level sample ``image`` is used instead.
        target_height: Output height in pixels, forwarded to the pipeline.
        target_width: Output width in pixels, forwarded to the pipeline.
        num_frames: Number of frames to generate, forwarded to the pipeline.
        seed: Seed applied to the Python, NumPy and torch RNGs and to the
            generator handed to the pipeline. ``None`` picks a random seed.
        progress: Gradio progress tracker; unused directly, it is declared so
            Gradio injects it (created with ``track_tqdm=True``).

    Returns:
        Path of the exported ``.mp4`` file as a string.
    """
    # A cleared Number field arrives as None; fall back to a random seed.
    if seed is None:
        seed = random.randrange(2**32)
    # np.random.seed only accepts values in [0, 2**32 - 1], so fold the seed
    # into that range and use the same value everywhere.
    seed = int(seed) % (2**32)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    generator = torch.Generator().manual_seed(seed)

    # The pipeline always needs a conditioning image; the UI marks the upload
    # as optional, so fall back to the sample image loaded at import time.
    conditioning_image = load_image(image_input) if image_input else image

    # Gradio sliders can deliver floats; the pipeline expects integer sizes.
    video_frames = pipe(
        image=conditioning_image,
        prompt=prompt,
        negative_prompt=negative_prompt,
        width=int(target_width),
        height=int(target_height),
        num_frames=int(num_frames),
        num_inference_steps=50,
        generator=generator,
    ).frames[0]

    # Reserve a unique file name that outlives this call so Gradio can serve it.
    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp_file:
        output_video_path = tmp_file.name

    export_to_video(video_frames, output_video_path, fps=24)

    return output_video_path

# --- GRADIO UI ---
# Two-column layout: inputs and parameters on the left, generated video on the right.
with gr.Blocks(theme=gr.themes.Soft(), title="LTX-Video (Final)") as demo:
    gr.HTML("<h1>LTX-Video - Geração de Vídeo Multi-Scale (FP8)</h1>")
    with gr.Row():
        # Left column: conditioning image, prompt, parameters and the run button.
        with gr.Column(scale=1):
            image_in = gr.Image(
                label="Imagem de Entrada (Opcional)",
                type="filepath",
            )
            prompt_in = gr.Textbox(
                label="Prompt",
                placeholder="Ex: a cinematic shot...",
                lines=4,
            )
            with gr.Accordion("Parâmetros", open=True):
                height_in = gr.Slider(
                    label="Altura", value=480, minimum=256, maximum=1024, step=32,
                )
                width_in = gr.Slider(
                    label="Largura", value=832, minimum=256, maximum=1280, step=32,
                )
                frames_in = gr.Slider(
                    label="Frames", value=97, minimum=17, maximum=161, step=8,
                )
                seed_in = gr.Number(label="Seed", precision=0, value=42)
            run_button = gr.Button("Gerar Vídeo", variant="primary")
        # Right column: the resulting video.
        with gr.Column(scale=1):
            video_out = gr.Video(label="Vídeo Gerado")

    # Component order must match the positional parameters of generate().
    generate_inputs = [prompt_in, image_in, height_in, width_in, frames_in, seed_in]
    run_button.click(fn=generate, inputs=generate_inputs, outputs=[video_out])

if __name__ == "__main__":
    # Enable request queuing first, then resolve host/port from the environment
    # (GRADIO_SERVER_NAME / GRADIO_SERVER_PORT) with the defaults below.
    queued_app = demo.queue()
    launch_options = {
        "server_name": os.getenv("GRADIO_SERVER_NAME", "0.0.0.0"),
        "server_port": int(os.getenv("GRADIO_SERVER_PORT", "7861")),
        "show_error": True,
    }
    queued_app.launch(**launch_options)