wan2-2-fp8da-aoti-81-frames

Paused

App Files Files Community

cbensimon HF Staff committed on Aug 1

Commit

1b36628

1 Parent(s): 39b7e29

Revert "Revert to 8acf492"

Browse files

This reverts commit 39b7e2924078ef6cf9a66647334f1c268698166b.

Files changed (3) hide show

README.md +3 -3
app.py +11 -17
optimization.py +1 -7

README.md CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
-title: Wan2 2 Fp8da Aoti
-emoji: 🐨
-colorFrom: gray
 colorTo: yellow
 sdk: gradio
 sdk_version: 5.38.2

 ---
+title: Wan 2.2 fp8-dynamic 81 frames
+emoji: 🎥
+colorFrom: red
 colorTo: yellow
 sdk: gradio
 sdk_version: 5.38.2

app.py CHANGED Viewed

@@ -27,9 +27,6 @@ FIXED_FPS = 24
 MIN_FRAMES_MODEL = 8
 MAX_FRAMES_MODEL = 81
-MIN_DURATION = round(MIN_FRAMES_MODEL/FIXED_FPS,1)
-MAX_DURATION = round(MAX_FRAMES_MODEL/FIXED_FPS,1)
 pipe = WanImageToVideoPipeline.from_pretrained(MODEL_ID,
     transformer=WanTransformer3DModel.from_pretrained('cbensimon/Wan2.2-I2V-A14B-bf16-Diffusers',
@@ -56,7 +53,7 @@ optimize_pipeline_(pipe,
 default_prompt_i2v = "make this image come alive, cinematic motion, smooth animation"
-default_negative_prompt = "色调艳丽, 过曝, 静态, 细节模糊不清, 字幕, 风格, 作品, 画作, 画面, 静止, 整体发灰, 最差质量, 低质量, JPEG压缩残留, 丑陋的, 残缺的, 多余的手指, 画得不好的手部, 画得不好的脸部, 畸形的, 毁容的, 形态畸形的肢体, 手指融合, 静止不动的画面, 杂乱的背景, 三条腿, 背景人很多, 倒着走"
 def resize_image(image: Image.Image) -> Image.Image:
@@ -85,7 +82,7 @@ def get_duration(
     input_image,
     prompt,
     negative_prompt,
-    duration_seconds,
     guidance_scale,
     steps,
     seed,
@@ -99,9 +96,9 @@ def generate_video(
     input_image,
     prompt,
     negative_prompt=default_negative_prompt,
-    duration_seconds = MAX_DURATION,
-    guidance_scale = 1,
-    steps = 4,
     seed = 42,
     randomize_seed = False,
     progress=gr.Progress(track_tqdm=True),
@@ -118,8 +115,8 @@ def generate_video(
         prompt (str): Text prompt describing the desired animation or motion.
         negative_prompt (str, optional): Negative prompt to avoid unwanted elements.
             Defaults to default_negative_prompt (contains unwanted visual artifacts).
-        duration_seconds (float, optional): Duration of the generated video in seconds.
-            Defaults to 2. Clamped between MIN_FRAMES_MODEL/FIXED_FPS and MAX_FRAMES_MODEL/FIXED_FPS.
         guidance_scale (float, optional): Controls adherence to the prompt. Higher values = more adherence.
             Defaults to 1.0. Range: 0.0-20.0.
         steps (int, optional): Number of inference steps. More steps = higher quality but slower.
@@ -140,15 +137,12 @@ def generate_video(
     Note:
         - The function automatically resizes the input image to the target dimensions
-        - Frame count is calculated as duration_seconds * FIXED_FPS (24)
         - Output dimensions are adjusted to be multiples of MOD_VALUE (32)
         - The function uses GPU acceleration via the @spaces.GPU decorator
-        - Generation time varies based on steps and duration (see get_duration function)
     """
     if input_image is None:
         raise gr.Error("Please upload an input image.")
-    num_frames = np.clip(int(round(duration_seconds * FIXED_FPS)), MIN_FRAMES_MODEL, MAX_FRAMES_MODEL)
     current_seed = random.randint(0, MAX_SEED) if randomize_seed else int(seed)
     resized_image = resize_image(input_image)
@@ -178,14 +172,14 @@ with gr.Blocks() as demo:
         with gr.Column():
             input_image_component = gr.Image(type="pil", label="Input Image (auto-resized to target H/W)")
             prompt_input = gr.Textbox(label="Prompt", value=default_prompt_i2v)
-            duration_seconds_input = gr.Slider(minimum=MIN_DURATION, maximum=MAX_DURATION, step=0.1, value=MAX_DURATION, label="Duration (seconds)", info=f"Clamped to model's {MIN_FRAMES_MODEL}-{MAX_FRAMES_MODEL} frames at {FIXED_FPS}fps.")
             with gr.Accordion("Advanced Settings", open=False):
                 negative_prompt_input = gr.Textbox(label="Negative Prompt", value=default_negative_prompt, lines=3)
                 seed_input = gr.Slider(label="Seed", minimum=0, maximum=MAX_SEED, step=1, value=42, interactive=True)
                 randomize_seed_checkbox = gr.Checkbox(label="Randomize seed", value=True, interactive=True)
-                steps_slider = gr.Slider(minimum=1, maximum=30, step=1, value=4, label="Inference Steps")
-                guidance_scale_input = gr.Slider(minimum=0.0, maximum=20.0, step=0.5, value=1.0, label="Guidance Scale", visible=False)
             generate_button = gr.Button("Generate Video", variant="primary")
         with gr.Column():
@@ -193,7 +187,7 @@ with gr.Blocks() as demo:
     ui_inputs = [
         input_image_component, prompt_input,
-        negative_prompt_input, duration_seconds_input,
         guidance_scale_input, steps_slider, seed_input, randomize_seed_checkbox
     ]
     generate_button.click(fn=generate_video, inputs=ui_inputs, outputs=[video_output, seed_input])

 MIN_FRAMES_MODEL = 8
 MAX_FRAMES_MODEL = 81
 pipe = WanImageToVideoPipeline.from_pretrained(MODEL_ID,
     transformer=WanTransformer3DModel.from_pretrained('cbensimon/Wan2.2-I2V-A14B-bf16-Diffusers',
 default_prompt_i2v = "make this image come alive, cinematic motion, smooth animation"
+default_negative_prompt = "色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走"
 def resize_image(image: Image.Image) -> Image.Image:
     input_image,
     prompt,
     negative_prompt,
+    num_frames,
     guidance_scale,
     steps,
     seed,
     input_image,
     prompt,
     negative_prompt=default_negative_prompt,
+    num_frames = MAX_FRAMES_MODEL,
+    guidance_scale = 3.5,
+    steps = 28,
     seed = 42,
     randomize_seed = False,
     progress=gr.Progress(track_tqdm=True),
         prompt (str): Text prompt describing the desired animation or motion.
         negative_prompt (str, optional): Negative prompt to avoid unwanted elements.
             Defaults to default_negative_prompt (contains unwanted visual artifacts).
+        num_frames (int, optional): Number of frames.
+            Defaults to MAX_FRAMES_MODEL
         guidance_scale (float, optional): Controls adherence to the prompt. Higher values = more adherence.
             Defaults to 3.5. Range: 0.0-20.0.
         steps (int, optional): Number of inference steps. More steps = higher quality but slower.
     Note:
         - The function automatically resizes the input image to the target dimensions
         - Output dimensions are adjusted to be multiples of MOD_VALUE (32)
         - The function uses GPU acceleration via the @spaces.GPU decorator
     """
     if input_image is None:
         raise gr.Error("Please upload an input image.")
     current_seed = random.randint(0, MAX_SEED) if randomize_seed else int(seed)
     resized_image = resize_image(input_image)
         with gr.Column():
             input_image_component = gr.Image(type="pil", label="Input Image (auto-resized to target H/W)")
             prompt_input = gr.Textbox(label="Prompt", value=default_prompt_i2v)
+            num_frames_input = gr.Slider(minimum=MIN_FRAMES_MODEL, maximum=MAX_FRAMES_MODEL, step=1, value=MAX_FRAMES_MODEL, label="Frames")
             with gr.Accordion("Advanced Settings", open=False):
                 negative_prompt_input = gr.Textbox(label="Negative Prompt", value=default_negative_prompt, lines=3)
                 seed_input = gr.Slider(label="Seed", minimum=0, maximum=MAX_SEED, step=1, value=42, interactive=True)
                 randomize_seed_checkbox = gr.Checkbox(label="Randomize seed", value=True, interactive=True)
+                steps_slider = gr.Slider(minimum=1, maximum=40, step=1, value=28, label="Inference Steps")
+                guidance_scale_input = gr.Slider(minimum=0.0, maximum=20.0, step=0.5, value=1.0, label="Guidance Scale")
             generate_button = gr.Button("Generate Video", variant="primary")
         with gr.Column():
     ui_inputs = [
         input_image_component, prompt_input,
+        negative_prompt_input, num_frames_input,
         guidance_scale_input, steps_slider, seed_input, randomize_seed_checkbox
     ]
     generate_button.click(fn=generate_video, inputs=ui_inputs, outputs=[video_output, seed_input])

optimization.py CHANGED Viewed

@@ -20,13 +20,7 @@ from optimization_utils import ZeroGPUCompiledModel
 P = ParamSpec('P')
-TRANSFORMER_NUM_FRAMES_DIM = torch.export.Dim('num_frames', min=3, max=21)
-TRANSFORMER_DYNAMIC_SHAPES = {
-    'hidden_states': {
-        2: TRANSFORMER_NUM_FRAMES_DIM,
-    },
-}
 INDUCTOR_CONFIGS = {
     'conv_1x1_as_mm': True,

 P = ParamSpec('P')
+TRANSFORMER_DYNAMIC_SHAPES = {}
 INDUCTOR_CONFIGS = {
     'conv_1x1_as_mm': True,