C4G-HKUST committed on
Commit 37e0f4b · 1 Parent(s): 7b1f748

Add two generation modes: Fast (240s, 15 steps) and Quality (720s, custom steps)

Files changed (2)
  1. README.md +16 -1
  2. app.py +52 -7
README.md CHANGED
@@ -207,7 +207,22 @@ python app.py
 ```
 <p align="center">
 <img src="assets/gradio.png"><br>
-</p>
+</p>
+
+#### Generation Modes
+The Gradio demo provides two generation modes:
+
+- **Fast Mode (240s GPU duration)**:
+  - Fixed 15 denoising steps for quick generation
+  - Suitable for single-person videos or quick previews
+  - Lower GPU usage quota consumption
+
+- **Quality Mode (720s GPU duration)**:
+  - Custom denoising steps (adjustable via the "Diffusion steps" slider)
+  - Recommended for multi-person videos that require higher quality
+  - Longer generation time but better-quality output
+
+**Design Rationale**: Multi-person videos are generally longer and require more computational resources. To achieve better quality, especially for complex multi-person interactions, more denoising steps and a longer GPU allocation are needed. Quality Mode provides a sufficient usage quota (720 seconds) to accommodate these requirements, while Fast Mode offers a quick preview with a fixed 15 steps for faster iteration.
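
In app.py (diffed below), the two modes map onto two `@spaces.GPU` wrappers with different GPU durations that share a single worker. The sketch below condenses that pattern; the names mirror app.py but the bodies are simplified placeholders, not the full implementation.

```python
# Condensed sketch of the pattern added in app.py (full diff below).
# `spaces.GPU` is the Hugging Face ZeroGPU decorator; `duration` is the
# amount of GPU time (in seconds) reserved per call.
import spaces


def generate_video(sd_steps, fixed_steps=None):
    # Shared worker: an explicit fixed_steps overrides the slider value.
    actual_steps = fixed_steps if fixed_steps is not None else sd_steps
    # ...app.py calls wan_a2v.generate(..., sampling_steps=actual_steps) here
    return actual_steps


@spaces.GPU(duration=240)  # Fast mode: short GPU reservation
def gpu_wrapped_generate_video_fast(*args, **kwargs):
    kwargs["fixed_steps"] = 15  # always 15 denoising steps
    return generate_video(*args, **kwargs)


@spaces.GPU(duration=720)  # Quality mode: long GPU reservation
def gpu_wrapped_generate_video_quality(*args, **kwargs):
    return generate_video(*args, **kwargs)  # keep the slider's step count
```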
app.py CHANGED
@@ -436,7 +436,7 @@ def run_graio_demo(args):
     logging.info("Model and face processor loaded successfully.")
 
     def generate_video(img2vid_image, img2vid_prompt, n_prompt, img2vid_audio_1, img2vid_audio_2, img2vid_audio_3,
-                       sd_steps, seed, guide_scale, person_num_selector, audio_mode_selector):
+                       sd_steps, seed, guide_scale, person_num_selector, audio_mode_selector, fixed_steps=None):
         # Following LivePortrait: use the cuda device directly in the worker process
         # Reference: https://huggingface.co/spaces/KlingTeam/LivePortrait/blob/main/src/gradio_pipeline.py
         # The @spaces.GPU decorator has already initialized the GPU, so it can be used directly here
@@ -493,6 +493,13 @@ def run_graio_demo(args):
         # Read the image
         img = Image.open(input_data["cond_image"]).convert("RGB")
 
+        # If fixed_steps is provided, use it; otherwise use the user-selected sd_steps
+        actual_steps = fixed_steps if fixed_steps is not None else sd_steps
+        if fixed_steps is not None:
+            logging.info(f"Using fixed denoising steps: {fixed_steps}")
+        else:
+            logging.info(f"Using user-selected denoising steps: {sd_steps}")
+
         # Generate the video
         video = wan_a2v.generate(
             input_data["prompt"],
@@ -502,7 +509,7 @@
             frame_num=current_frame_num,
             shift=args.sample_shift,
             sample_solver=args.sample_solver,
-            sampling_steps=sd_steps,
+            sampling_steps=actual_steps,
             guide_scale=guide_scale,
             seed=seed if seed >= 0 else args.base_seed,
             offload_model=args.offload_model,
@@ -598,8 +605,21 @@ def run_graio_demo(args):
     # Wrap the generate_video function with the @spaces.GPU decorator (following LivePortrait)
     # Reference: https://huggingface.co/spaces/KlingTeam/LivePortrait/blob/main/app.py
    # The @spaces.GPU decorator handles GPU initialization automatically; no manual initialization is needed
-    @spaces.GPU(duration=120)
-    def gpu_wrapped_generate_video(*args, **kwargs):
+
+    # Fast generation mode: 240 seconds, fixed 15 denoising steps
+    @spaces.GPU(duration=240)
+    def gpu_wrapped_generate_video_fast(*args, **kwargs):
+        # Always use 15 denoising steps, passed as a keyword argument
+        kwargs['fixed_steps'] = 15
+        return gpu_wrapped_generate_video_worker(*args, **kwargs)
+
+    # Quality generation mode: 720 seconds, user-selected denoising steps
+    @spaces.GPU(duration=720)
+    def gpu_wrapped_generate_video_quality(*args, **kwargs):
+        return gpu_wrapped_generate_video_worker(*args, **kwargs)
+
+    # Shared worker function that handles moving the model to the GPU
+    def gpu_wrapped_generate_video_worker(*args, **kwargs):
         # Move the model to the GPU in the worker process (if it is still on the CPU)
         # Following LivePortrait: call .to("cuda") directly in the worker process
         if torch.cuda.is_available() and device == -1:
@@ -736,7 +756,24 @@ def run_graio_demo(args):
                         value="bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards"
                     )
 
-                    run_i2v_button = gr.Button("Generate Video")
+                    with gr.Row():
+                        run_i2v_button_fast = gr.Button(
+                            "Generate Video (Fast - 240s, 15 steps)",
+                            variant="secondary",
+                            scale=1
+                        )
+                        run_i2v_button_quality = gr.Button(
+                            "Generate Video (Quality - 720s, Custom steps)",
+                            variant="primary",
+                            scale=1
+                        )
+                    gr.Markdown("""
+                    **Generation Modes:**
+                    - **Fast Mode (240s)**: Fixed 15 denoising steps for quick generation. Suitable for single-person videos or quick previews.
+                    - **Quality Mode (720s)**: Custom denoising steps (adjustable via the "Diffusion steps" slider). Recommended for multi-person videos that require higher quality and longer generation time.
+
+                    *Note: Multi-person videos generally need a longer duration and more usage quota for better quality.*
+                    """)
 
                 with gr.Column(scale=2):
                     result_gallery = gr.Video(
@@ -770,8 +807,16 @@ def run_graio_demo(args):
                     )
 
 
-        run_i2v_button.click(
-            fn=gpu_wrapped_generate_video,  # use the GPU-wrapped function
+        # Fast generation button: 240 seconds, fixed 15 steps
+        run_i2v_button_fast.click(
+            fn=gpu_wrapped_generate_video_fast,
+            inputs=[img2vid_image, img2vid_prompt, n_prompt, img2vid_audio_1, img2vid_audio_2, img2vid_audio_3, sd_steps, seed, guide_scale, person_num_selector, audio_mode_selector],
+            outputs=[result_gallery],
+        )
+
+        # Quality generation button: 720 seconds, user-selected steps
+        run_i2v_button_quality.click(
+            fn=gpu_wrapped_generate_video_quality,
             inputs=[img2vid_image, img2vid_prompt, n_prompt, img2vid_audio_1, img2vid_audio_2, img2vid_audio_3, sd_steps, seed, guide_scale, person_num_selector, audio_mode_selector],
             outputs=[result_gallery],
         )
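
For reference, a stripped-down, hypothetical toy version of the two-button wiring above (no model, no `@spaces.GPU`, and a text output in place of a video) behaves like this; the real app passes many more inputs to each click handler.

```python
# Hypothetical minimal demo mirroring the two-button wiring above; the real
# app passes additional inputs and returns a generated video instead of text.
import gradio as gr


def generate(sd_steps, fixed_steps=None):
    # A fixed step count, when given, overrides the slider value.
    steps = fixed_steps if fixed_steps is not None else sd_steps
    return f"Would denoise for {steps} steps"


def generate_fast(sd_steps):
    # Fast mode ignores the slider and always uses 15 steps.
    return generate(sd_steps, fixed_steps=15)


with gr.Blocks() as demo:
    sd_steps = gr.Slider(1, 50, value=25, step=1, label="Diffusion steps")
    with gr.Row():
        btn_fast = gr.Button("Generate Video (Fast - 240s, 15 steps)", variant="secondary")
        btn_quality = gr.Button("Generate Video (Quality - 720s, Custom steps)", variant="primary")
    out = gr.Textbox(label="Result")
    btn_fast.click(fn=generate_fast, inputs=[sd_steps], outputs=[out])
    btn_quality.click(fn=generate, inputs=[sd_steps], outputs=[out])

if __name__ == "__main__":
    demo.launch()
```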