danhtran2mind committed on
Commit e8d5a56 · verified · 1 Parent(s): 9827a61

Upload 43 files

Files changed (39)
  1. .gitattributes +7 -0
  2. .python-version +1 -0
  3. LICENSE +21 -0
  4. apps/assets/examples/zeroscope_v2_576w-Ghibli-LoRA/1/A_dog_is_running_with_Ghibli_style_42.mp4 +3 -0
  5. apps/assets/examples/zeroscope_v2_576w-Ghibli-LoRA/1/config.json +15 -0
  6. apps/assets/examples/zeroscope_v2_576w-Ghibli-LoRA/2/A_girl_is_walking_with_Ghibli_style_0.mp4 +0 -0
  7. apps/assets/examples/zeroscope_v2_576w-Ghibli-LoRA/2/config.json +15 -0
  8. apps/assets/examples/zeroscope_v2_576w-Ghibli-LoRA/3/Studio_Ghibli_style_Young_man_contemplates_walks_away_from_ivy-covered_yellow_building_12345.mp4 +3 -0
  9. apps/assets/examples/zeroscope_v2_576w-Ghibli-LoRA/3/config.json +15 -0
  10. apps/assets/examples/zeroscope_v2_576w-Ghibli-LoRA/4/Studio_Ghibli_style_Two_women_walk_down_coastal_village_path_toward_sea_passing_colorful_houses_sailboats_visible_100.mp4 +3 -0
  11. apps/assets/examples/zeroscope_v2_576w-Ghibli-LoRA/4/config.json +15 -0
  12. apps/gradio_app.py +136 -0
  13. apps/gradio_app/__init__.py +0 -0
  14. apps/gradio_app/abc.py +0 -0
  15. apps/gradio_app/inference.py +104 -0
  16. apps/gradio_app/new-inference.py +104 -0
  17. apps/gradio_app/old-inference.py +73 -0
  18. apps/gradio_app/setup_scripts.py +46 -0
  19. apps/gradio_app/static/__init__.py +0 -0
  20. apps/gradio_app/static/scripts.js +50 -0
  21. apps/gradio_app/static/styles.css +154 -0
  22. assets/examples/zeroscope_v2_576w-Ghibli-LoRA/1/A_dog_is_running_with_Ghibli_style_42.gif +3 -0
  23. assets/examples/zeroscope_v2_576w-Ghibli-LoRA/2/A_girl_is_walking_with_Ghibli_style_0.gif +3 -0
  24. assets/examples/zeroscope_v2_576w-Ghibli-LoRA/2/config.json +1 -1
  25. assets/examples/zeroscope_v2_576w-Ghibli-LoRA/3/Studio_Ghibli_style_Young_man_contemplates_walks_away_from_ivy-covered_yellow_building_12345.gif +3 -0
  26. assets/examples/zeroscope_v2_576w-Ghibli-LoRA/3/config.json +1 -1
  27. assets/examples/zeroscope_v2_576w-Ghibli-LoRA/4/Studio_Ghibli_style_Two_women_walk_down_coastal_village_path_toward_sea_passing_colorful_houses_sailboats_visible_100.gif +3 -0
  28. assets/examples/zeroscope_v2_576w-Ghibli-LoRA/4/config.json +1 -1
  29. configs/config_multi_videos.yaml +131 -0
  30. notebooks/zeroscope_v2_576w_Ghibli_LoRA-Inference.ipynb +0 -0
  31. notebooks/zeroscope_v2_576w_Ghibli_LoRA-Training.ipynb +802 -0
  32. requirements/requirements.txt +26 -0
  33. requirements/requirements_compatible.txt +23 -0
  34. scripts/download_ckpts.py +96 -0
  35. scripts/process_dataset.py +48 -0
  36. scripts/setup_third_party.py +38 -0
  37. src/text2video_ghibli_style/inference.py +96 -0
  38. src/text2video_ghibli_style/train.py +73 -0
  39. src/third_party/.gitkeep +0 -0
.gitattributes CHANGED
@@ -39,3 +39,10 @@ assets/zeroscope_v2_576w-Ghibli-LoRA/examples/4/Studio_Ghibli_style_Two_women_wa
  assets/examples/zeroscope_v2_576w-Ghibli-LoRA/1/A_dog_is_running_with_Ghibli_style_42.mp4 filter=lfs diff=lfs merge=lfs -text
  assets/examples/zeroscope_v2_576w-Ghibli-LoRA/3/Studio_Ghibli_style_Young_man_contemplates_walks_away_from_ivy-covered_yellow_building_12345.mp4 filter=lfs diff=lfs merge=lfs -text
  assets/examples/zeroscope_v2_576w-Ghibli-LoRA/4/Studio_Ghibli_style_Two_women_walk_down_coastal_village_path_toward_sea_passing_colorful_houses_sailboats_visible_100.mp4 filter=lfs diff=lfs merge=lfs -text
+ apps/assets/examples/zeroscope_v2_576w-Ghibli-LoRA/1/A_dog_is_running_with_Ghibli_style_42.mp4 filter=lfs diff=lfs merge=lfs -text
+ apps/assets/examples/zeroscope_v2_576w-Ghibli-LoRA/3/Studio_Ghibli_style_Young_man_contemplates_walks_away_from_ivy-covered_yellow_building_12345.mp4 filter=lfs diff=lfs merge=lfs -text
+ apps/assets/examples/zeroscope_v2_576w-Ghibli-LoRA/4/Studio_Ghibli_style_Two_women_walk_down_coastal_village_path_toward_sea_passing_colorful_houses_sailboats_visible_100.mp4 filter=lfs diff=lfs merge=lfs -text
+ assets/examples/zeroscope_v2_576w-Ghibli-LoRA/1/A_dog_is_running_with_Ghibli_style_42.gif filter=lfs diff=lfs merge=lfs -text
+ assets/examples/zeroscope_v2_576w-Ghibli-LoRA/2/A_girl_is_walking_with_Ghibli_style_0.gif filter=lfs diff=lfs merge=lfs -text
+ assets/examples/zeroscope_v2_576w-Ghibli-LoRA/3/Studio_Ghibli_style_Young_man_contemplates_walks_away_from_ivy-covered_yellow_building_12345.gif filter=lfs diff=lfs merge=lfs -text
+ assets/examples/zeroscope_v2_576w-Ghibli-LoRA/4/Studio_Ghibli_style_Two_women_walk_down_coastal_village_path_toward_sea_passing_colorful_houses_sailboats_visible_100.gif filter=lfs diff=lfs merge=lfs -text
.python-version ADDED
@@ -0,0 +1 @@
+ 3.11.13
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2025 Danh Tran
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
apps/assets/examples/zeroscope_v2_576w-Ghibli-LoRA/1/A_dog_is_running_with_Ghibli_style_42.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a77fed344231dc9e9cf3b271646183b84c2edbe94cd15bf2d2b192cec9ac89ae
+ size 288959
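The three lines above are a Git LFS pointer file, not the video itself; the actual bytes are resolved through the `filter=lfs` rules added to `.gitattributes`. A minimal Python sketch, assuming only the three-field `version` / `oid` / `size` layout shown in this diff, for reading such a pointer:

```python
# Sketch only: parse a Git LFS pointer file like the one added above.
# Assumes the "version / oid / size" key-value layout shown in this diff.
def parse_lfs_pointer(path: str) -> dict:
    fields = {}
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            key, _, value = line.strip().partition(" ")
            if key and value:
                fields[key] = value
    return fields

# e.g. parse_lfs_pointer("apps/assets/examples/zeroscope_v2_576w-Ghibli-LoRA/1/A_dog_is_running_with_Ghibli_style_42.mp4")
# -> {"version": "https://git-lfs.github.com/spec/v1", "oid": "sha256:a77f...", "size": "288959"}
```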
apps/assets/examples/zeroscope_v2_576w-Ghibli-LoRA/1/config.json ADDED
@@ -0,0 +1,15 @@
+ {
+ "prompt": "A dog is running with Ghibli style",
+ "negative-prompt": "ugly, noise, fragment, blur, static video",
+ "height": 512,
+ "width": 288,
+ "num-frames": 24,
+ "num-steps": 50,
+ "guidance_scale": 12,
+ "fps": 16,
+ "lora_rank": 64,
+ "lora_scale": 1.0,
+ "noise_prior": 0.0,
+ "seed": 42,
+ "video": "A_dog_is_running_with_Ghibli_style_42.mp4"
+ }
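Each numbered example folder pairs a rendered video with a `config.json` like the one above. Note the mixed key styles ("num-frames", "negative-prompt" versus "guidance_scale", "lora_rank"); `apps/gradio_app.py` below reads both spellings verbatim with `config.get(...)`, so they must be preserved. A minimal sketch of loading one of these configs:

```python
import json

# Sketch only: load example 1's config the same way the Gradio app does.
path = "apps/assets/examples/zeroscope_v2_576w-Ghibli-LoRA/1/config.json"
with open(path, "r") as f:
    config = json.load(f)

# Hyphenated and underscored keys coexist in these files.
print(config["prompt"], config["num-frames"], config["guidance_scale"])
```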
apps/assets/examples/zeroscope_v2_576w-Ghibli-LoRA/2/A_girl_is_walking_with_Ghibli_style_0.mp4 ADDED
Binary file (60.9 kB).
apps/assets/examples/zeroscope_v2_576w-Ghibli-LoRA/2/config.json ADDED
@@ -0,0 +1,15 @@
+ {
+ "prompt": "A girl is walking with Ghibli style",
+ "negative-prompt": "ugly, noise, fragment, blur, static video",
+ "height": 384,
+ "width": 384,
+ "num-frames": 28,
+ "num-steps": 50,
+ "guidance_scale": 15,
+ "fps": 16,
+ "lora_rank": 128,
+ "lora_scale": 0.8,
+ "noise_prior": 0.3,
+ "seed": 0,
+ "video": "A_girl_is_walking_with_Ghibli_style_0.mp4"
+ }
apps/assets/examples/zeroscope_v2_576w-Ghibli-LoRA/3/Studio_Ghibli_style_Young_man_contemplates_walks_away_from_ivy-covered_yellow_building_12345.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6a833dba0ad2cd264556d40e586b3a1cb7656e0239f9cae30f82ea635ed75d3b
+ size 156033
apps/assets/examples/zeroscope_v2_576w-Ghibli-LoRA/3/config.json ADDED
@@ -0,0 +1,15 @@
+ {
+ "prompt": "Studio Ghibli style. Young man contemplates, walks away from ivy-covered yellow building.",
+ "negative-prompt": "ugly, noise, fragment, blur, static video",
+ "height": 384,
+ "width": 384,
+ "num-frames": 28,
+ "num-steps": 50,
+ "guidance_scale": 15,
+ "fps": 16,
+ "lora_rank": 32,
+ "lora_scale": 0.9,
+ "noise_prior": 0.3,
+ "seed": 12345,
+ "video": "Studio_Ghibli_style_Young_man_contemplates_walks_away_from_ivy-covered_yellow_building_12345.mp4"
+ }
apps/assets/examples/zeroscope_v2_576w-Ghibli-LoRA/4/Studio_Ghibli_style_Two_women_walk_down_coastal_village_path_toward_sea_passing_colorful_houses_sailboats_visible_100.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cd172829c209c5cc4fc064190891161a1145adceda7766676b8b8d8d57100156
+ size 134892
apps/assets/examples/zeroscope_v2_576w-Ghibli-LoRA/4/config.json ADDED
@@ -0,0 +1,15 @@
+ {
+ "prompt": "Studio Ghibli style. Two women walk down coastal village path toward sea, passing colorful houses, sailboats visible.",
+ "negative-prompt": "ugly, noise, fragment, blur, static video",
+ "height": 512,
+ "width": 512,
+ "num-frames": 16,
+ "num-steps": 50,
+ "guidance_scale": 30,
+ "fps": 16,
+ "lora_rank": 96,
+ "lora_scale": 0.7,
+ "noise_prior": 0.1,
+ "seed": 100,
+ "video": "Studio_Ghibli_style_Two_women_walk_down_coastal_village_path_toward_sea_passing_colorful_houses_sailboats_visible_100.mp4"
+ }
apps/gradio_app.py ADDED
@@ -0,0 +1,136 @@
1
+ import os
2
+ import gradio as gr
3
+ import json
4
+ from gradio_app.inference import run_inference, run_setup_script
5
+
6
+ def create_app():
7
+ # Run setup script at startup
8
+ setup_output = run_setup_script()
9
+ # Load CSS file
10
+ CSS = open("apps/gradio_app/static/styles.css", "r").read()
11
+
12
+ with gr.Blocks(css=CSS) as app:
13
+ gr.HTML('<script src="file=apps/gradio_app/static/scripts.js"></script>')
14
+ gr.Markdown(
15
+ """
16
+ # Text to Video Ghibli style
17
+ Generate videos using the `zeroscope_v2_576w` model with Studio Ghibli style LoRA.
18
+ """
19
+ )
20
+
21
+ with gr.Row(elem_classes="row-container"):
22
+ with gr.Column(elem_classes="column-container"):
23
+ model_path = gr.Dropdown(
24
+ label="Base Model",
25
+ choices=["./ckpts/zeroscope_v2_576w"],
26
+ value="./ckpts/zeroscope_v2_576w"
27
+ )
28
+ checkpoint_folder = gr.Dropdown(
29
+ label="LoRA folder",
30
+ choices=["./ckpts/zeroscope_v2_576w-Ghibli-LoRA"],
31
+ value="./ckpts/zeroscope_v2_576w-Ghibli-LoRA"
32
+ )
33
+ prompt = gr.Textbox(
34
+ label="Prompt",
35
+ value="Studio Ghibli style. Two women walk down coastal village path toward sea, passing colorful houses, sailboats visible."
36
+ )
37
+ negative_prompt = gr.Textbox(
38
+ label="Negative Prompt",
39
+ value="ugly, noise, fragment, blur, static video"
40
+ )
41
+
42
+ # Video Dimensions & Timing
43
+ with gr.Row(elem_classes="slider-row"):
44
+ with gr.Group(elem_classes="slider-group"):
45
+ gr.Markdown("### Video Dimensions & Timing")
46
+ width = gr.Slider(label="Width", minimum=256, maximum=1024, step=8, value=512)
47
+ height = gr.Slider(label="Height", minimum=256, maximum=1024, step=8, value=512)
48
+ num_frames = gr.Slider(label="Number of Frames", minimum=8, maximum=64, step=1, value=16)
49
+ fps = gr.Slider(label="FPS", minimum=10, maximum=60, step=1, value=16)
50
+ seed = gr.Number(label="Seed", value=100)
51
+
52
+ generate_btn = gr.Button("Generate Video", elem_classes="generate-btn")
53
+
54
+ with gr.Column(elem_classes="column-container"):
55
+ video_output = gr.Video(label="Generated Video")
56
+ log_output = gr.Textbox(label="Logs", lines=3, max_lines=20)
57
+
58
+ # Model Parameters
59
+ with gr.Row(elem_classes="slider-row"):
60
+ with gr.Group(elem_classes="slider-group"):
61
+ gr.Markdown("### Model Parameters")
62
+ num_steps = gr.Slider(label="Number of Steps", minimum=10, maximum=100, step=1, value=50)
63
+ guidance_scale = gr.Slider(label="Guidance Scale", minimum=1.0, maximum=50.0, step=0.1, value=30.0)
64
+ lora_rank = gr.Slider(label="LoRA Rank", minimum=16, maximum=128, step=8, value=96)
65
+ lora_scale = gr.Slider(label="LoRA Scale", minimum=0.1, maximum=1.0, step=0.1, value=0.7)
66
+ noise_prior = gr.Slider(label="Noise Prior", minimum=0.0, maximum=1.0, step=0.01, value=0.1)
67
+
68
+ # Example Buttons Section
69
+ gr.Markdown("## Example Configurations")
70
+ example_base_path = "apps/assets/examples/zeroscope_v2_576w-Ghibli-LoRA"
71
+ example_buttons = []
72
+ configs = []
73
+
74
+ for i in range(1, 5):
75
+ example_dir = os.path.join(example_base_path, str(i))
76
+ config_path = os.path.join(example_dir, "config.json")
77
+ if os.path.exists(config_path):
78
+ with open(config_path, "r") as f:
79
+ config = json.load(f)
80
+ video_path = os.path.join(example_dir, config["video"])
81
+ if os.path.exists(video_path):
82
+ configs.append((config, video_path))
83
+ example_buttons.append(gr.Button(f"Load Example {i}"))
84
+
85
+ def create_example_fn(config, video_path):
86
+ def load_example():
87
+ return [
88
+ "./ckpts/zeroscope_v2_576w", # model_path
89
+ "./ckpts/zeroscope_v2_576w-Ghibli-LoRA", # checkpoint_folder
90
+ config.get("prompt", ""),
91
+ config.get("negative-prompt", ""),
92
+ config.get("width", 512),
93
+ config.get("height", 512),
94
+ config.get("num-frames", 16),
95
+ config.get("num-steps", 50),
96
+ config.get("guidance_scale", 30.0),
97
+ config.get("fps", 16),
98
+ config.get("lora_rank", 96),
99
+ config.get("lora_scale", 0.7),
100
+ config.get("noise_prior", 0.1),
101
+ config.get("seed", 100),
102
+ video_path, # video_output
103
+ f"Loaded example with prompt: {config.get('prompt', '')}" # log_output
104
+ ]
105
+ return load_example
106
+
107
+ for btn, (config, video_path) in zip(example_buttons, configs):
108
+ btn.click(
109
+ fn=create_example_fn(config, video_path),
110
+ inputs=[],
111
+ outputs=[
112
+ model_path, checkpoint_folder, prompt, negative_prompt,
113
+ width, height, num_frames, num_steps, guidance_scale,
114
+ fps, lora_rank, lora_scale, noise_prior, seed,
115
+ video_output, log_output
116
+ ]
117
+ )
118
+
119
+ generate_btn.click(
120
+ fn=run_inference,
121
+ inputs=[
122
+ model_path, checkpoint_folder, prompt, negative_prompt,
123
+ width, height, num_frames, num_steps, guidance_scale,
124
+ fps, lora_rank, lora_scale, noise_prior, seed
125
+ ],
126
+ outputs=[video_output, log_output]
127
+ )
128
+
129
+ gr.Markdown("""
130
+ This repository is trained from [![GitHub Repo](https://img.shields.io/badge/GitHub-danhtran2mind%2FMotionDirector-blue?style=flat)](https://github.com/danhtran2mind/MotionDirector), a fork of [![GitHub Repo](https://img.shields.io/badge/GitHub-showlab%2FMotionDirector-blue?style=flat)](https://github.com/showlab/MotionDirector), with numerous bug fixes and rewritten code for improved performance and stability.
131
+ """)
132
+ return app
133
+
134
+ if __name__ == "__main__":
135
+ app = create_app()
136
+ app.launch()
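One detail of `apps/gradio_app.py` worth noting: the example buttons are wired through `create_example_fn(config, video_path)`, a factory that returns the click handler, rather than a lambda defined inside the loop. This sidesteps Python's late-binding closures, where every in-loop lambda would otherwise capture the last `config`. A standalone sketch of the difference (illustrative values, not from the repo):

```python
# Sketch only: why a factory is used instead of an in-loop lambda.
configs = ["config_1", "config_2", "config_3"]

late_bound = [lambda: cfg for cfg in configs]                    # every call sees the last cfg
via_factory = [(lambda c: (lambda: c))(cfg) for cfg in configs]  # each call keeps its own cfg

print([f() for f in late_bound])   # ['config_3', 'config_3', 'config_3']
print([f() for f in via_factory])  # ['config_1', 'config_2', 'config_3']
```

Launching the demo locally is presumably just `python apps/gradio_app.py`, since the module runs the setup script inside `create_app()` and calls `app.launch()` under `__main__`.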
apps/gradio_app/__init__.py ADDED
File without changes
apps/gradio_app/abc.py ADDED
File without changes
apps/gradio_app/inference.py ADDED
@@ -0,0 +1,104 @@
1
+ import sys
2
+ import os
3
+ import subprocess
4
+ from pathlib import Path
5
+ import uuid
6
+ import torch
7
+
8
+ # Append the current directory to sys.path
9
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
10
+
11
+ def run_setup_script():
12
+ setup_script = os.path.join(os.path.dirname(__file__), "setup_scripts.py")
13
+ try:
14
+ result = subprocess.run(["python", setup_script], capture_output=True, text=True, check=True)
15
+ return result.stdout
16
+ except subprocess.CalledProcessError as e:
17
+ return f"Setup script failed: {e.stderr}"
18
+
19
+ def run_inference(
20
+ model_path="./ckpts/zeroscope_v2_576w",
21
+ checkpoint_folder="./ckpts/zeroscope_v2_576w-Ghibli-LoRA",
22
+ prompt="Studio Ghibli style. Two women walk down coastal village path toward sea, passing colorful houses, sailboats visible.",
23
+ negative_prompt="ugly, noise, fragment, blur, static video",
24
+ width=256,
25
+ height=256,
26
+ num_frames=8,
27
+ num_steps=30,
28
+ guidance_scale=30.0,
29
+ fps=8,
30
+ lora_rank=32,
31
+ lora_scale=0.7,
32
+ noise_prior=0.1,
33
+ # device="cuda",
34
+ seed=100
35
+ ):
36
+ print("Start Inference")
37
+ output_dir = "apps/gradio_app/temp_data"
38
+ os.makedirs(output_dir, exist_ok=True)
39
+
40
+ # Get list of files in output_dir
41
+ for file_name in os.listdir(output_dir):
42
+ # Check if file ends with .mp4
43
+ if file_name.endswith(".mp4"):
44
+ # Remove the file
45
+ os.remove(os.path.join(output_dir, file_name))
46
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
47
+ command = [
48
+ "python", "src/third_party/MotionDirector/main_inference.py",
49
+ "--model", model_path,
50
+ "--checkpoint_folder", checkpoint_folder,
51
+ "--prompt", prompt,
52
+ "--negative-prompt", negative_prompt,
53
+ "--width", str(width),
54
+ "--height", str(height),
55
+ "--num-frames", str(num_frames),
56
+ "--num-steps", str(num_steps),
57
+ "--guidance-scale", str(guidance_scale),
58
+ "--fps", str(fps),
59
+ "--lora_rank", str(lora_rank),
60
+ "--lora_scale", str(lora_scale),
61
+ "--noise_prior", str(noise_prior),
62
+ "--device", device,
63
+ "--seed", str(seed),
64
+ "--output_dir", output_dir,
65
+ "--no-prompt-name"
66
+ ]
67
+
68
+ # Use Popen to execute the command
69
+ process = subprocess.Popen(
70
+ command,
71
+ stdout=subprocess.PIPE,
72
+ stderr=subprocess.PIPE,
73
+ text=True,
74
+ bufsize=1 # Line buffering
75
+ )
76
+
77
+ # Read output line-by-line in real-time
78
+ output_lines = []
79
+ try:
80
+ for line in process.stdout:
81
+ output_lines.append(line.strip())
82
+ except Exception as e:
83
+ return None, f"Error reading output: {str(e)}"
84
+
85
+ # Capture stderr and wait for process to complete
86
+ stderr_output = process.communicate()[1]
87
+ if process.returncode != 0:
88
+ return None, f"Error: {stderr_output.strip()}"
89
+
90
+ # Check for MP4 files in output directory
91
+ output_file = [f for f in os.listdir(output_dir) if f.lower().endswith('.mp4')]
92
+ if output_file:
93
+ output_path = os.path.join(output_dir, output_file[-1])
94
+ if os.path.exists(output_path):
95
+ return output_path, "\n".join(output_lines)
96
+ else:
97
+ return None, f"Video file not found at {output_path}\nLogs:\n" + "\n".join(output_lines)
98
+ return None, f"No MP4 files found in {output_dir}\nLogs:\n" + "\n".join(output_lines)
99
+
100
+ if __name__ == "__main__":
101
+ # Example usage
102
+ video_path, logs = run_inference()
103
+ print(f"Generated Video: {video_path}")
104
+ print(f"Logs: {logs}")
apps/gradio_app/new-inference.py ADDED
@@ -0,0 +1,104 @@
1
+ import sys
2
+ import os
3
+ import subprocess
4
+ from pathlib import Path
5
+ import uuid
6
+ import torch
7
+
8
+ # Append the current directory to sys.path
9
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
10
+
11
+ def run_setup_script():
12
+ setup_script = os.path.join(os.path.dirname(__file__), "setup_scripts.py")
13
+ try:
14
+ result = subprocess.run(["python", setup_script], capture_output=True, text=True, check=True)
15
+ return result.stdout
16
+ except subprocess.CalledProcessError as e:
17
+ return f"Setup script failed: {e.stderr}"
18
+
19
+ def run_inference(
20
+ model_path="./ckpts/zeroscope_v2_576w",
21
+ checkpoint_folder="./ckpts/zeroscope_v2_576w-Ghibli-LoRA",
22
+ prompt="Studio Ghibli style. Two women walk down coastal village path toward sea, passing colorful houses, sailboats visible.",
23
+ negative_prompt="ugly, noise, fragment, blur, static video",
24
+ width=256,
25
+ height=256,
26
+ num_frames=8,
27
+ num_steps=30,
28
+ guidance_scale=30.0,
29
+ fps=8,
30
+ lora_rank=32,
31
+ lora_scale=0.7,
32
+ noise_prior=0.1,
33
+ device="cuda",
34
+ seed=100
35
+ ):
36
+ print("Start Inference")
37
+ output_dir = "apps/gradio_app/temp_data"
38
+ os.makedirs(output_dir, exist_ok=True)
39
+
40
+ # Get list of files in output_dir
41
+ for file_name in os.listdir(output_dir):
42
+ # Check if file ends with .mp4
43
+ if file_name.endswith(".mp4"):
44
+ # Remove the file
45
+ os.remove(os.path.join(output_dir, file_name))
46
+
47
+ command = [
48
+ "python", "src/third_party/MotionDirector/main_inference.py",
49
+ "--model", model_path,
50
+ "--checkpoint_folder", checkpoint_folder,
51
+ "--prompt", prompt,
52
+ "--negative-prompt", negative_prompt,
53
+ "--width", str(width),
54
+ "--height", str(height),
55
+ "--num-frames", str(num_frames),
56
+ "--num-steps", str(num_steps),
57
+ "--guidance-scale", str(guidance_scale),
58
+ "--fps", str(fps),
59
+ "--lora_rank", str(lora_rank),
60
+ "--lora_scale", str(lora_scale),
61
+ "--noise_prior", str(noise_prior),
62
+ "--device", device,
63
+ "--seed", str(seed),
64
+ "--output_dir", output_dir,
65
+ "--no-prompt-name"
66
+ ]
67
+
68
+ # Use Popen to execute the command
69
+ process = subprocess.Popen(
70
+ command,
71
+ stdout=subprocess.PIPE,
72
+ stderr=subprocess.PIPE,
73
+ text=True,
74
+ bufsize=1 # Line buffering
75
+ )
76
+
77
+ # Read output line-by-line in real-time
78
+ output_lines = []
79
+ try:
80
+ for line in process.stdout:
81
+ output_lines.append(line.strip())
82
+ except Exception as e:
83
+ return None, f"Error reading output: {str(e)}"
84
+
85
+ # Capture stderr and wait for process to complete
86
+ stderr_output = process.communicate()[1]
87
+ if process.returncode != 0:
88
+ return None, f"Error: {stderr_output.strip()}"
89
+
90
+ # Check for MP4 files in output directory
91
+ output_file = [f for f in os.listdir(output_dir) if f.lower().endswith('.mp4')]
92
+ if output_file:
93
+ output_path = os.path.join(output_dir, output_file[-1])
94
+ if os.path.exists(output_path):
95
+ return output_path, "\n".join(output_lines)
96
+ else:
97
+ return None, f"Video file not found at {output_path}\nLogs:\n" + "\n".join(output_lines)
98
+ return None, f"No MP4 files found in {output_dir}\nLogs:\n" + "\n".join(output_lines)
99
+
100
+ if __name__ == "__main__":
101
+ # Example usage
102
+ video_path, logs = run_inference(device="cpu" if not torch.cuda.is_available() else "cuda")
103
+ print(f"Generated Video: {video_path}")
104
+ print(f"Logs: {logs}")
apps/gradio_app/old-inference.py ADDED
@@ -0,0 +1,73 @@
1
+ import os
2
+ import sys
3
+ import subprocess
4
+ from pathlib import Path
5
+ import uuid
6
+ import torch
7
+
8
+ # Append the current directory to sys.path
9
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
10
+
11
+ def run_setup_script():
12
+ setup_script = os.path.join(os.path.dirname(__file__), "setup_scripts.py")
13
+ try:
14
+ result = subprocess.run(["python", setup_script], capture_output=True, text=True, check=True)
15
+ return result.stdout
16
+ except subprocess.CalledProcessError as e:
17
+ return f"Setup script failed: {e.stderr}"
18
+
19
+ def run_inference(
20
+ model_path="./ckpts/zeroscope_v2_576w",
21
+ checkpoint_folder="./ckpts/zeroscope_v2_576w-Ghibli-LoRA",
22
+ prompt="Studio Ghibli style. Two women walk down coastal village path toward sea, passing colorful houses, sailboats visible.",
23
+ negative_prompt="ugly, noise, fragment, blur, static video",
24
+ width=512,
25
+ height=512,
26
+ num_frames=16,
27
+ num_steps=50,
28
+ guidance_scale=30.0,
29
+ fps=16,
30
+ lora_rank=96,
31
+ lora_scale=0.7,
32
+ noise_prior=0.1,
33
+ device="cuda",
34
+ seed=100
35
+ ):
36
+ output_dir = "apps/gradio_app/temp_data"
37
+ os.makedirs(output_dir, exist_ok=True)
38
+
39
+ command = [
40
+ "python", "src/third_party/MotionDirector/main_inference.py",
41
+ "--model", model_path,
42
+ "--checkpoint_folder", checkpoint_folder,
43
+ "--prompt", prompt,
44
+ "--negative-prompt", negative_prompt,
45
+ "--width", str(width),
46
+ "--height", str(height),
47
+ "--num-frames", str(num_frames),
48
+ "--num-steps", str(num_steps),
49
+ "--guidance-scale", str(guidance_scale),
50
+ "--fps", str(fps),
51
+ "--lora_rank", str(lora_rank),
52
+ "--lora_scale", str(lora_scale),
53
+ "--noise_prior", str(noise_prior),
54
+ "--device", device,
55
+ "--seed", str(seed),
56
+ "--output_dir", output_dir,
57
+ "--no-prompt-name"
58
+ ]
59
+
60
+ output_file = [f for f in os.listdir(output_dir) if f.lower().endswith('.mp4')]
61
+ print(os.path.join(output_dir, output_file[0]) if output_file else "No MP4 files found.")
62
+
63
+ try:
64
+ result = subprocess.run(command, capture_output=True, text=True, check=True)
65
+ return str(output_file), result.stdout
66
+ except subprocess.CalledProcessError as e:
67
+ return None, f"Error: {e.stderr}"
68
+
69
+ if __name__ == "__main__":
70
+ # Example usage
71
+ video, logs = run_inference(device="cpu" if not torch.cuda.is_available() else "cuda")
72
+ print(f"Generated Video: {video}")
73
+ print(f"Logs: {logs}")
apps/gradio_app/setup_scripts.py ADDED
@@ -0,0 +1,46 @@
1
+ import subprocess
2
+ import sys
3
+ import os
4
+
5
+ def run_script(script_path):
6
+ """
7
+ Run a Python script using subprocess and handle potential errors.
8
+ Returns True if successful, False otherwise.
9
+ """
10
+ try:
11
+ result = subprocess.run(
12
+ [sys.executable, script_path],
13
+ check=True,
14
+ text=True,
15
+ capture_output=True
16
+ )
17
+ print(f"Successfully executed {script_path}")
18
+ print(result.stdout)
19
+ return True
20
+ except subprocess.CalledProcessError as e:
21
+ print(f"Error executing {script_path}:")
22
+ print(e.stderr)
23
+ return False
24
+ except FileNotFoundError:
25
+ print(f"Script not found: {script_path}")
26
+ return False
27
+
28
+ def main():
29
+ """
30
+ Main function to execute setup_third_party.py and download_ckpts.py in sequence.
31
+ """
32
+ scripts_dir = "scripts"
33
+ scripts = [
34
+ os.path.join(scripts_dir, "setup_third_party.py"),
35
+ os.path.join(scripts_dir, "download_ckpts.py")
36
+ ]
37
+
38
+ for script in scripts:
39
+ print(f"Start running {script}\n")
40
+ if not run_script(script):
41
+ print(f"Stopping execution due to error in {script}")
42
+ sys.exit(1)
43
+ print(f"Completed {script}\n")
44
+
45
+ if __name__ == "__main__":
46
+ main()
apps/gradio_app/static/__init__.py ADDED
File without changes
apps/gradio_app/static/scripts.js ADDED
@@ -0,0 +1,50 @@
1
+ document.addEventListener('DOMContentLoaded', () => {
2
+ // Add loading animation to generate button
3
+ const generateBtn = document.querySelector('.generate-btn');
4
+ if (generateBtn) {
5
+ generateBtn.addEventListener('click', () => {
6
+ generateBtn.textContent = 'Generating...';
7
+ generateBtn.disabled = true;
8
+ generateBtn.style.opacity = '0.7';
9
+
10
+ // Reset button after 2 seconds (simulating async operation)
11
+ setTimeout(() => {
12
+ generateBtn.textContent = 'Generate Video';
13
+ generateBtn.disabled = false;
14
+ generateBtn.style.opacity = '1';
15
+ }, 2000);
16
+ });
17
+ }
18
+
19
+ // Add input validation feedback
20
+ const inputs = document.querySelectorAll('input[type="text"]');
21
+ inputs.forEach(input => {
22
+ input.addEventListener('input', () => {
23
+ if (input.value.trim() === '') {
24
+ input.style.borderColor = '#e53e3e';
25
+ } else {
26
+ input.style.borderColor = '#4c51bf';
27
+ }
28
+ });
29
+ });
30
+
31
+ // Add subtle animation to sliders
32
+ const sliders = document.querySelectorAll('input[type="range"]');
33
+ sliders.forEach(slider => {
34
+ slider.addEventListener('input', () => {
35
+ slider.style.transform = 'scale(1.02)';
36
+ setTimeout(() => {
37
+ slider.style.transform = 'scale(1)';
38
+ }, 200);
39
+ });
40
+ });
41
+
42
+ // Auto-resize textarea
43
+ const textarea = document.querySelector('textarea');
44
+ if (textarea) {
45
+ textarea.addEventListener('input', () => {
46
+ textarea.style.height = 'auto';
47
+ textarea.style.height = `${textarea.scrollHeight}px`;
48
+ });
49
+ }
50
+ });
apps/gradio_app/static/styles.css ADDED
@@ -0,0 +1,154 @@
1
+ :root {
2
+ --primary-color: #007bff;
3
+ --secondary-color: #6c757d;
4
+ --background-light: #f8f9fa;
5
+ --background-dark: #1a1a1a;
6
+ --text-light: #212529;
7
+ --text-dark: #e9ecef;
8
+ --accent-color: #28a745;
9
+ --border-color-light: #dee2e6;
10
+ --border-color-dark: #343a40;
11
+ --button-hover-light: #0056b3;
12
+ --button-hover-dark: #4dabf7;
13
+ --shadow-light: rgba(0, 0, 0, 0.1);
14
+ --shadow-dark: rgba(255, 255, 255, 0.1);
15
+ }
16
+
17
+ body {
18
+ font-family: 'Inter', -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
19
+ margin: 0;
20
+ padding: 20px;
21
+ transition: background-color 0.3s, color 0.3s;
22
+ }
23
+
24
+ .light-theme {
25
+ background-color: var(--background-light);
26
+ color: var(--text-light);
27
+ }
28
+
29
+ .dark-theme {
30
+ background-color: var(--background-dark);
31
+ color: var(--text-dark);
32
+ }
33
+
34
+ .row-container {
35
+ display: flex;
36
+ gap: 20px;
37
+ margin-bottom: 20px;
38
+ }
39
+
40
+ .column-container {
41
+ background: var(--background-light);
42
+ border: 1px solid var(--border-color-light);
43
+ border-radius: 8px;
44
+ padding: 20px;
45
+ box-shadow: 0 2px 4px var(--shadow-light);
46
+ transition: background-color 0.3s, border-color 0.3s, box-shadow 0.3s;
47
+ }
48
+
49
+ .dark-theme .column-container {
50
+ background: var(--background-dark);
51
+ border-color: var(--border-color-dark);
52
+ box-shadow: 0 2px 4px var(--shadow-dark);
53
+ }
54
+
55
+ .slider-row {
56
+ margin: 10px 0;
57
+ }
58
+
59
+ .slider-group {
60
+ background: rgba(255, 255, 255, 0.5);
61
+ border-radius: 6px;
62
+ padding: 15px;
63
+ border: 1px solid var(--border-color-light);
64
+ transition: background-color 0.3s, border-color 0.3s;
65
+ }
66
+
67
+ .dark-theme .slider-group {
68
+ background: rgba(0, 0, 0, 0.3);
69
+ border-color: var(--border-color-dark);
70
+ }
71
+
72
+ .generate-btn {
73
+ background-color: var(--primary-color);
74
+ color: white;
75
+ border: none;
76
+ padding: 12px 24px;
77
+ border-radius: 6px;
78
+ font-size: 16px;
79
+ font-weight: 500;
80
+ cursor: pointer;
81
+ transition: background-color 0.3s, transform 0.2s;
82
+ }
83
+
84
+ .generate-btn:hover {
85
+ background-color: var(--button-hover-light);
86
+ transform: translateY(-2px);
87
+ }
88
+
89
+ .dark-theme .generate-btn {
90
+ background-color: var(--primary-color);
91
+ }
92
+
93
+ .dark-theme .generate-btn:hover {
94
+ background-color: var(--button-hover-dark);
95
+ }
96
+
97
+ .gr-button, .gr-textbox, .gr-slider, .gr-dropdown, .gr-number, .gr-video, .gr-markdown {
98
+ border-radius: 6px !important;
99
+ border: 1px solid var(--border-color-light) !important;
100
+ transition: border-color 0.3s, background-color 0.3s;
101
+ }
102
+
103
+ .dark-theme .gr-button,
104
+ .dark-theme .gr-textbox,
105
+ .dark-theme .gr-slider,
106
+ .dark-theme .gr-dropdown,
107
+ .dark-theme .gr-number,
108
+ .dark-theme .gr-video,
109
+ .dark-theme .gr-markdown {
110
+ border-color: var(--border-color-dark) !important;
111
+ background-color: rgba(255, 255, 255, 0.05) !important;
112
+ }
113
+
114
+ .gr-textbox input, .gr-number input {
115
+ background: transparent !important;
116
+ color: inherit !important;
117
+ font-size: 14px;
118
+ }
119
+
120
+ .gr-slider input[type="range"] {
121
+ accent-color: var(--primary-color);
122
+ }
123
+
124
+ .gr-dropdown select {
125
+ background: transparent !important;
126
+ color: inherit !important;
127
+ padding: 8px;
128
+ }
129
+
130
+ .gr-markdown h1, .gr-markdown h2, .gr-markdown h3 {
131
+ font-weight: 600;
132
+ margin-bottom: 10px;
133
+ color: var(--primary-color);
134
+ }
135
+
136
+ .dark-theme .gr-markdown h1,
137
+ .dark-theme .gr-markdown h2,
138
+ .dark-theme .gr-markdown h3 {
139
+ color: var(--button-hover-dark);
140
+ }
141
+
142
+ @media (max-width: 768px) {
143
+ .row-container {
144
+ flex-direction: column;
145
+ }
146
+
147
+ .column-container {
148
+ padding: 15px;
149
+ }
150
+
151
+ .generate-btn {
152
+ width: 100%;
153
+ }
154
+ }
assets/examples/zeroscope_v2_576w-Ghibli-LoRA/1/A_dog_is_running_with_Ghibli_style_42.gif ADDED

Git LFS Details

  • SHA256: ca16026bdc19faed0d40507c059fc882455eee05f106a1ecca9e4438a366f68e
  • Pointer size: 132 Bytes
  • Size of remote file: 1.38 MB
assets/examples/zeroscope_v2_576w-Ghibli-LoRA/2/A_girl_is_walking_with_Ghibli_style_0.gif ADDED

Git LFS Details

  • SHA256: 0254e790e96f25d81910e2eedaedb628931294a38a9a2914f12a12694933c2a8
  • Pointer size: 131 Bytes
  • Size of remote file: 315 kB
assets/examples/zeroscope_v2_576w-Ghibli-LoRA/2/config.json CHANGED
@@ -4,7 +4,7 @@
  "height": 384,
  "width": 384,
  "num-frames": 28,
- "num-steps": 50
+ "num-steps": 50,
  "guidance_scale": 15,
  "fps": 16,
  "lora_rank": 128,
assets/examples/zeroscope_v2_576w-Ghibli-LoRA/3/Studio_Ghibli_style_Young_man_contemplates_walks_away_from_ivy-covered_yellow_building_12345.gif ADDED

Git LFS Details

  • SHA256: 130fffec21d318bf17095cb30f85fd36f7a639c70b6bbfa9333db96e254be0a1
  • Pointer size: 131 Bytes
  • Size of remote file: 921 kB
assets/examples/zeroscope_v2_576w-Ghibli-LoRA/3/config.json CHANGED
@@ -4,7 +4,7 @@
  "height": 384,
  "width": 384,
  "num-frames": 28,
- "num-steps": 50
+ "num-steps": 50,
  "guidance_scale": 15,
  "fps": 16,
  "lora_rank": 32,
assets/examples/zeroscope_v2_576w-Ghibli-LoRA/4/Studio_Ghibli_style_Two_women_walk_down_coastal_village_path_toward_sea_passing_colorful_houses_sailboats_visible_100.gif ADDED

Git LFS Details

  • SHA256: 1a81aa08926211ac88420151497ca422469cc102a332cfd5b3865693e7ee005a
  • Pointer size: 132 Bytes
  • Size of remote file: 1.07 MB
assets/examples/zeroscope_v2_576w-Ghibli-LoRA/4/config.json CHANGED
@@ -4,7 +4,7 @@
  "height": 512,
  "width": 512,
  "num-frames": 16,
- "num-steps": 50
+ "num-steps": 50,
  "guidance_scale": 30,
  "fps": 16,
  "lora_rank": 96,
configs/config_multi_videos.yaml ADDED
@@ -0,0 +1,131 @@
1
+ # Pretrained diffusers model path.
2
+ pretrained_model_path: "./ckpts/zeroscope_v2_576w"
3
+ # pretrained_model_path: "./ckpts/text-to-video-ms-1.7b"
4
+ # The folder where your training outputs will be placed.
5
+ output_dir: "./zeroscope_v2_576w-Ghibli-LoRA"
6
+ # resume_step: 500
7
+ # resume_from_checkpoint: "./zeroscope_v2_576w-Scenery_Anime_Bright-lora/train_2025-07-10T13-46-57"
8
+ # lora_path: "zeroscope_v2_576w-Scenery_Anime_Bright-lora/checkpoint-500" # This argument is used for training resumption
9
+ # lora_path: zeroscope_v2_576w-Ghibli-LoRA/train_2025-07-13T06-46-47/checkpoint-200
10
+
11
+ dataset_types:
12
+ - 'folder'
13
+
14
+ # Caches the latents (Frames-Image -> VAE -> Latent) to a HDD or SDD.
15
+ # The latents will be saved under your training folder, and loaded automatically for training.
16
+ # This both saves memory and speeds up training and takes very little disk space.
17
+ cache_latents: True
18
+
19
+
20
+ # If you have cached latents set to `True` and have a directory of cached latents,
21
+ # you can skip the caching process and load previously saved ones.
22
+ cached_latent_dir: null #/path/to/cached_latents
23
+ # cached_latent_dir: zeroscope_v2_576w-Ghibli-LoRA/train_2025-07-13T06-46-47/cached_latents
24
+
25
+ # Use LoRA for the UNET model.
26
+ use_unet_lora: True
27
+
28
+ # LoRA Dropout. This parameter adds the probability of randomly zeros out elements. Helps prevent overfitting.
29
+ # See: https://pytorch.org/docs/stable/generated/torch.nn.Dropout.html
30
+ lora_unet_dropout: 0.1
31
+
32
+ # Choose whether or not ito save the full pretrained model weights for both checkpoints and after training.
33
+ # The only time you want this off is if you're doing full LoRA training.
34
+ save_pretrained_model: True
35
+ # save_pretrained_model: True
36
+
37
+ # The rank for LoRA training. With ModelScope, the maximum should be 1024.
38
+ # VRAM increases with higher rank, lower when decreased.
39
+ lora_rank: 16
40
+
41
+ # Training data parameters
42
+ train_data:
43
+ # 'multiple videos'
44
+ path: "./data/ghibli/videos"
45
+ # The width and height in which you want your training data to be resized to.
46
+ width: 384
47
+ height: 384
48
+
49
+ # This will find the closest aspect ratio to your input width and height.
50
+ # For example, 512x512 width and height with a video of resolution 1280x720 will be resized to 512x256
51
+ use_bucketing: True
52
+ gradient_accumulation_steps: 2
53
+ batch_size: 1
54
+ # The start frame index where your videos should start (Leave this at one for json and folder based training).
55
+ sample_start_idx: 1
56
+
57
+ # Used for 'folder'. The rate at which your frames are sampled. Does nothing for 'json' and 'single_video' dataset.
58
+ fps: 16
59
+
60
+ # For 'single_video' and 'json'. The number of frames to "step" (1,2,3,4) (frame_step=2) -> (1,3,5,7, ...).
61
+ frame_step: 1
62
+
63
+ # The number of frames to sample. The higher this number, the higher the VRAM (acts similar to batch size).
64
+ n_sample_frames: 24
65
+
66
+ # The prompt when using a a single video file
67
+ # fallback_prompt: "A person is riding a bicycle."
68
+
69
+ # Validation data parameters.
70
+ validation_data:
71
+ # A custom prompt that is different from your training dataset.
72
+ prompt:
73
+ - "Studio Ghibli style. The video showcases a vibrant and lively scene set in the early."
74
+ - "Studio Ghibli style. A woman with black hair is holding a gun in her hand."
75
+
76
+ # Whether or not to sample preview during training (Requires more VRAM).
77
+ # sample_preview: True
78
+ sample_preview: False
79
+
80
+ # The number of frames to sample during validation.
81
+ num_frames: 24
82
+
83
+ # Height and width of validation sample.
84
+ width: 384
85
+ height: 384
86
+
87
+ # Number of inference steps when generating the video.
88
+ num_inference_steps: 15
89
+
90
+ # CFG scale
91
+ guidance_scale: 12
92
+
93
+ # scale of spatial LoRAs, default is 0
94
+ spatial_scale: 0
95
+
96
+ # scale of noise prior, i.e. the scale of inversion noises
97
+ noise_prior: 0
98
+
99
+ use_offset_noise: False
100
+ offset_noise_strength: 0.
101
+
102
+ # Learning rate for AdamW
103
+ learning_rate: 5e-4
104
+
105
+ # Weight decay. Higher = more regularization. Lower = closer to dataset.
106
+ adam_weight_decay: 1e-4
107
+
108
+ # Maximum number of train steps. Model is saved after training.
109
+ max_train_steps: 5000
110
+
111
+ # Saves a model every nth step.
112
+ checkpointing_steps: 5000
113
+
114
+ # How many steps to do for validation if sample_preview is enabled.
115
+ validation_steps: 5000
116
+
117
+ # Whether or not we want to use mixed precision with accelerate
118
+ mixed_precision: "fp16"
119
+ # mixed_precision: "no"
120
+
121
+ # Trades VRAM usage for speed. You lose roughly 20% of training speed, but save a lot of VRAM.
122
+ # If you need to save more VRAM, it can also be enabled for the text encoder, but reduces speed x2.
123
+ gradient_checkpointing: True
124
+ text_encoder_gradient_checkpointing: True
125
+
126
+ # Xformers must be installed for best memory savings and performance (< Pytorch 2.0)
127
+ enable_xformers_memory_efficient_attention: True
128
+ use_8bit_adam: True
129
+
130
+ # Use scaled dot product attention (Only available with >= Torch 2.0)
131
+ enable_torch_2_attn: True
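`configs/config_multi_videos.yaml` is the multi-video LoRA training config, presumably consumed by the training entry point listed above (`src/text2video_ghibli_style/train.py`, which wraps MotionDirector). A minimal sketch, assuming PyYAML is available, for loading it and inspecting a few of the fields documented in the file before launching a run:

```python
import yaml  # PyYAML

# Sketch only: read the training config and print a few key fields.
with open("configs/config_multi_videos.yaml", "r") as f:
    cfg = yaml.safe_load(f)

print(cfg["pretrained_model_path"])            # ./ckpts/zeroscope_v2_576w
print(cfg["output_dir"])                       # ./zeroscope_v2_576w-Ghibli-LoRA
print(cfg["train_data"]["path"],
      cfg["train_data"]["n_sample_frames"])    # ./data/ghibli/videos 24
print(cfg["lora_rank"], cfg["max_train_steps"])  # 16 5000
```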
notebooks/zeroscope_v2_576w_Ghibli_LoRA-Inference.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
notebooks/zeroscope_v2_576w_Ghibli_LoRA-Training.ipynb ADDED
@@ -0,0 +1,802 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {
7
+ "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19",
8
+ "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5",
9
+ "execution": {
10
+ "iopub.execute_input": "2025-07-16T05:27:38.872329Z",
11
+ "iopub.status.busy": "2025-07-16T05:27:38.872068Z",
12
+ "iopub.status.idle": "2025-07-16T05:29:50.846263Z",
13
+ "shell.execute_reply": "2025-07-16T05:29:50.845486Z",
14
+ "shell.execute_reply.started": "2025-07-16T05:27:38.872302Z"
15
+ },
16
+ "trusted": true
17
+ },
18
+ "outputs": [
19
+ {
20
+ "name": "stdout",
21
+ "output_type": "stream",
22
+ "text": [
23
+ "/content\n",
24
+ "Cloning into 'MotionDirector'...\n",
25
+ "remote: Enumerating objects: 657, done.\u001b[K\n",
26
+ "remote: Counting objects: 100% (163/163), done.\u001b[K\n",
27
+ "remote: Compressing objects: 100% (82/82), done.\u001b[K\n",
28
+ "remote: Total 657 (delta 108), reused 88 (delta 81), pack-reused 494 (from 1)\u001b[K\n",
29
+ "Receiving objects: 100% (657/657), 132.29 MiB | 50.34 MiB/s, done.\n",
30
+ "Resolving deltas: 100% (349/349), done.\n",
31
+ "/content/MotionDirector\n",
32
+ " Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
33
+ " Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
34
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.6/1.6 MB\u001b[0m \u001b[31m26.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
35
+ "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
36
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m87.2/87.2 kB\u001b[0m \u001b[31m5.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
37
+ "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
38
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m362.1/362.1 kB\u001b[0m \u001b[31m24.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
39
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m363.4/363.4 MB\u001b[0m \u001b[31m4.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m00:01\u001b[0m\n",
40
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m13.8/13.8 MB\u001b[0m \u001b[31m90.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m00:01\u001b[0m\n",
41
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m24.6/24.6 MB\u001b[0m \u001b[31m15.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m00:01\u001b[0m\n",
42
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m883.7/883.7 kB\u001b[0m \u001b[31m44.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
43
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m664.8/664.8 MB\u001b[0m \u001b[31m2.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m00:01\u001b[0m\n",
44
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m211.5/211.5 MB\u001b[0m \u001b[31m6.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m00:01\u001b[0m\n",
45
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m56.3/56.3 MB\u001b[0m \u001b[31m19.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m00:01\u001b[0m\n",
46
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m127.9/127.9 MB\u001b[0m \u001b[31m13.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
47
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m207.5/207.5 MB\u001b[0m \u001b[31m8.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m00:01\u001b[0m\n",
48
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m21.1/21.1 MB\u001b[0m \u001b[31m73.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m00:01\u001b[0m\n",
49
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m13.6/13.6 MB\u001b[0m \u001b[31m62.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m00:01\u001b[0m\n",
50
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m72.9/72.9 MB\u001b[0m \u001b[31m13.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m00:01\u001b[0m\n",
51
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m44.8/44.8 kB\u001b[0m \u001b[31m2.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
52
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m54.0/54.0 kB\u001b[0m \u001b[31m3.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
53
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m35.6/35.6 MB\u001b[0m \u001b[31m47.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m00:01\u001b[0m\n",
54
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m294.9/294.9 kB\u001b[0m \u001b[31m18.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
55
+ "\u001b[?25h Building wheel for deepspeed (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
56
+ " Building wheel for lora_diffusion (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
57
+ " Building wheel for loralib (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
58
+ " Building wheel for fire (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
59
+ "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n",
60
+ "bigframes 2.8.0 requires google-cloud-bigquery-storage<3.0.0,>=2.30.0, which is not installed.\n",
61
+ "google-api-core 1.34.1 requires protobuf!=3.20.0,!=3.20.1,!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<4.0.0dev,>=3.19.5, but you have protobuf 4.25.8 which is incompatible.\n",
62
+ "pandas-gbq 0.29.1 requires google-api-core<3.0.0,>=2.10.2, but you have google-api-core 1.34.1 which is incompatible.\n",
63
+ "google-cloud-storage 2.19.0 requires google-api-core<3.0.0dev,>=2.15.0, but you have google-api-core 1.34.1 which is incompatible.\n",
64
+ "dataproc-spark-connect 0.7.5 requires google-api-core>=2.19, but you have google-api-core 1.34.1 which is incompatible.\n",
65
+ "bigframes 2.8.0 requires google-cloud-bigquery[bqstorage,pandas]>=3.31.0, but you have google-cloud-bigquery 3.25.0 which is incompatible.\n",
66
+ "bigframes 2.8.0 requires rich<14,>=12.4.4, but you have rich 14.0.0 which is incompatible.\u001b[0m\u001b[31m\n",
67
+ "\u001b[0m"
68
+ ]
69
+ }
70
+ ],
71
+ "source": [
72
+ "# install packages\n",
73
+ "%cd /content\n",
74
+ "!git clone https://github.com/danhtran2mind/MotionDirector\n",
75
+ "%cd MotionDirector\n",
76
+ "!pip install -r requirements.txt -q"
77
+ ]
78
+ },
79
+ {
80
+ "cell_type": "code",
81
+ "execution_count": 2,
82
+ "metadata": {
83
+ "execution": {
84
+ "iopub.execute_input": "2025-07-16T05:29:50.848033Z",
85
+ "iopub.status.busy": "2025-07-16T05:29:50.847771Z",
86
+ "iopub.status.idle": "2025-07-16T05:29:54.955247Z",
87
+ "shell.execute_reply": "2025-07-16T05:29:54.954373Z",
88
+ "shell.execute_reply.started": "2025-07-16T05:29:50.848010Z"
89
+ },
90
+ "trusted": true
91
+ },
92
+ "outputs": [
93
+ {
94
+ "name": "stdout",
95
+ "output_type": "stream",
96
+ "text": [
97
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m235.8/235.8 kB\u001b[0m \u001b[31m6.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
98
+ "\u001b[?25h"
99
+ ]
100
+ }
101
+ ],
102
+ "source": [
103
+ "!pip install -q bitsandbytes unidecode"
104
+ ]
105
+ },
106
+ {
107
+ "cell_type": "code",
108
+ "execution_count": 3,
109
+ "metadata": {
110
+ "execution": {
111
+ "iopub.execute_input": "2025-07-16T05:29:54.956819Z",
112
+ "iopub.status.busy": "2025-07-16T05:29:54.956511Z",
113
+ "iopub.status.idle": "2025-07-16T05:29:54.963707Z",
114
+ "shell.execute_reply": "2025-07-16T05:29:54.962891Z",
115
+ "shell.execute_reply.started": "2025-07-16T05:29:54.956786Z"
116
+ },
117
+ "trusted": true
118
+ },
119
+ "outputs": [],
120
+ "source": [
121
+ "import os\n",
122
+ "import shutil\n",
123
+ "import random\n",
124
+ "\n",
125
+ "def copy_file_pairs(source_dir, dest_dir, max_pairs=20, seed=None):\n",
126
+ " if seed is not None:\n",
127
+ " random.seed(seed)\n",
128
+ " os.makedirs(dest_dir, exist_ok=True)\n",
129
+ " mp4_files = [f for f in os.listdir(source_dir) if f.endswith('.mp4')]\n",
130
+ " selected_mp4_files = random.sample(mp4_files, min(len(mp4_files), max_pairs))\n",
131
+ " for mp4 in selected_mp4_files:\n",
132
+ " base = os.path.splitext(mp4)[0]\n",
133
+ " txt = f\"{base}.txt\"\n",
134
+ " if os.path.exists(os.path.join(source_dir, txt)):\n",
135
+ " shutil.copy2(os.path.join(source_dir, mp4), os.path.join(dest_dir, mp4))\n",
136
+ " shutil.copy2(os.path.join(source_dir, txt), os.path.join(dest_dir, txt))\n",
137
+ " return len(selected_mp4_files)\n"
138
+ ]
139
+ },
140
+ {
141
+ "cell_type": "code",
142
+ "execution_count": 4,
143
+ "metadata": {
144
+ "execution": {
145
+ "iopub.execute_input": "2025-07-16T05:29:54.965605Z",
146
+ "iopub.status.busy": "2025-07-16T05:29:54.965374Z",
147
+ "iopub.status.idle": "2025-07-16T05:30:00.766653Z",
148
+ "shell.execute_reply": "2025-07-16T05:30:00.766019Z",
149
+ "shell.execute_reply.started": "2025-07-16T05:29:54.965578Z"
150
+ },
151
+ "trusted": true
152
+ },
153
+ "outputs": [
154
+ {
155
+ "data": {
156
+ "application/vnd.jupyter.widget-view+json": {
157
+ "model_id": "2b0f51df7a5047bd8d404fca30add463",
158
+ "version_major": 2,
159
+ "version_minor": 0
160
+ },
161
+ "text/plain": [
162
+ "Fetching 2 files: 0%| | 0/2 [00:00<?, ?it/s]"
163
+ ]
164
+ },
165
+ "metadata": {},
166
+ "output_type": "display_data"
167
+ },
168
+ {
169
+ "data": {
170
+ "application/vnd.jupyter.widget-view+json": {
171
+ "model_id": "b800f95e15fb4064854902602aa3a3dd",
172
+ "version_major": 2,
173
+ "version_minor": 0
174
+ },
175
+ "text/plain": [
176
+ ".gitattributes: 0.00B [00:00, ?B/s]"
177
+ ]
178
+ },
179
+ "metadata": {},
180
+ "output_type": "display_data"
181
+ },
182
+ {
183
+ "data": {
184
+ "application/vnd.jupyter.widget-view+json": {
185
+ "model_id": "6a6cc4ba523f4a18adbc9ffaa3525340",
186
+ "version_major": 2,
187
+ "version_minor": 0
188
+ },
189
+ "text/plain": [
190
+ "studio_ghibli_wan14b_t2v_v01_dataset.zip: 0%| | 0.00/300M [00:00<?, ?B/s]"
191
+ ]
192
+ },
193
+ "metadata": {},
194
+ "output_type": "display_data"
195
+ }
196
+ ],
197
+ "source": [
198
+ "import os\n",
199
+ "\n",
200
+ "from huggingface_hub import snapshot_download\n",
201
+ "\n",
202
+ "# Create directory if it doesn't exist\n",
203
+ "os.makedirs(\"data/ghibli/raw\", exist_ok=True)\n",
204
+ "\n",
205
+ "# Download the dataset using snapshot_download\n",
206
+ "snapshot_download(repo_id=\"raymondt/ghibi_t2v\", \n",
207
+ " local_dir=\"data/ghibli/raw\", \n",
208
+ " repo_type=\"dataset\")\n",
209
+ "\n",
210
+ "# Assuming the zip file is downloaded, unzip it to the target directory\n",
211
+ "import zipfile\n",
212
+ "zip_path = \"data/ghibli/raw/studio_ghibli_wan14b_t2v_v01_dataset.zip\"\n",
213
+ "extract_path = \"data/ghibli/raw\"\n",
214
+ "\n",
215
+ "with zipfile.ZipFile(zip_path, 'r') as zip_ref:\n",
216
+ " zip_ref.extractall(extract_path)\n",
217
+ "\n"
218
+ ]
219
+ },
220
+ {
221
+ "cell_type": "code",
222
+ "execution_count": 5,
223
+ "metadata": {
224
+ "execution": {
225
+ "iopub.execute_input": "2025-07-16T05:30:00.767663Z",
226
+ "iopub.status.busy": "2025-07-16T05:30:00.767419Z",
227
+ "iopub.status.idle": "2025-07-16T05:30:01.056828Z",
228
+ "shell.execute_reply": "2025-07-16T05:30:01.056142Z",
229
+ "shell.execute_reply.started": "2025-07-16T05:30:00.767643Z"
230
+ },
231
+ "trusted": true
232
+ },
233
+ "outputs": [
234
+ {
235
+ "name": "stdout",
236
+ "output_type": "stream",
237
+ "text": [
238
+ "Copied 240 pairs to data/ghibli/videos\n"
239
+ ]
240
+ }
241
+ ],
242
+ "source": [
243
+ "# Copy the videos directory to the desired location\n",
244
+ "source = \"data/ghibli/raw/videos/1920x1040\"\n",
245
+ "dest = \"data/ghibli/videos\"\n",
246
+ "\n",
247
+ "copied = copy_file_pairs(source, dest, max_pairs=240, seed=42)\n",
248
+ "print(f\"Copied {copied} pairs to {dest}\")"
249
+ ]
250
+ },
251
+ {
252
+ "cell_type": "code",
253
+ "execution_count": 6,
254
+ "metadata": {
255
+ "execution": {
256
+ "iopub.execute_input": "2025-07-16T05:30:01.057811Z",
257
+ "iopub.status.busy": "2025-07-16T05:30:01.057583Z",
258
+ "iopub.status.idle": "2025-07-16T05:30:29.547286Z",
259
+ "shell.execute_reply": "2025-07-16T05:30:29.546428Z",
260
+ "shell.execute_reply.started": "2025-07-16T05:30:01.057784Z"
261
+ },
262
+ "trusted": true
263
+ },
264
+ "outputs": [
265
+ {
266
+ "data": {
267
+ "application/vnd.jupyter.widget-view+json": {
268
+ "model_id": "98bbd6ea501745bcabdb7f89bdb7af95",
269
+ "version_major": 2,
270
+ "version_minor": 0
271
+ },
272
+ "text/plain": [
273
+ "Fetching 16 files: 0%| | 0/16 [00:00<?, ?it/s]"
274
+ ]
275
+ },
276
+ "metadata": {},
277
+ "output_type": "display_data"
278
+ },
279
+ {
280
+ "data": {
281
+ "application/vnd.jupyter.widget-view+json": {
282
+ "model_id": "6b20bb02614d4dfdb037484f1bbf06ed",
283
+ "version_major": 2,
284
+ "version_minor": 0
285
+ },
286
+ "text/plain": [
287
+ "merges.txt: 0.00B [00:00, ?B/s]"
288
+ ]
289
+ },
290
+ "metadata": {},
291
+ "output_type": "display_data"
292
+ },
293
+ {
294
+ "data": {
295
+ "application/vnd.jupyter.widget-view+json": {
296
+ "model_id": "d7d127ca77964c7a9ab58b55dced9701",
297
+ "version_major": 2,
298
+ "version_minor": 0
299
+ },
300
+ "text/plain": [
301
+ "model_index.json: 0%| | 0.00/384 [00:00<?, ?B/s]"
302
+ ]
303
+ },
304
+ "metadata": {},
305
+ "output_type": "display_data"
306
+ },
307
+ {
308
+ "data": {
309
+ "application/vnd.jupyter.widget-view+json": {
310
+ "model_id": "1b9272edd1784c12baaa2826a1effcd3",
311
+ "version_major": 2,
312
+ "version_minor": 0
313
+ },
314
+ "text/plain": [
315
+ "README.md: 0.00B [00:00, ?B/s]"
316
+ ]
317
+ },
318
+ "metadata": {},
319
+ "output_type": "display_data"
320
+ },
321
+ {
322
+ "data": {
323
+ "application/vnd.jupyter.widget-view+json": {
324
+ "model_id": "8f7a542514544640af6d140501a7a05b",
325
+ "version_major": 2,
326
+ "version_minor": 0
327
+ },
328
+ "text/plain": [
329
+ "special_tokens_map.json: 0%| | 0.00/460 [00:00<?, ?B/s]"
330
+ ]
331
+ },
332
+ "metadata": {},
333
+ "output_type": "display_data"
334
+ },
335
+ {
336
+ "data": {
337
+ "application/vnd.jupyter.widget-view+json": {
338
+ "model_id": "d33b170a93dd430386b8bd5f045f3419",
339
+ "version_major": 2,
340
+ "version_minor": 0
341
+ },
342
+ "text/plain": [
343
+ "scheduler_config.json: 0%| | 0.00/465 [00:00<?, ?B/s]"
344
+ ]
345
+ },
346
+ "metadata": {},
347
+ "output_type": "display_data"
348
+ },
349
+ {
350
+ "data": {
351
+ "application/vnd.jupyter.widget-view+json": {
352
+ "model_id": "d64f3a8914004c2283450052563fa0cd",
353
+ "version_major": 2,
354
+ "version_minor": 0
355
+ },
356
+ "text/plain": [
357
+ ".gitattributes: 0.00B [00:00, ?B/s]"
358
+ ]
359
+ },
360
+ "metadata": {},
361
+ "output_type": "display_data"
362
+ },
363
+ {
364
+ "data": {
365
+ "application/vnd.jupyter.widget-view+json": {
366
+ "model_id": "7909289ad6a747f4a6fbd81db836e7b5",
367
+ "version_major": 2,
368
+ "version_minor": 0
369
+ },
370
+ "text/plain": [
371
+ "config.json: 0%| | 0.00/609 [00:00<?, ?B/s]"
372
+ ]
373
+ },
374
+ "metadata": {},
375
+ "output_type": "display_data"
376
+ },
377
+ {
378
+ "data": {
379
+ "application/vnd.jupyter.widget-view+json": {
380
+ "model_id": "d562fad6588646f3929d7dfd805e001e",
381
+ "version_major": 2,
382
+ "version_minor": 0
383
+ },
384
+ "text/plain": [
385
+ "pytorch_model.bin: 0%| | 0.00/681M [00:00<?, ?B/s]"
386
+ ]
387
+ },
388
+ "metadata": {},
389
+ "output_type": "display_data"
390
+ },
391
+ {
392
+ "data": {
393
+ "application/vnd.jupyter.widget-view+json": {
394
+ "model_id": "0b4361967a684e7c99075ba14b7ec864",
395
+ "version_major": 2,
396
+ "version_minor": 0
397
+ },
398
+ "text/plain": [
399
+ "config.json: 0%| | 0.00/727 [00:00<?, ?B/s]"
400
+ ]
401
+ },
402
+ "metadata": {},
403
+ "output_type": "display_data"
404
+ },
405
+ {
406
+ "data": {
407
+ "application/vnd.jupyter.widget-view+json": {
408
+ "model_id": "55a17f9d827c4656b1ae272d679b0a26",
409
+ "version_major": 2,
410
+ "version_minor": 0
411
+ },
412
+ "text/plain": [
413
+ "tokenizer_config.json: 0%| | 0.00/737 [00:00<?, ?B/s]"
414
+ ]
415
+ },
416
+ "metadata": {},
417
+ "output_type": "display_data"
418
+ },
419
+ {
420
+ "data": {
421
+ "application/vnd.jupyter.widget-view+json": {
422
+ "model_id": "d5ebea88cba841d0a117156384a7af6d",
423
+ "version_major": 2,
424
+ "version_minor": 0
425
+ },
426
+ "text/plain": [
427
+ "config.json: 0%| | 0.00/636 [00:00<?, ?B/s]"
428
+ ]
429
+ },
430
+ "metadata": {},
431
+ "output_type": "display_data"
432
+ },
433
+ {
434
+ "data": {
435
+ "application/vnd.jupyter.widget-view+json": {
436
+ "model_id": "f7bc8fb35a9f44deb336bb1b109298ee",
437
+ "version_major": 2,
438
+ "version_minor": 0
439
+ },
440
+ "text/plain": [
441
+ "open_clip_pytorch_model.bin: 0%| | 0.00/1.97G [00:00<?, ?B/s]"
442
+ ]
443
+ },
444
+ "metadata": {},
445
+ "output_type": "display_data"
446
+ },
447
+ {
448
+ "data": {
449
+ "application/vnd.jupyter.widget-view+json": {
450
+ "model_id": "2ba9dca7dfb2455aaf4627d43390d550",
451
+ "version_major": 2,
452
+ "version_minor": 0
453
+ },
454
+ "text/plain": [
455
+ "diffusion_pytorch_model.bin: 0%| | 0.00/2.82G [00:00<?, ?B/s]"
456
+ ]
457
+ },
458
+ "metadata": {},
459
+ "output_type": "display_data"
460
+ },
461
+ {
462
+ "data": {
463
+ "application/vnd.jupyter.widget-view+json": {
464
+ "model_id": "aab2fd449c9b44a58ff281b30deb6e37",
465
+ "version_major": 2,
466
+ "version_minor": 0
467
+ },
468
+ "text/plain": [
469
+ "vocab.json: 0.00B [00:00, ?B/s]"
470
+ ]
471
+ },
472
+ "metadata": {},
473
+ "output_type": "display_data"
474
+ },
475
+ {
476
+ "data": {
477
+ "application/vnd.jupyter.widget-view+json": {
478
+ "model_id": "6aca918dcb4f4b4b9f6804859ea65ab6",
479
+ "version_major": 2,
480
+ "version_minor": 0
481
+ },
482
+ "text/plain": [
483
+ "text2video_pytorch_model.pth: 0%| | 0.00/2.82G [00:00<?, ?B/s]"
484
+ ]
485
+ },
486
+ "metadata": {},
487
+ "output_type": "display_data"
488
+ },
489
+ {
490
+ "data": {
491
+ "application/vnd.jupyter.widget-view+json": {
492
+ "model_id": "bf61eb3a364e455b89379e66e8f304d7",
493
+ "version_major": 2,
494
+ "version_minor": 0
495
+ },
496
+ "text/plain": [
497
+ "diffusion_pytorch_model.bin: 0%| | 0.00/167M [00:00<?, ?B/s]"
498
+ ]
499
+ },
500
+ "metadata": {},
501
+ "output_type": "display_data"
502
+ },
503
+ {
504
+ "data": {
505
+ "text/plain": [
506
+ "'/content/MotionDirector/ckpts/zeroscope_v2_576w'"
507
+ ]
508
+ },
509
+ "execution_count": 6,
510
+ "metadata": {},
511
+ "output_type": "execute_result"
512
+ }
513
+ ],
514
+ "source": [
515
+ "from huggingface_hub import snapshot_download\n",
516
+ "# Download ZeroScope model snapshot\n",
517
+ "repo_id = \"cerspense/zeroscope_v2_576w\"\n",
518
+ "snapshot_download(repo_id=repo_id,\n",
519
+ " local_dir=\"./ckpts/zeroscope_v2_576w\")"
520
+ ]
521
+ },
522
+ {
523
+ "cell_type": "code",
524
+ "execution_count": 9,
525
+ "metadata": {
526
+ "execution": {
527
+ "iopub.execute_input": "2025-07-15T14:12:26.903891Z",
528
+ "iopub.status.busy": "2025-07-15T14:12:26.903623Z",
529
+ "iopub.status.idle": "2025-07-15T14:12:26.907531Z",
530
+ "shell.execute_reply": "2025-07-15T14:12:26.906880Z",
531
+ "shell.execute_reply.started": "2025-07-15T14:12:26.903873Z"
532
+ },
533
+ "trusted": true
534
+ },
535
+ "outputs": [],
536
+ "source": [
537
+ "import torch\n",
538
+ "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
539
+ "# device"
540
+ ]
541
+ },
542
+ {
543
+ "cell_type": "code",
544
+ "execution_count": null,
545
+ "metadata": {
546
+ "execution": {
547
+ "iopub.execute_input": "2025-07-16T05:35:53.783879Z",
548
+ "iopub.status.busy": "2025-07-16T05:35:53.783504Z",
549
+ "iopub.status.idle": "2025-07-16T05:35:53.791697Z",
550
+ "shell.execute_reply": "2025-07-16T05:35:53.791129Z",
551
+ "shell.execute_reply.started": "2025-07-16T05:35:53.783849Z"
552
+ },
553
+ "trusted": true
554
+ },
555
+ "outputs": [
556
+ {
557
+ "name": "stdout",
558
+ "output_type": "stream",
559
+ "text": [
560
+ "Overwriting configs/config_multi_videos.yaml\n"
561
+ ]
562
+ }
563
+ ],
564
+ "source": [
565
+ "%%writefile configs/config_multi_videos.yaml\n",
566
+ "# Pretrained diffusers model path.\n",
567
+ "pretrained_model_path: \"./ckpts/zeroscope_v2_576w\"\n",
568
+ "# pretrained_model_path: \"./ckpts/text-to-video-ms-1.7b\"\n",
569
+ "# The folder where your training outputs will be placed.\n",
570
+ "output_dir: \"./zeroscope_v2_576w-Ghibli-LoRA\"\n",
571
+ "# resume_step: 500\n",
572
+ "# resume_from_checkpoint: \"./zeroscope_v2_576w-Scenery_Anime_Bright-lora/train_2025-07-10T13-46-57\"\n",
573
+ "# lora_path: \"zeroscope_v2_576w-Scenery_Anime_Bright-lora/checkpoint-500\" # This argument is used for training resumption\n",
574
+ "# lora_path: zeroscope_v2_576w-Ghibli-LoRA/train_2025-07-13T06-46-47/checkpoint-200\n",
575
+ "\n",
576
+ "dataset_types:\n",
577
+ " - 'folder'\n",
578
+ "\n",
579
+ "# Caches the latents (Frames-Image -> VAE -> Latent) to a HDD or SDD.\n",
580
+ "# The latents will be saved under your training folder, and loaded automatically for training.\n",
581
+ "# This both saves memory and speeds up training and takes very little disk space.\n",
582
+ "cache_latents: True\n",
583
+ "\n",
584
+ "\n",
585
+ "# If you have cached latents set to `True` and have a directory of cached latents,\n",
586
+ "# you can skip the caching process and load previously saved ones.\n",
587
+ "cached_latent_dir: null #/path/to/cached_latents\n",
588
+ "# cached_latent_dir: zeroscope_v2_576w-Ghibli-LoRA/train_2025-07-13T06-46-47/cached_latents\n",
589
+ "\n",
590
+ "# Use LoRA for the UNET model.\n",
591
+ "use_unet_lora: True\n",
592
+ "\n",
593
+ "# LoRA Dropout. This parameter adds the probability of randomly zeros out elements. Helps prevent overfitting.\n",
594
+ "# See: https://pytorch.org/docs/stable/generated/torch.nn.Dropout.html\n",
595
+ "lora_unet_dropout: 0.1\n",
596
+ "\n",
597
+ "# Choose whether or not ito save the full pretrained model weights for both checkpoints and after training.\n",
598
+ "# The only time you want this off is if you're doing full LoRA training.\n",
599
+ "save_pretrained_model: True\n",
600
+ "# save_pretrained_model: True\n",
601
+ "\n",
602
+ "# The rank for LoRA training. With ModelScope, the maximum should be 1024.\n",
603
+ "# VRAM increases with higher rank, lower when decreased.\n",
604
+ "lora_rank: 16\n",
605
+ "\n",
606
+ "# Training data parameters\n",
607
+ "train_data:\n",
608
+ " # 'multiple videos'\n",
609
+ " path: \"./data/ghibli/videos\"\n",
610
+ " # The width and height in which you want your training data to be resized to.\n",
611
+ " width: 384\n",
612
+ " height: 384\n",
613
+ "\n",
614
+ " # This will find the closest aspect ratio to your input width and height.\n",
615
+ " # For example, 512x512 width and height with a video of resolution 1280x720 will be resized to 512x256\n",
616
+ " use_bucketing: True\n",
617
+ " gradient_accumulation_steps: 2\n",
618
+ " batch_size: 1\n",
619
+ " # The start frame index where your videos should start (Leave this at one for json and folder based training).\n",
620
+ " sample_start_idx: 1\n",
621
+ "\n",
622
+ " # Used for 'folder'. The rate at which your frames are sampled. Does nothing for 'json' and 'single_video' dataset.\n",
623
+ " fps: 16\n",
624
+ "\n",
625
+ " # For 'single_video' and 'json'. The number of frames to \"step\" (1,2,3,4) (frame_step=2) -> (1,3,5,7, ...).\n",
626
+ " frame_step: 1\n",
627
+ "\n",
628
+ " # The number of frames to sample. The higher this number, the higher the VRAM (acts similar to batch size).\n",
629
+ " n_sample_frames: 24\n",
630
+ "\n",
631
+ " # The prompt when using a a single video file\n",
632
+ " # fallback_prompt: \"A person is riding a bicycle.\"\n",
633
+ "\n",
634
+ "# Validation data parameters.\n",
635
+ "validation_data:\n",
636
+ " # A custom prompt that is different from your training dataset.\n",
637
+ " prompt:\n",
638
+ " - \"Studio Ghibli style. The video showcases a vibrant and lively scene set in the early.\"\n",
639
+ " - \"Studio Ghibli style. A woman with black hair is holding a gun in her hand.\"\n",
640
+ "\n",
641
+ " # Whether or not to sample preview during training (Requires more VRAM).\n",
642
+ " # sample_preview: True\n",
643
+ " sample_preview: False\n",
644
+ "\n",
645
+ " # The number of frames to sample during validation.\n",
646
+ " num_frames: 24\n",
647
+ "\n",
648
+ " # Height and width of validation sample.\n",
649
+ " width: 384\n",
650
+ " height: 384\n",
651
+ "\n",
652
+ " # Number of inference steps when generating the video.\n",
653
+ " num_inference_steps: 15\n",
654
+ "\n",
655
+ " # CFG scale\n",
656
+ " guidance_scale: 12\n",
657
+ "\n",
658
+ " # scale of spatial LoRAs, default is 0\n",
659
+ " spatial_scale: 0\n",
660
+ "\n",
661
+ " # scale of noise prior, i.e. the scale of inversion noises\n",
662
+ " noise_prior: 0\n",
663
+ "\n",
664
+ "use_offset_noise: False\n",
665
+ "offset_noise_strength: 0.\n",
666
+ "\n",
667
+ "# Learning rate for AdamW\n",
668
+ "learning_rate: 5e-4\n",
669
+ "\n",
670
+ "# Weight decay. Higher = more regularization. Lower = closer to dataset.\n",
671
+ "adam_weight_decay: 1e-4\n",
672
+ "\n",
673
+ "# Maximum number of train steps. Model is saved after training.\n",
674
+ "max_train_steps: 5000\n",
675
+ "\n",
676
+ "# Saves a model every nth step.\n",
677
+ "checkpointing_steps: 5000\n",
678
+ "\n",
679
+ "# How many steps to do for validation if sample_preview is enabled.\n",
680
+ "validation_steps: 5000\n",
681
+ "\n",
682
+ "# Whether or not we want to use mixed precision with accelerate\n",
683
+ "mixed_precision: \"fp16\"\n",
684
+ "# mixed_precision: \"no\"\n",
685
+ "\n",
686
+ "# Trades VRAM usage for speed. You lose roughly 20% of training speed, but save a lot of VRAM.\n",
687
+ "# If you need to save more VRAM, it can also be enabled for the text encoder, but reduces speed x2.\n",
688
+ "gradient_checkpointing: True\n",
689
+ "text_encoder_gradient_checkpointing: True\n",
690
+ "\n",
691
+ "# Xformers must be installed for best memory savings and performance (< Pytorch 2.0)\n",
692
+ "enable_xformers_memory_efficient_attention: True\n",
693
+ "use_8bit_adam: True\n",
694
+ "\n",
695
+ "# Use scaled dot product attention (Only available with >= Torch 2.0)\n",
696
+ "enable_torch_2_attn: True"
697
+ ]
698
+ },
699
+ {
700
+ "cell_type": "code",
701
+ "execution_count": 8,
702
+ "metadata": {
703
+ "execution": {
704
+ "iopub.execute_input": "2025-07-16T05:36:02.733520Z",
705
+ "iopub.status.busy": "2025-07-16T05:36:02.732856Z",
706
+ "iopub.status.idle": "2025-07-16T16:01:06.692095Z",
707
+ "shell.execute_reply": "2025-07-16T16:01:06.688451Z",
708
+ "shell.execute_reply.started": "2025-07-16T05:36:02.733496Z"
709
+ },
710
+ "trusted": true
711
+ },
712
+ "outputs": [
713
+ {
714
+ "name": "stdout",
715
+ "output_type": "stream",
716
+ "text": [
717
+ "2025-07-16 05:36:13.391674: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n",
718
+ "WARNING: All log messages before absl::InitializeLog() is called are written to STDERR\n",
719
+ "E0000 00:00:1752644173.574411 316 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n",
720
+ "E0000 00:00:1752644173.625685 316 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n",
721
+ "Initializing the conversion map\n",
722
+ "{'rescale_betas_zero_snr', 'timestep_spacing'} was not found in config. Values will be initialized to default values.\n",
723
+ "An error occurred while trying to fetch ./ckpts/zeroscope_v2_576w: Error no file named diffusion_pytorch_model.safetensors found in directory ./ckpts/zeroscope_v2_576w.\n",
724
+ "Defaulting to unsafe serialization. Pass `allow_pickle=False` to raise an error instead.\n",
725
+ "{'latents_mean', 'use_post_quant_conv', 'mid_block_add_attention', 'force_upcast', 'use_quant_conv', 'shift_factor', 'latents_std'} was not found in config. Values will be initialized to default values.\n",
726
+ "All model checkpoint weights were used when initializing AutoencoderKL.\n",
727
+ "\n",
728
+ "All the weights of AutoencoderKL were initialized from the model checkpoint at ./ckpts/zeroscope_v2_576w.\n",
729
+ "If your task is similar to the task the model of the checkpoint was trained on, you can already use AutoencoderKL for predictions without further training.\n",
730
+ "An error occurred while trying to fetch ./ckpts/zeroscope_v2_576w: Error no file named diffusion_pytorch_model.safetensors found in directory ./ckpts/zeroscope_v2_576w.\n",
731
+ "Defaulting to unsafe serialization. Pass `allow_pickle=False` to raise an error instead.\n",
732
+ "All model checkpoint weights were used when initializing UNet3DConditionModel.\n",
733
+ "\n",
734
+ "All the weights of UNet3DConditionModel were initialized from the model checkpoint at ./ckpts/zeroscope_v2_576w.\n",
735
+ "If your task is similar to the task the model of the checkpoint was trained on, you can already use UNet3DConditionModel for predictions without further training.\n",
736
+ "Could not enable memory efficient attention for xformers or Torch 2.0.\n",
737
+ "Loading pipeline components...: 0%| | 0/5 [00:00<?, ?it/s]Loaded text_encoder as CLIPTextModel from `text_encoder` subfolder of ./ckpts/zeroscope_v2_576w.\n",
738
+ "Loading pipeline components...: 40%|█████▏ | 2/5 [00:00<00:01, 2.52it/s]{'rescale_betas_zero_snr', 'timestep_spacing'} was not found in config. Values will be initialized to default values.\n",
739
+ "Loaded scheduler as DDIMScheduler from `scheduler` subfolder of ./ckpts/zeroscope_v2_576w.\n",
740
+ "Loaded tokenizer as CLIPTokenizer from `tokenizer` subfolder of ./ckpts/zeroscope_v2_576w.\n",
741
+ "Loading pipeline components...: 100%|█████████████| 5/5 [00:00<00:00, 5.76it/s]\n",
742
+ "Expected types for unet: (<class 'diffusers.models.unets.unet_3d_condition.UNet3DConditionModel'>,), got <class 'models.unet_3d_condition.UNet3DConditionModel'>.\n",
743
+ "The TextToVideoSDPipeline has been deprecated and will not receive bug fixes or feature updates after Diffusers version 0.33.1. \n",
744
+ "Caching Latents.: 100%|███████████████████████| 240/240 [09:41<00:00, 2.42s/it]\n",
745
+ "Lora successfully injected into UNet3DConditionModel.\n",
746
+ "Lora successfully injected into UNet3DConditionModel.\n",
747
+ "unet._set_gradient_checkpointing(unet_enable)\n",
748
+ "Steps: 0%| | 0/5000 [00:00<?, ?it/s]1942 params have been unfrozen for training.\n",
749
+ "/usr/local/lib/python3.11/dist-packages/diffusers/models/transformers/transformer_2d.py:35: FutureWarning: `Transformer2DModelOutput` is deprecated and will be removed in version 1.0.0. Importing `Transformer2DModelOutput` from `diffusers.models.transformer_2d` is deprecated and this will be removed in a future version. Please use `from diffusers.models.modeling_outputs import Transformer2DModelOutput`, instead.\n",
750
+ " deprecate(\"Transformer2DModelOutput\", \"1.0.0\", deprecation_message)\n",
751
+ "Steps: 100%|█████████████████████████████| 5000/5000 [10:14:13<00:00, 7.28s/it][2025-07-16 16:00:44,146] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n",
752
+ "[2025-07-16 16:00:46,892] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False\n",
753
+ "\n",
754
+ "Loading pipeline components...: 0%| | 0/5 [00:00<?, ?it/s]\u001b[A{'rescale_betas_zero_snr', 'timestep_spacing'} was not found in config. Values will be initialized to default values.\n",
755
+ "Loaded scheduler as DDIMScheduler from `scheduler` subfolder of ./ckpts/zeroscope_v2_576w.\n",
756
+ "Loaded tokenizer as CLIPTokenizer from `tokenizer` subfolder of ./ckpts/zeroscope_v2_576w.\n",
757
+ "Loading pipeline components...: 100%|█████████████| 5/5 [00:00<00:00, 50.50it/s]\n",
758
+ "Expected types for unet: (<class 'diffusers.models.unets.unet_3d_condition.UNet3DConditionModel'>,), got <class 'models.unet_3d_condition.UNet3DConditionModel'>.\n",
759
+ "The TextToVideoSDPipeline has been deprecated and will not receive bug fixes or feature updates after Diffusers version 0.33.1. \n",
760
+ "Configuration saved in ./zeroscope_v2_576w-Ghibli-LoRA/train_2025-07-16T05-36-26/checkpoint-5000/vae/config.json\n",
761
+ "Model weights saved in ./zeroscope_v2_576w-Ghibli-LoRA/train_2025-07-16T05-36-26/checkpoint-5000/vae/diffusion_pytorch_model.safetensors\n"
762
+ ]
763
+ }
764
+ ],
765
+ "source": [
766
+ "# Train\n",
767
+ "!export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True\n",
768
+ "!python main_train.py --config ./configs/config_multi_videos.yaml"
769
+ ]
770
+ }
771
+ ],
772
+ "metadata": {
773
+ "kaggle": {
774
+ "accelerator": "gpu",
775
+ "dataSources": [],
776
+ "dockerImageVersionId": 31090,
777
+ "isGpuEnabled": true,
778
+ "isInternetEnabled": true,
779
+ "language": "python",
780
+ "sourceType": "notebook"
781
+ },
782
+ "kernelspec": {
783
+ "display_name": "Python 3",
784
+ "language": "python",
785
+ "name": "python3"
786
+ },
787
+ "language_info": {
788
+ "codemirror_mode": {
789
+ "name": "ipython",
790
+ "version": 3
791
+ },
792
+ "file_extension": ".py",
793
+ "mimetype": "text/x-python",
794
+ "name": "python",
795
+ "nbconvert_exporter": "python",
796
+ "pygments_lexer": "ipython3",
797
+ "version": "3.11.13"
798
+ }
799
+ },
800
+ "nbformat": 4,
801
+ "nbformat_minor": 4
802
+ }
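As a quick sanity check before launching training, the configuration written by the `%%writefile` cell above can be loaded and inspected programmatically. The sketch below is illustrative and not part of the notebook; it assumes `configs/config_multi_videos.yaml` exists on disk and that `omegaconf` (listed in the requirements below) is installed.

```python
# Minimal sketch: load the training config and print a few key fields.
from omegaconf import OmegaConf

cfg = OmegaConf.load("configs/config_multi_videos.yaml")
print("pretrained_model_path:", cfg.pretrained_model_path)
print("output_dir:", cfg.output_dir)
print("lora_rank:", cfg.lora_rank)
print("train resolution:", cfg.train_data.width, "x", cfg.train_data.height)
print("max_train_steps:", cfg.max_train_steps)
```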
requirements/requirements.txt ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ accelerate>=1.7.0
2
+ deepspeed
3
+ diffusers>=0.33.0
4
+ huggingface-hub
5
+ git+https://github.com/cloneofsimo/lora.git
6
+ git+https://github.com/microsoft/LoRA
7
+ loralib
8
+ numpy==1.26.4
9
+ tqdm
10
+ einops
11
+ imageio
12
+ imageio-ffmpeg
13
+ torch>=2.6.0
14
+ torchvision>=0.21.0
15
+ torchaudio
16
+ transformers>=4.51.3
17
+ decord
18
+ safetensors
19
+ omegaconf
20
+ opencv-python
21
+ pydantic
22
+ triton
23
+ compel
24
+ peft>=0.15.0
25
+ pytorch_lightning>=2.5.0
26
+ bitsandbytes
requirements/requirements_compatible.txt ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ deepspeed==0.17.3
2
+ diffusers==0.34.0
3
+ huggingface_hub==0.34.1
4
+ loralib==0.1.2
5
+ numpy==1.26.4
6
+ tqdm==4.67.1
7
+ einops==0.8.1
8
+ imageio==2.37.0
9
+ imageio_ffmpeg==0.6.0
10
+ torch==2.6.0
11
+ torchvision==0.21.0
12
+ torchaudio==2.6.0
13
+ transformers==4.54.0
14
+ decord==0.6.0
15
+ safetensors==0.5.3
16
+ omegaconf==2.3.0
17
+ opencv-python==4.11.0.86
18
+ pydantic==2.11.7
19
+ triton==3.2.0
20
+ compel==2.1.1
21
+ peft==0.16.0
22
+ pytorch_lightning==2.5.2
23
+ bitsandbytes==0.46.1
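To confirm that an environment actually matches these pins, a small optional check using only the standard library can be run. This is a sketch, not a tool shipped with the repo; the file path below assumes it is run from the repository root.

```python
# Sketch: compare installed package versions against the pins in
# requirements/requirements_compatible.txt; lines without "==" are skipped.
from importlib.metadata import version, PackageNotFoundError

def check_pins(path="requirements/requirements_compatible.txt"):
    with open(path) as fh:
        for line in fh:
            line = line.strip()
            if not line or line.startswith("#") or "==" not in line:
                continue
            name, pinned = line.split("==", 1)
            try:
                installed = version(name)
            except PackageNotFoundError:
                installed = "not installed"
            status = "OK" if installed == pinned else "MISMATCH"
            print(f"{name}: pinned {pinned}, installed {installed} -> {status}")

if __name__ == "__main__":
    check_pins()
```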
scripts/download_ckpts.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from huggingface_hub import HfApi, snapshot_download
2
+ import os
3
+ import torch
4
+ import argparse
5
+
6
+ def download_checkpoint(repo_id, save_path, repo_type="model"):
7
+ """
8
+ Download a model checkpoint from Hugging Face Hub to the specified local directory.
9
+
10
+ Args:
11
+ repo_id (str): The repository ID on Hugging Face Hub
12
+ save_path (str): Local directory path to save the checkpoint
13
+ repo_type (str): Type of repository (default: "model")
14
+ """
15
+ # Initialize Hugging Face API
16
+ api = HfApi()
17
+
18
+ # Create the directory if it doesn't exist
19
+ os.makedirs(save_path, exist_ok=True)
20
+
21
+ # Download the checkpoint
22
+ print(f"Downloading {repo_id} to {save_path}...")
23
+ snapshot_download(repo_id=repo_id, repo_type=repo_type, local_dir=save_path)
24
+ print(f"Successfully downloaded {repo_id}")
25
+
26
+ def main(args):
27
+ # Define checkpoint configurations
28
+ checkpoints = [
29
+ {
30
+ "repo_id": args.repo_id,
31
+ "save_path": args.save_path,
32
+ "repo_type": args.repo_type
33
+ }
34
+ ]
35
+
36
+ # Add LoRA checkpoint if provided
37
+ if args.lora_repo_id and args.lora_save_path:
38
+ checkpoints.append({
39
+ "repo_id": args.lora_repo_id,
40
+ "save_path": args.lora_save_path,
41
+ "repo_type": args.lora_repo_type
42
+ })
43
+
44
+ # Download each checkpoint
45
+ for checkpoint in checkpoints:
46
+ download_checkpoint(
47
+ repo_id=checkpoint["repo_id"],
48
+ save_path=checkpoint["save_path"],
49
+ repo_type=checkpoint["repo_type"]
50
+ )
51
+
52
+ if __name__ == "__main__":
53
+ # Set up argument parser
54
+ parser = argparse.ArgumentParser(description="Download model checkpoints from Hugging Face Hub")
55
+ parser.add_argument(
56
+ "--repo_id",
57
+ type=str,
58
+ default="cerspense/zeroscope_v2_576w",
59
+ help="Hugging Face repository ID for the checkpoint"
60
+ )
61
+ parser.add_argument(
62
+ "--save_path",
63
+ type=str,
64
+ default="./ckpts/zeroscope_v2_576w",
65
+ help="Local directory to save the checkpoint"
66
+ )
67
+ parser.add_argument(
68
+ "--repo_type",
69
+ type=str,
70
+ default="model",
71
+ help="Type of repository (e.g., model, dataset)"
72
+ )
73
+ parser.add_argument(
74
+ "--lora_repo_id",
75
+ type=str,
76
+ default="danhtran2mind/zeroscope_v2_576w-Ghibli-LoRA",
77
+ help="Hugging Face repository ID for the LoRA checkpoint"
78
+ )
79
+ parser.add_argument(
80
+ "--lora_save_path",
81
+ type=str,
82
+ default="./ckpts/zeroscope_v2_576w-Ghibli-LoRA",
83
+ help="Local directory to save the LoRA checkpoint"
84
+ )
85
+ parser.add_argument(
86
+ "--lora_repo_type",
87
+ type=str,
88
+ default="model",
89
+ help="Type of repository for the LoRA checkpoint (e.g., model, dataset)"
90
+ )
91
+
92
+ # Parse arguments
93
+ args = parser.parse_args()
94
+
95
+ # Call main with parsed arguments
96
+ main(args)
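Besides the CLI shown above, `download_checkpoint()` can be reused directly from Python. A minimal sketch, assuming the repository root is on the import path (e.g. the code is run from the repo root); the repository IDs are the script's own defaults.

```python
# Sketch: fetch the base model and the LoRA weights programmatically
# by reusing download_checkpoint() from scripts/download_ckpts.py.
from scripts.download_ckpts import download_checkpoint

download_checkpoint(
    repo_id="cerspense/zeroscope_v2_576w",
    save_path="./ckpts/zeroscope_v2_576w",
)
download_checkpoint(
    repo_id="danhtran2mind/zeroscope_v2_576w-Ghibli-LoRA",
    save_path="./ckpts/zeroscope_v2_576w-Ghibli-LoRA",
)
```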
scripts/process_dataset.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import shutil
3
+ import random
4
+ import argparse
5
+ from huggingface_hub import snapshot_download
6
+ import zipfile
7
+
8
+ def copy_file_pairs(source_dir, dest_dir, max_pairs=20, seed=None):
9
+ if seed is not None:
10
+ random.seed(seed)
11
+ os.makedirs(dest_dir, exist_ok=True)
12
+ mp4_files = [f for f in os.listdir(source_dir) if f.endswith('.mp4')]
13
+ selected_mp4_files = random.sample(mp4_files, min(len(mp4_files), max_pairs))
14
+ for mp4 in selected_mp4_files:
15
+ base = os.path.splitext(mp4)[0]
16
+ txt = f"{base}.txt"
17
+ if os.path.exists(os.path.join(source_dir, txt)):
18
+ shutil.copy2(os.path.join(source_dir, mp4), os.path.join(dest_dir, mp4))
19
+ shutil.copy2(os.path.join(source_dir, txt), os.path.join(dest_dir, txt))
20
+ return sum(os.path.exists(os.path.join(source_dir, f"{os.path.splitext(m)[0]}.txt")) for m in selected_mp4_files)
21
+
22
+ if __name__ == "__main__":
23
+ parser = argparse.ArgumentParser(description="Process Studio Ghibli dataset by downloading, extracting, and copying file pairs.")
24
+ parser.add_argument("--source_dir", default="data/ghibli/raw/videos/1920x1040", help="Source directory containing video and text files")
25
+ parser.add_argument("--dest_dir", default="data/ghibli/videos", help="Destination directory for copied file pairs")
26
+ parser.add_argument("--max_pairs", type=int, default=240, help="Maximum number of file pairs to copy")
27
+ parser.add_argument("--seed", type=int, default=42, help="Random seed for reproducibility")
28
+ parser.add_argument("--repo_id", default="raymondt/ghibi_t2v", help="Hugging Face dataset repository ID")
29
+ parser.add_argument("--local_dir", default="data/ghibli/raw", help="Local directory to download the dataset")
30
+ parser.add_argument("--zip_path", default="data/ghibli/raw/studio_ghibli_wan14b_t2v_v01_dataset.zip", help="Path to the downloaded zip file")
31
+
32
+ args = parser.parse_args()
33
+
34
+ # Create directory if it doesn't exist
35
+ os.makedirs(args.local_dir, exist_ok=True)
36
+
37
+ # Download the dataset using snapshot_download
38
+ snapshot_download(repo_id=args.repo_id,
39
+ local_dir=args.local_dir,
40
+ repo_type="dataset")
41
+
42
+ # Unzip the dataset
43
+ with zipfile.ZipFile(args.zip_path, 'r') as zip_ref:
44
+ zip_ref.extractall(args.local_dir)
45
+
46
+ # Copy file pairs
47
+ copied = copy_file_pairs(args.source_dir, args.dest_dir, max_pairs=args.max_pairs, seed=args.seed)
48
+ print(f"Copied {copied} pairs to {args.dest_dir}")
scripts/setup_third_party.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import subprocess
3
+ import argparse
4
+ import sys
5
+
6
+ def clone_repository(repo_url, target_dir, branch="main"):
7
+ """Clone a git repository to the specified directory with specific branch."""
8
+ if os.path.exists(target_dir):
9
+ print(f"Directory {target_dir} already exists. Skipping clone.")
10
+ return
11
+
12
+ os.makedirs(os.path.dirname(target_dir), exist_ok=True)
13
+
14
+ try:
15
+ subprocess.run(
16
+ ["git", "clone", "-b", branch, repo_url, target_dir],
17
+ check=True,
18
+ capture_output=True,
19
+ text=True
20
+ )
21
+ print(f"Successfully cloned {repo_url} (branch: {branch}) to {target_dir}")
22
+ except subprocess.CalledProcessError as e:
23
+ print(f"Failed to clone repository: {e.stderr}")
24
+ sys.exit(1)
25
+
26
+ def main(motiondirector_url="https://github.com/danhtran2mind/MotionDirector", branch="main"):
27
+ # Define target directory
28
+ target_dir = os.path.join("src", "third_party", "MotionDirector")
29
+
30
+ # Clone MotionDirector repository
31
+ clone_repository(motiondirector_url, target_dir, branch)
32
+
33
+ if __name__ == "__main__":
34
+ # Set arguments directly
35
+ main(
36
+ motiondirector_url="https://github.com/danhtran2mind/MotionDirector",
37
+ branch="main"
38
+ )
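The same setup can be driven from Python when a fork or a different branch of MotionDirector is needed. A sketch, assuming it is run from the repository root; the URL, target directory, and branch shown are the script's own defaults.

```python
# Sketch: clone MotionDirector into src/third_party/ by reusing
# clone_repository() from scripts/setup_third_party.py.
from scripts.setup_third_party import clone_repository

clone_repository(
    repo_url="https://github.com/danhtran2mind/MotionDirector",
    target_dir="src/third_party/MotionDirector",
    branch="main",
)
```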
src/text2video_ghibli_style/inference.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ import os
3
+ import subprocess
4
+ from pathlib import Path
5
+ import uuid
6
+ import torch
7
+
8
+ # Append the current directory to sys.path
9
+ # sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
10
+
11
+ def run_inference(
12
+ model_path="./ckpts/zeroscope_v2_576w",
13
+ checkpoint_folder="./ckpts/zeroscope_v2_576w-Ghibli-LoRA",
14
+ prompt="Studio Ghibli style. Two women walk down coastal village path toward sea, passing colorful houses, sailboats visible.",
15
+ negative_prompt="ugly, noise, fragment, blur, static video",
16
+ width=256,
17
+ height=256,
18
+ num_frames=8,
19
+ num_steps=30,
20
+ guidance_scale=30.0,
21
+ fps=8,
22
+ lora_rank=32,
23
+ lora_scale=0.7,
24
+ noise_prior=0.1,
25
+ device="cuda",
26
+ seed=100
27
+ ):
28
+ print("Start Inference")
29
+ output_dir = "apps/gradio_app/temp_data"
30
+ os.makedirs(output_dir, exist_ok=True)
31
+
32
+ # Get list of files in output_dir
33
+ for file_name in os.listdir(output_dir):
34
+ # Check if file ends with .mp4
35
+ if file_name.endswith(".mp4"):
36
+ # Remove the file
37
+ os.remove(os.path.join(output_dir, file_name))
38
+
39
+ command = [
40
+ "python", "src/third_party/MotionDirector/main_inference.py",
41
+ "--model", model_path,
42
+ "--checkpoint_folder", checkpoint_folder,
43
+ "--prompt", prompt,
44
+ "--negative-prompt", negative_prompt,
45
+ "--width", str(width),
46
+ "--height", str(height),
47
+ "--num-frames", str(num_frames),
48
+ "--num-steps", str(num_steps),
49
+ "--guidance-scale", str(guidance_scale),
50
+ "--fps", str(fps),
51
+ "--lora_rank", str(lora_rank),
52
+ "--lora_scale", str(lora_scale),
53
+ "--noise_prior", str(noise_prior),
54
+ "--device", device,
55
+ "--seed", str(seed),
56
+ "--output_dir", output_dir,
57
+ # "--no-prompt-name"
58
+ ]
59
+
60
+ # Use Popen to execute the command
61
+ process = subprocess.Popen(
62
+ command,
63
+ stdout=subprocess.PIPE,
64
+ stderr=subprocess.PIPE,
65
+ text=True,
66
+ bufsize=1 # Line buffering
67
+ )
68
+
69
+ # Read output line-by-line in real-time
70
+ output_lines = []
71
+ try:
72
+ for line in process.stdout:
73
+ output_lines.append(line.strip())
74
+ except Exception as e:
75
+ return None, f"Error reading output: {str(e)}"
76
+
77
+ # Capture stderr and wait for process to complete
78
+ stderr_output = process.communicate()[1]
79
+ if process.returncode != 0:
80
+ return None, f"Error: {stderr_output.strip()}"
81
+
82
+ # Check for MP4 files in output directory
83
+ output_file = [f for f in os.listdir(output_dir) if f.lower().endswith('.mp4')]
84
+ if output_file:
85
+ output_path = os.path.join(output_dir, output_file[-1])
86
+ if os.path.exists(output_path):
87
+ return output_path, "\n".join(output_lines)
88
+ else:
89
+ return None, f"Video file not found at {output_path}\nLogs:\n" + "\n".join(output_lines)
90
+ return None, f"No MP4 files found in {output_dir}\nLogs:\n" + "\n".join(output_lines)
91
+
92
+ if __name__ == "__main__":
93
+ # Example usage
94
+ video_path, logs = run_inference(device="cpu" if not torch.cuda.is_available() else "cuda")
95
+ print(f"Generated Video: {video_path}")
96
+ print(f"Logs: {logs}")
src/text2video_ghibli_style/train.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import subprocess
2
+ import os
3
+ import sys
4
+ import argparse
5
+
6
+ # sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'third_party', 'MotionDirector')))
7
+
8
+ def run_training(config_path, pytorch_cuda_alloc_conf="expandable_segments:True"):
9
+ # Set the environment variable
10
+ os.environ["PYTORCH_CUDA_ALLOC_CONF"] = pytorch_cuda_alloc_conf
11
+
12
+ # Command to execute
13
+ command = ["python", "src/third_party/MotionDirector/main_train.py", "--config", config_path]
14
+
15
+ try:
16
+ # Run the command using subprocess.Popen
17
+ process = subprocess.Popen(
18
+ command,
19
+ stdout=subprocess.PIPE,
20
+ stderr=subprocess.PIPE,
21
+ text=True,
22
+ env=os.environ.copy()
23
+ )
24
+
25
+ # Stream output in real-time
26
+ while True:
27
+ output = process.stdout.readline()
28
+ if output == '' and process.poll() is not None:
29
+ break
30
+ if output:
31
+ print(output.strip())
32
+
33
+ # Get any remaining output and errors
34
+ stdout, stderr = process.communicate()
35
+
36
+ # Print any errors
37
+ if stderr:
38
+ print("Errors:", stderr)
39
+
40
+ # Check the return code
41
+ if process.returncode == 0:
42
+ print("Training completed successfully")
43
+ else:
44
+ print(f"Training failed with return code: {process.returncode}")
45
+
46
+ except subprocess.SubprocessError as e:
47
+ print(f"Error running training: {e}")
48
+ except FileNotFoundError:
49
+ print("Error: main_train.py or config file not found")
50
+ except Exception as e:
51
+ print(f"Unexpected error: {e}")
52
+
53
+ if __name__ == "__main__":
54
+ # Set up argument parser
55
+ parser = argparse.ArgumentParser(description="Run training script with specified config")
56
+ parser.add_argument(
57
+ "--config",
58
+ type=str,
59
+ default="./configs/config_multi_videos.yaml",
60
+ help="Path to the config file"
61
+ )
62
+ parser.add_argument(
63
+ "--pytorch-cuda-alloc",
64
+ type=str,
65
+ default="expandable_segments:True",
66
+ help="Value for PYTORCH_CUDA_ALLOC_CONF environment variable"
67
+ )
68
+
69
+ # Parse arguments
70
+ args = parser.parse_args()
71
+
72
+ # Run training with provided arguments
73
+ run_training(args.config, args.pytorch_cuda_alloc)
src/third_party/.gitkeep ADDED
File without changes