Spaces: Running on Zero
Update app.py (#2) by linoyts (HF Staff) - opened
app.py CHANGED
|
@@ -1,23 +1,20 @@
|
|
| 1 |
import torch
|
|
|
|
| 2 |
from diffusers import AutoencoderKLWan, WanVACEPipeline, UniPCMultistepScheduler
|
| 3 |
from diffusers.utils import export_to_video
|
| 4 |
-
from transformers import CLIPVisionModel
|
| 5 |
import gradio as gr
|
| 6 |
import tempfile
|
| 7 |
import spaces
|
| 8 |
from huggingface_hub import hf_hub_download
|
| 9 |
import numpy as np
|
| 10 |
-
import
|
| 11 |
import random
|
| 12 |
|
| 13 |
-
|
| 14 |
-
|
| 15 |
model_id = "Wan-AI/Wan2.1-VACE-14B-diffusers"
|
| 16 |
vae = AutoencoderKLWan.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.float32)
|
| 17 |
pipe = WanVACEPipeline.from_pretrained(model_id, vae=vae, torch_dtype=torch.bfloat16).to("cuda")
|
| 18 |
|
| 19 |
pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config, flow_shift=2.0)
|
| 20 |
-
pipe.to("cuda")
|
| 21 |
|
| 22 |
pipe.load_lora_weights(
|
| 23 |
"vrgamedevgirl84/Wan14BT2VFusioniX",
|
|
@@ -80,7 +77,7 @@ def handle_gallery_upload_for_dims_wan(gallery_images, current_h_val, current_w_
|
|
| 80 |
return gr.update(value=DEFAULT_H_SLIDER_VALUE), gr.update(value=DEFAULT_W_SLIDER_VALUE)
|
| 81 |
try:
|
| 82 |
# Use the first image to calculate dimensions
|
| 83 |
-
first_image = gallery_images[0]
|
| 84 |
new_h, new_w = _calculate_new_dimensions_wan(
|
| 85 |
first_image, MOD_VALUE, NEW_FORMULA_MAX_AREA,
|
| 86 |
SLIDER_MIN_H, SLIDER_MAX_H, SLIDER_MIN_W, SLIDER_MAX_W,
|
|
@@ -96,17 +93,17 @@ def update_prompt_from_mode(mode):
|
|
| 96 |
return MODE_PROMPTS.get(mode, "")
|
| 97 |
|
| 98 |
|
| 99 |
-
def prepare_video_and_mask_Ref2V(
|
| 100 |
frames = []
|
| 101 |
# Ideally, this should be 127.5 to match original code, but they perform computation on numpy arrays
|
| 102 |
# whereas we are passing PIL images. If you choose to pass numpy arrays, you can set it to 127.5 to
|
| 103 |
# match the original code.
|
| 104 |
-
frames.extend([
|
| 105 |
-
mask_white =
|
| 106 |
mask = [mask_white] * (num_frames)
|
| 107 |
return frames, mask
|
| 108 |
|
| 109 |
-
def prepare_video_and_mask_FLF2V(first_img:
|
| 110 |
first_img = first_img.resize((width, height))
|
| 111 |
last_img = last_img.resize((width, height))
|
| 112 |
frames = []
|
|
@@ -114,26 +111,26 @@ def prepare_video_and_mask_FLF2V(first_img: PIL.Image.Image, last_img: PIL.Image
|
|
| 114 |
# Ideally, this should be 127.5 to match original code, but they perform computation on numpy arrays
|
| 115 |
# whereas we are passing PIL images. If you choose to pass numpy arrays, you can set it to 127.5 to
|
| 116 |
# match the original code.
|
| 117 |
-
frames.extend([
|
| 118 |
frames.append(last_img)
|
| 119 |
-
mask_black =
|
| 120 |
-
mask_white =
|
| 121 |
mask = [mask_black, *[mask_white] * (num_frames - 2), mask_black]
|
| 122 |
return frames, mask
|
| 123 |
|
| 124 |
-
def prepare_video_and_mask_Random2V(images: List[
|
| 125 |
images = [img.resize((width, height)) for img in images]
|
| 126 |
# Ideally, this should be 127.5 to match original code, but they perform computation on numpy arrays
|
| 127 |
# whereas we are passing PIL images. If you choose to pass numpy arrays, you can set it to 127.5 to
|
| 128 |
# match the original code.
|
| 129 |
-
frames = [
|
| 130 |
|
| 131 |
-
mask_black =
|
| 132 |
-
mask_white =
|
| 133 |
mask = [mask_white] * num_frames
|
| 134 |
|
| 135 |
for img, idx in zip(images, frame_indices):
|
| 136 |
-
assert idx < num_frames
|
| 137 |
frames[idx] = img
|
| 138 |
mask[idx] = mask_black
|
| 139 |
|
|
@@ -179,11 +176,13 @@ def generate_video(gallery_images, mode, prompt, height, width,
|
|
| 179 |
"""
|
| 180 |
if gallery_images is None or len(gallery_images) == 0:
|
| 181 |
raise gr.Error("Please upload at least one image to the gallery.")
|
|
|
|
|
|
|
| 182 |
|
| 183 |
if mode == "FLF2V" and len(gallery_images) >= 2:
|
| 184 |
gallery_images = gallery_images[:2]
|
| 185 |
elif mode == "FLF2V" and len(gallery_images) < 2:
|
| 186 |
-
raise gr.Error("
|
| 187 |
|
| 188 |
target_h = max(MOD_VALUE, (int(height) // MOD_VALUE) * MOD_VALUE)
|
| 189 |
target_w = max(MOD_VALUE, (int(width) // MOD_VALUE) * MOD_VALUE)
|
|
@@ -192,20 +191,29 @@ def generate_video(gallery_images, mode, prompt, height, width,
|
|
| 192 |
|
| 193 |
current_seed = random.randint(0, MAX_SEED) if randomize_seed else int(seed)
|
| 194 |
|
| 195 |
-
|
| 196 |
# Process images based on the selected mode
|
| 197 |
if mode == "FLF2V":
|
| 198 |
-
frames, mask = prepare_video_and_mask_FLF2V(
|
| 199 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 200 |
elif mode == "Ref2V":
|
| 201 |
frames, mask = prepare_video_and_mask_Ref2V(height=target_h, width=target_w, num_frames=num_frames)
|
| 202 |
-
reference_images =gallery_images
|
| 203 |
-
else:
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 209 |
|
| 210 |
with torch.inference_mode():
|
| 211 |
output_frames_list = pipe(
|
|
@@ -228,8 +236,8 @@ def generate_video(gallery_images, mode, prompt, height, width,
|
|
| 228 |
return video_path, current_seed
|
| 229 |
|
| 230 |
with gr.Blocks() as demo:
|
| 231 |
-
gr.Markdown("#
|
| 232 |
-
gr.Markdown("[
|
| 233 |
|
| 234 |
with gr.Row():
|
| 235 |
with gr.Column():
|
|
@@ -251,11 +259,18 @@ with gr.Blocks() as demo:
|
|
| 251 |
choices=["Ref2V", "FLF2V", "Random2V"],
|
| 252 |
value="Ref2V",
|
| 253 |
label="Processing Mode",
|
| 254 |
-
info="Ref2V: Reference to Video | FLF2V: First-Last Frame to Video | Random2V: Random
|
| 255 |
)
|
| 256 |
|
| 257 |
prompt_input = gr.Textbox(label="Prompt", value=MODE_PROMPTS["Ref2V"])
|
| 258 |
-
duration_seconds_input = gr.Slider(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 259 |
|
| 260 |
with gr.Accordion("Advanced Settings", open=False):
|
| 261 |
negative_prompt_input = gr.Textbox(label="Negative Prompt", value=default_negative_prompt, lines=3)
|
|
@@ -271,12 +286,14 @@ with gr.Blocks() as demo:
|
|
| 271 |
|
| 272 |
with gr.Column():
|
| 273 |
video_output = gr.Video(label="Generated Video", autoplay=True, interactive=False)
|
| 274 |
-
with gr.Accordion("Mode Information", open=
|
| 275 |
gr.Markdown("""
|
| 276 |
**Processing Modes:**
|
| 277 |
-
- **Ref2V**: Uses
|
| 278 |
-
- **FLF2V**:
|
| 279 |
-
- **Random2V**:
|
|
|
|
|
|
|
| 280 |
""")
|
| 281 |
|
| 282 |
# Update prompt when mode changes
|
|
|
|
| 1 |
import torch
|
| 2 |
+
from typing import List
|
| 3 |
from diffusers import AutoencoderKLWan, WanVACEPipeline, UniPCMultistepScheduler
|
| 4 |
from diffusers.utils import export_to_video
|
|
|
|
| 5 |
import gradio as gr
|
| 6 |
import tempfile
|
| 7 |
import spaces
|
| 8 |
from huggingface_hub import hf_hub_download
|
| 9 |
import numpy as np
|
| 10 |
+
from PIL import Image
|
| 11 |
import random
|
| 12 |
|
|
|
|
|
|
|
| 13 |
model_id = "Wan-AI/Wan2.1-VACE-14B-diffusers"
|
| 14 |
vae = AutoencoderKLWan.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.float32)
|
| 15 |
pipe = WanVACEPipeline.from_pretrained(model_id, vae=vae, torch_dtype=torch.bfloat16).to("cuda")
|
| 16 |
|
| 17 |
pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config, flow_shift=2.0)
|
|
|
|
| 18 |
|
| 19 |
pipe.load_lora_weights(
|
| 20 |
"vrgamedevgirl84/Wan14BT2VFusioniX",
|
|
|
|
| 77 |
return gr.update(value=DEFAULT_H_SLIDER_VALUE), gr.update(value=DEFAULT_W_SLIDER_VALUE)
|
| 78 |
try:
|
| 79 |
# Use the first image to calculate dimensions
|
| 80 |
+
first_image = gallery_images[0][0]
|
| 81 |
new_h, new_w = _calculate_new_dimensions_wan(
|
| 82 |
first_image, MOD_VALUE, NEW_FORMULA_MAX_AREA,
|
| 83 |
SLIDER_MIN_H, SLIDER_MAX_H, SLIDER_MIN_W, SLIDER_MAX_W,
|
|
|
|
| 93 |
return MODE_PROMPTS.get(mode, "")
|
| 94 |
|
| 95 |
|
| 96 |
+
def prepare_video_and_mask_Ref2V(height: int, width: int, num_frames: int):
    """Build the placeholder conditioning video and mask for Ref2V mode.

    Args:
        height: Height in pixels of every generated frame and mask image.
        width: Width in pixels of every generated frame and mask image.
        num_frames: Number of entries in each returned list.

    Returns:
        A ``(frames, mask)`` tuple. ``frames`` holds ``num_frames`` references
        to one solid gray RGB image; ``mask`` holds ``num_frames`` references
        to one solid white (255) single-channel image.

    Note:
        Each list repeats a single image object rather than holding
        independent copies, so an in-place edit of one entry is visible
        through all of them.
    """
    size = (width, height)  # PIL sizes are (width, height)

    # Ideally, the gray level should be 127.5 to match the original code, but
    # that code computes on numpy arrays whereas PIL images are passed here.
    # If numpy arrays are passed instead, 127.5 can be used to match it.
    gray_frame = Image.new("RGB", size, (128, 128, 128))
    white_mask = Image.new("L", size, 255)

    frames = [gray_frame] * num_frames
    mask = [white_mask] * num_frames
    return frames, mask
|
| 105 |
|
| 106 |
+
def prepare_video_and_mask_FLF2V(first_img: Image.Image, last_img: Image.Image, height: int, width: int, num_frames: int):
|
| 107 |
first_img = first_img.resize((width, height))
|
| 108 |
last_img = last_img.resize((width, height))
|
| 109 |
frames = []
|
|
|
|
| 111 |
# Ideally, this should be 127.5 to match original code, but they perform computation on numpy arrays
|
| 112 |
# whereas we are passing PIL images. If you choose to pass numpy arrays, you can set it to 127.5 to
|
| 113 |
# match the original code.
|
| 114 |
+
frames.extend([Image.new("RGB", (width, height), (128, 128, 128))] * (num_frames - 2))
|
| 115 |
frames.append(last_img)
|
| 116 |
+
mask_black = Image.new("L", (width, height), 0)
|
| 117 |
+
mask_white = Image.new("L", (width, height), 255)
|
| 118 |
mask = [mask_black, *[mask_white] * (num_frames - 2), mask_black]
|
| 119 |
return frames, mask
|
| 120 |
|
| 121 |
+
def prepare_video_and_mask_Random2V(images: List[Image.Image], frame_indices: List[int], height: int, width: int, num_frames: int):
|
| 122 |
images = [img.resize((width, height)) for img in images]
|
| 123 |
# Ideally, this should be 127.5 to match original code, but they perform computation on numpy arrays
|
| 124 |
# whereas we are passing PIL images. If you choose to pass numpy arrays, you can set it to 127.5 to
|
| 125 |
# match the original code.
|
| 126 |
+
frames = [Image.new("RGB", (width, height), (128, 128, 128))] * num_frames
|
| 127 |
|
| 128 |
+
mask_black = Image.new("L", (width, height), 0)
|
| 129 |
+
mask_white = Image.new("L", (width, height), 255)
|
| 130 |
mask = [mask_white] * num_frames
|
| 131 |
|
| 132 |
for img, idx in zip(images, frame_indices):
|
| 133 |
+
assert idx < num_frames, f"Frame index {idx} exceeds num_frames {num_frames}"
|
| 134 |
frames[idx] = img
|
| 135 |
mask[idx] = mask_black
|
| 136 |
|
|
|
|
| 176 |
"""
|
| 177 |
if gallery_images is None or len(gallery_images) == 0:
|
| 178 |
raise gr.Error("Please upload at least one image to the gallery.")
|
| 179 |
+
else:
|
| 180 |
+
gallery_images = [img[0] for img in gallery_images]
|
| 181 |
|
| 182 |
if mode == "FLF2V" and len(gallery_images) >= 2:
|
| 183 |
gallery_images = gallery_images[:2]
|
| 184 |
elif mode == "FLF2V" and len(gallery_images) < 2:
|
| 185 |
+
raise gr.Error("FLF2V mode requires at least 2 images, but only {} were supplied.".format(len(gallery_images)))
|
| 186 |
|
| 187 |
target_h = max(MOD_VALUE, (int(height) // MOD_VALUE) * MOD_VALUE)
|
| 188 |
target_w = max(MOD_VALUE, (int(width) // MOD_VALUE) * MOD_VALUE)
|
|
|
|
| 191 |
|
| 192 |
current_seed = random.randint(0, MAX_SEED) if randomize_seed else int(seed)
|
| 193 |
|
|
|
|
| 194 |
# Process images based on the selected mode
|
| 195 |
if mode == "FLF2V":
|
| 196 |
+
frames, mask = prepare_video_and_mask_FLF2V(
|
| 197 |
+
first_img=gallery_images[0],
|
| 198 |
+
last_img=gallery_images[1],
|
| 199 |
+
height=target_h,
|
| 200 |
+
width=target_w,
|
| 201 |
+
num_frames=num_frames
|
| 202 |
+
)
|
| 203 |
+
reference_images = None
|
| 204 |
elif mode == "Ref2V":
|
| 205 |
frames, mask = prepare_video_and_mask_Ref2V(height=target_h, width=target_w, num_frames=num_frames)
|
| 206 |
+
reference_images = gallery_images
|
| 207 |
+
else: # mode == "Random2V"
|
| 208 |
+
|
| 209 |
+
frames, mask = prepare_video_and_mask_Random2V(
|
| 210 |
+
images=gallery_images,
|
| 211 |
+
frame_indices=[0,20,40], # todo - generalize
|
| 212 |
+
height=target_h,
|
| 213 |
+
width=target_w,
|
| 214 |
+
num_frames=num_frames
|
| 215 |
+
)
|
| 216 |
+
reference_images = None
|
| 217 |
|
| 218 |
with torch.inference_mode():
|
| 219 |
output_frames_list = pipe(
|
|
|
|
| 236 |
return video_path, current_seed
|
| 237 |
|
| 238 |
with gr.Blocks() as demo:
|
| 239 |
+
gr.Markdown("# Wan 2.1 VACE (14B) with Phantom & Detail Enhancer LoRAs - Multi-Image Gallery")
|
| 240 |
+
gr.Markdown("Using [Wan2.1-VACE-14B](https://huggingface.co/Wan-AI/Wan2.1-VACE-14B-diffusers) with Phantom FusionX and Detail Enhancer LoRAs for advanced video generation with multiple conditioning modes.")
|
| 241 |
|
| 242 |
with gr.Row():
|
| 243 |
with gr.Column():
|
|
|
|
| 259 |
choices=["Ref2V", "FLF2V", "Random2V"],
|
| 260 |
value="Ref2V",
|
| 261 |
label="Processing Mode",
|
| 262 |
+
info="Ref2V: Reference to Video | FLF2V: First-Last Frame to Video | Random2V: Random frames to Video"
|
| 263 |
)
|
| 264 |
|
| 265 |
prompt_input = gr.Textbox(label="Prompt", value=MODE_PROMPTS["Ref2V"])
|
| 266 |
+
duration_seconds_input = gr.Slider(
|
| 267 |
+
minimum=round(MIN_FRAMES_MODEL/FIXED_FPS,1),
|
| 268 |
+
maximum=round(MAX_FRAMES_MODEL/FIXED_FPS,1),
|
| 269 |
+
step=0.1,
|
| 270 |
+
value=2,
|
| 271 |
+
label="Duration (seconds)",
|
| 272 |
+
info=f"Clamped to model's {MIN_FRAMES_MODEL}-{MAX_FRAMES_MODEL} frames at {FIXED_FPS}fps."
|
| 273 |
+
)
|
| 274 |
|
| 275 |
with gr.Accordion("Advanced Settings", open=False):
|
| 276 |
negative_prompt_input = gr.Textbox(label="Negative Prompt", value=default_negative_prompt, lines=3)
|
|
|
|
| 286 |
|
| 287 |
with gr.Column():
|
| 288 |
video_output = gr.Video(label="Generated Video", autoplay=True, interactive=False)
|
| 289 |
+
with gr.Accordion("Mode Information", open=False):
|
| 290 |
gr.Markdown("""
|
| 291 |
**Processing Modes:**
|
| 292 |
+
- **Ref2V**: Uses uploaded images as style references for video generation. All frames are generated based on the reference images.
|
| 293 |
+
- **FLF2V**: First-Last Frame mode - uses first and last images as keyframes and generates the frames in between (requires exactly 2 images)
|
| 294 |
+
- **Random2V**: Places uploaded images at specific frames in the video and generates the rest. Images are distributed evenly across the video duration.
|
| 295 |
+
|
| 296 |
+
**Note**: VACE pipeline supports advanced conditioning with masks and reference images for more control over generation.
|
| 297 |
""")
|
| 298 |
|
| 299 |
# Update prompt when mode changes
|