wan2-1-VACE-fast

Running on Zero

App Files Files Community

Update app.py

by linoyts HF Staff - opened Jul 25

base: refs/heads/main

←

from: refs/pr/2

Discussion Files changed

+54

-37

Files changed (1) hide show

app.py +54 -37

app.py CHANGED Viewed

@@ -1,23 +1,20 @@
 import torch
 from diffusers import AutoencoderKLWan, WanVACEPipeline, UniPCMultistepScheduler
 from diffusers.utils import export_to_video
-from transformers import CLIPVisionModel
 import gradio as gr
 import tempfile
 import spaces
 from huggingface_hub import hf_hub_download
 import numpy as np
-import PIL.Image
 import random
 model_id = "Wan-AI/Wan2.1-VACE-14B-diffusers"
 vae = AutoencoderKLWan.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.float32)
 pipe = WanVACEPipeline.from_pretrained(model_id, vae=vae, torch_dtype=torch.bfloat16).to("cuda")
 pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config, flow_shift=2.0)
-pipe.to("cuda")
 pipe.load_lora_weights(
    "vrgamedevgirl84/Wan14BT2VFusioniX",
@@ -80,7 +77,7 @@ def handle_gallery_upload_for_dims_wan(gallery_images, current_h_val, current_w_
         return gr.update(value=DEFAULT_H_SLIDER_VALUE), gr.update(value=DEFAULT_W_SLIDER_VALUE)
     try:
         # Use the first image to calculate dimensions
-        first_image = gallery_images[0]
         new_h, new_w = _calculate_new_dimensions_wan(
             first_image, MOD_VALUE, NEW_FORMULA_MAX_AREA,
             SLIDER_MIN_H, SLIDER_MAX_H, SLIDER_MIN_W, SLIDER_MAX_W,
@@ -96,17 +93,17 @@ def update_prompt_from_mode(mode):
     return MODE_PROMPTS.get(mode, "")
-def prepare_video_and_mask_Ref2V( height: int, width: int, num_frames: int):
     frames = []
     # Ideally, this should be 127.5 to match original code, but they perform computation on numpy arrays
     # whereas we are passing PIL images. If you choose to pass numpy arrays, you can set it to 127.5 to
     # match the original code.
-    frames.extend([PIL.Image.new("RGB", (width, height), (128, 128, 128))] * (num_frames))
-    mask_white = PIL.Image.new("L", (width, height), 255)
     mask = [mask_white] * (num_frames)
     return frames, mask
-def prepare_video_and_mask_FLF2V(first_img: PIL.Image.Image, last_img: PIL.Image.Image, height: int, width: int, num_frames: int):
     first_img = first_img.resize((width, height))
     last_img = last_img.resize((width, height))
     frames = []
@@ -114,26 +111,26 @@ def prepare_video_and_mask_FLF2V(first_img: PIL.Image.Image, last_img: PIL.Image
     # Ideally, this should be 127.5 to match original code, but they perform computation on numpy arrays
     # whereas we are passing PIL images. If you choose to pass numpy arrays, you can set it to 127.5 to
     # match the original code.
-    frames.extend([PIL.Image.new("RGB", (width, height), (128, 128, 128))] * (num_frames - 2))
     frames.append(last_img)
-    mask_black = PIL.Image.new("L", (width, height), 0)
-    mask_white = PIL.Image.new("L", (width, height), 255)
     mask = [mask_black, *[mask_white] * (num_frames - 2), mask_black]
     return frames, mask
-def prepare_video_and_mask_Random2V(images: List[PIL.Image.Image], frame_indices: List[int], height: int, width: int, num_frames: int):
     images = [img.resize((width, height)) for img in images]
     # Ideally, this should be 127.5 to match original code, but they perform computation on numpy arrays
     # whereas we are passing PIL images. If you choose to pass numpy arrays, you can set it to 127.5 to
     # match the original code.
-    frames = [PIL.Image.new("RGB", (width, height), (128, 128, 128))] * num_frames
-    mask_black = PIL.Image.new("L", (width, height), 0)
-    mask_white = PIL.Image.new("L", (width, height), 255)
     mask = [mask_white] * num_frames
     for img, idx in zip(images, frame_indices):
-        assert idx < num_frames
         frames[idx] = img
         mask[idx] = mask_black
@@ -179,11 +176,13 @@ def generate_video(gallery_images, mode, prompt, height, width,
     """
     if gallery_images is None or len(gallery_images) == 0:
         raise gr.Error("Please upload at least one image to the gallery.")
     if mode == "FLF2V" and len(gallery_images) >= 2:
         gallery_images = gallery_images[:2]
     elif mode == "FLF2V" and len(gallery_images) < 2:
-        raise gr.Error("only one image was supplied, but 2 are needed for FLF2V")
     target_h = max(MOD_VALUE, (int(height) // MOD_VALUE) * MOD_VALUE)
     target_w = max(MOD_VALUE, (int(width) // MOD_VALUE) * MOD_VALUE)
@@ -192,20 +191,29 @@ def generate_video(gallery_images, mode, prompt, height, width,
     current_seed = random.randint(0, MAX_SEED) if randomize_seed else int(seed)
     # Process images based on the selected mode
     if mode == "FLF2V":
-        frames, mask = prepare_video_and_mask_FLF2V(first_img=gallery_images[0], last_img=gallery_images[1], height=target_h, width=target_w, num_frames=num_frames)
-        reference_images=None
     elif mode == "Ref2V":
         frames, mask = prepare_video_and_mask_Ref2V(height=target_h, width=target_w, num_frames=num_frames)
-        reference_images =gallery_images
-    else: # mode == "":
-        frames, mask = prepare_video_and_mask_Random2V(images=gallery_images, frame_indices=[0,15,40], height=target_h, width=target_w, num_frames=num_frames)
-        reference_images=None
-    # resized_image = input_image.resize((target_w, target_h))
     with torch.inference_mode():
         output_frames_list = pipe(
@@ -228,8 +236,8 @@ def generate_video(gallery_images, mode, prompt, height, width,
     return video_path, current_seed
 with gr.Blocks() as demo:
-    gr.Markdown("# Fast 4 steps Wan 2.1 I2V (14B) with CausVid LoRA - Multi-Image Gallery")
-    gr.Markdown("[CausVid](https://github.com/tianweiy/CausVid) is a distilled version of Wan 2.1 to run faster in just 4-8 steps, [extracted as LoRA by Kijai](https://huggingface.co/Kijai/WanVideo_comfy/blob/main/Wan21_CausVid_14B_T2V_lora_rank32.safetensors) and is compatible with 🧨 diffusers")
     with gr.Row():
         with gr.Column():
@@ -251,11 +259,18 @@ with gr.Blocks() as demo:
                 choices=["Ref2V", "FLF2V", "Random2V"],
                 value="Ref2V",
                 label="Processing Mode",
-                info="Ref2V: Reference to Video | FLF2V: First-Last Frame to Video | Random2V: Random Image to Video"
             )
             prompt_input = gr.Textbox(label="Prompt", value=MODE_PROMPTS["Ref2V"])
-            duration_seconds_input = gr.Slider(minimum=round(MIN_FRAMES_MODEL/FIXED_FPS,1), maximum=round(MAX_FRAMES_MODEL/FIXED_FPS,1), step=0.1, value=2, label="Duration (seconds)", info=f"Clamped to model's {MIN_FRAMES_MODEL}-{MAX_FRAMES_MODEL} frames at {FIXED_FPS}fps.")
             with gr.Accordion("Advanced Settings", open=False):
                 negative_prompt_input = gr.Textbox(label="Negative Prompt", value=default_negative_prompt, lines=3)
@@ -271,12 +286,14 @@ with gr.Blocks() as demo:
         with gr.Column():
             video_output = gr.Video(label="Generated Video", autoplay=True, interactive=False)
-            with gr.Accordion("Mode Information", open=True):
                 gr.Markdown("""
                 **Processing Modes:**
-                - **Ref2V**: Uses the first image as reference for video generation
-                - **FLF2V**: Blends first and last images for interpolation (requires at least 2 images)
-                - **Random2V**: Randomly selects one image from the gallery for generation
                 """)
     # Update prompt when mode changes

 import torch
+from typing import List
 from diffusers import AutoencoderKLWan, WanVACEPipeline, UniPCMultistepScheduler
 from diffusers.utils import export_to_video
 import gradio as gr
 import tempfile
 import spaces
 from huggingface_hub import hf_hub_download
 import numpy as np
+from PIL import Image
 import random
 model_id = "Wan-AI/Wan2.1-VACE-14B-diffusers"
 vae = AutoencoderKLWan.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.float32)
 pipe = WanVACEPipeline.from_pretrained(model_id, vae=vae, torch_dtype=torch.bfloat16).to("cuda")
 pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config, flow_shift=2.0)
 pipe.load_lora_weights(
    "vrgamedevgirl84/Wan14BT2VFusioniX",
         return gr.update(value=DEFAULT_H_SLIDER_VALUE), gr.update(value=DEFAULT_W_SLIDER_VALUE)
     try:
         # Use the first image to calculate dimensions
+        first_image = gallery_images[0][0]
         new_h, new_w = _calculate_new_dimensions_wan(
             first_image, MOD_VALUE, NEW_FORMULA_MAX_AREA,
             SLIDER_MIN_H, SLIDER_MAX_H, SLIDER_MIN_W, SLIDER_MAX_W,
     return MODE_PROMPTS.get(mode, "")
+def prepare_video_and_mask_Ref2V(height: int, width: int, num_frames: int):
     frames = []
     # Ideally, this should be 127.5 to match original code, but they perform computation on numpy arrays
     # whereas we are passing PIL images. If you choose to pass numpy arrays, you can set it to 127.5 to
     # match the original code.
+    frames.extend([Image.new("RGB", (width, height), (128, 128, 128))] * (num_frames))
+    mask_white = Image.new("L", (width, height), 255)
     mask = [mask_white] * (num_frames)
     return frames, mask
+def prepare_video_and_mask_FLF2V(first_img: Image.Image, last_img: Image.Image, height: int, width: int, num_frames: int):
     first_img = first_img.resize((width, height))
     last_img = last_img.resize((width, height))
     frames = []
     # Ideally, this should be 127.5 to match original code, but they perform computation on numpy arrays
     # whereas we are passing PIL images. If you choose to pass numpy arrays, you can set it to 127.5 to
     # match the original code.
+    frames.extend([Image.new("RGB", (width, height), (128, 128, 128))] * (num_frames - 2))
     frames.append(last_img)
+    mask_black = Image.new("L", (width, height), 0)
+    mask_white = Image.new("L", (width, height), 255)
     mask = [mask_black, *[mask_white] * (num_frames - 2), mask_black]
     return frames, mask
+def prepare_video_and_mask_Random2V(images: List[Image.Image], frame_indices: List[int], height: int, width: int, num_frames: int):
     images = [img.resize((width, height)) for img in images]
     # Ideally, this should be 127.5 to match original code, but they perform computation on numpy arrays
     # whereas we are passing PIL images. If you choose to pass numpy arrays, you can set it to 127.5 to
     # match the original code.
+    frames = [Image.new("RGB", (width, height), (128, 128, 128))] * num_frames
+    mask_black = Image.new("L", (width, height), 0)
+    mask_white = Image.new("L", (width, height), 255)
     mask = [mask_white] * num_frames
     for img, idx in zip(images, frame_indices):
+        assert idx < num_frames, f"Frame index {idx} exceeds num_frames {num_frames}"
         frames[idx] = img
         mask[idx] = mask_black
     """
     if gallery_images is None or len(gallery_images) == 0:
         raise gr.Error("Please upload at least one image to the gallery.")
+    else:
+        gallery_images = [img[0] for img in gallery_images]
     if mode == "FLF2V" and len(gallery_images) >= 2:
         gallery_images = gallery_images[:2]
     elif mode == "FLF2V" and len(gallery_images) < 2:
+        raise gr.Error("FLF2V mode requires at least 2 images, but only {} were supplied.".format(len(gallery_images)))
     target_h = max(MOD_VALUE, (int(height) // MOD_VALUE) * MOD_VALUE)
     target_w = max(MOD_VALUE, (int(width) // MOD_VALUE) * MOD_VALUE)
     current_seed = random.randint(0, MAX_SEED) if randomize_seed else int(seed)
     # Process images based on the selected mode
     if mode == "FLF2V":
+        frames, mask = prepare_video_and_mask_FLF2V(
+            first_img=gallery_images[0],
+            last_img=gallery_images[1],
+            height=target_h,
+            width=target_w,
+            num_frames=num_frames
+        )
+        reference_images = None
     elif mode == "Ref2V":
         frames, mask = prepare_video_and_mask_Ref2V(height=target_h, width=target_w, num_frames=num_frames)
+        reference_images = gallery_images
+    else:  # mode == "Random2V"
+        frames, mask = prepare_video_and_mask_Random2V(
+            images=gallery_images,
+            frame_indices=[0,20,40], # TODO: generalize - indices are hard-coded; each must be < num_frames, and images beyond the third are ignored
+            height=target_h,
+            width=target_w,
+            num_frames=num_frames
+        )
+        reference_images = None
     with torch.inference_mode():
         output_frames_list = pipe(
     return video_path, current_seed
 with gr.Blocks() as demo:
+    gr.Markdown("# Wan 2.1 VACE (14B) with Phantom & Detail Enhancer LoRAs - Multi-Image Gallery")
+    gr.Markdown("Using [Wan2.1-VACE-14B](https://huggingface.co/Wan-AI/Wan2.1-VACE-14B-diffusers) with Phantom FusionX and Detail Enhancer LoRAs for advanced video generation with multiple conditioning modes.")
     with gr.Row():
         with gr.Column():
                 choices=["Ref2V", "FLF2V", "Random2V"],
                 value="Ref2V",
                 label="Processing Mode",
+                info="Ref2V: Reference to Video | FLF2V: First-Last Frame to Video | Random2V: Random frames to Video"
             )
             prompt_input = gr.Textbox(label="Prompt", value=MODE_PROMPTS["Ref2V"])
+            duration_seconds_input = gr.Slider(
+                minimum=round(MIN_FRAMES_MODEL/FIXED_FPS,1),
+                maximum=round(MAX_FRAMES_MODEL/FIXED_FPS,1),
+                step=0.1,
+                value=2,
+                label="Duration (seconds)",
+                info=f"Clamped to model's {MIN_FRAMES_MODEL}-{MAX_FRAMES_MODEL} frames at {FIXED_FPS}fps."
+            )
             with gr.Accordion("Advanced Settings", open=False):
                 negative_prompt_input = gr.Textbox(label="Negative Prompt", value=default_negative_prompt, lines=3)
         with gr.Column():
             video_output = gr.Video(label="Generated Video", autoplay=True, interactive=False)
+            with gr.Accordion("Mode Information", open=False):
                 gr.Markdown("""
                 **Processing Modes:**
+                - **Ref2V**: Uses uploaded images as style references for video generation. All frames are generated based on the reference images.
+                - **FLF2V**: First-Last Frame mode - uses the first and last images as keyframes and generates the frames in between (requires at least 2 images; only the first two are used)
+                - **Random2V**: Places uploaded images at specific frames in the video and generates the rest. Images are currently placed at fixed frame indices (0, 20, 40).
+
+                **Note**: The VACE pipeline supports advanced conditioning with masks and reference images for more control over generation.
                 """)
     # Update prompt when mode changes