wan2-1-VACE-fast

Running on Zero

App Files Files Community

linoyts HF Staff committed on Jul 25

Commit

c1a8afc

verified ·

1 Parent(s): c5dbef0

Update app.py

Browse files

Files changed (1) hide show

app.py +136 -9

app.py CHANGED Viewed

@@ -9,11 +9,15 @@ from huggingface_hub import hf_hub_download
 import numpy as np
 from PIL import Image
 import random
 model_id = "Wan-AI/Wan2.1-VACE-14B-diffusers"
 vae = AutoencoderKLWan.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.float32)
 pipe = WanVACEPipeline.from_pretrained(model_id, vae=vae, torch_dtype=torch.bfloat16).to("cuda")
 pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config, flow_shift=2.0)
 pipe.load_lora_weights(
@@ -89,6 +93,91 @@ def remove_alpha_channel(image: Image.Image) -> Image.Image:
         return image
 def _calculate_new_dimensions_wan(pil_image, mod_val, calculation_max_area,
                                  min_slider_h, max_slider_h,
                                  min_slider_w, max_slider_w,
@@ -228,20 +317,26 @@ def prepare_video_and_mask_Random2V(images: List[Image.Image], frame_indices: Li
 def get_duration(gallery_images, mode, prompt, height, width,
                    negative_prompt, duration_seconds,
                    guidance_scale, steps,
-                   seed, randomize_seed,
                    progress):
     if steps > 4 and duration_seconds > 2:
-        return 90
     elif steps > 4 or duration_seconds > 2:
-        return 75
-    else:
-        return 60
 @spaces.GPU(duration=get_duration)
 def generate_video(gallery_images, mode, prompt, height, width,
                    negative_prompt=default_negative_prompt, duration_seconds = 2,
                    guidance_scale = 1, steps = 4,
-                   seed = 42, randomize_seed = False,
                    progress=gr.Progress(track_tqdm=True)):
     """
     Generate a video from gallery images using the selected mode.
@@ -258,6 +353,7 @@ def generate_video(gallery_images, mode, prompt, height, width,
         steps (int): Number of inference steps
         seed (int): Random seed for reproducible results
         randomize_seed (bool): Whether to use a random seed
         progress (gr.Progress): Gradio progress tracker
     Returns:
@@ -266,8 +362,20 @@ def generate_video(gallery_images, mode, prompt, height, width,
     if gallery_images is None or len(gallery_images) == 0:
         raise gr.Error("Please upload at least one image to the gallery.")
     else:
-        # Remove alpha channels from all uploaded images
-        gallery_images = [remove_alpha_channel(img[0]) for img in gallery_images]
     if mode == "FLF2V" and len(gallery_images) >= 2:
         gallery_images = gallery_images[:2]
@@ -377,6 +485,14 @@ with gr.Blocks() as demo:
             with gr.Accordion("Advanced Settings", open=False):
                 negative_prompt_input = gr.Textbox(label="Negative Prompt", value=default_negative_prompt, lines=3)
                 seed_input = gr.Slider(label="Seed", minimum=0, maximum=MAX_SEED, step=1, value=42, interactive=True)
                 randomize_seed_checkbox = gr.Checkbox(label="Randomize seed", value=True, interactive=True)
                 with gr.Row():
@@ -390,6 +506,10 @@ with gr.Blocks() as demo:
         with gr.Column():
             video_output = gr.Video(label="Generated Video", autoplay=True, interactive=False)
     # Update prompt when mode changes
     mode_radio.change(
         fn=update_prompt_from_mode,
@@ -397,6 +517,13 @@ with gr.Blocks() as demo:
         outputs=[prompt_input]
     )
     # Update dimensions when gallery changes
     gallery_component.change(
         fn=handle_gallery_upload_for_dims_wan,
@@ -407,7 +534,7 @@ with gr.Blocks() as demo:
     ui_inputs = [
         gallery_component, mode_radio, prompt_input, height_input, width_input,
         negative_prompt_input, duration_seconds_input,
-        guidance_scale_input, steps_slider, seed_input, randomize_seed_checkbox
     ]
     generate_button.click(fn=generate_video, inputs=ui_inputs, outputs=[video_output, seed_input])

 import numpy as np
 from PIL import Image
 import random
+from briarmbg import BriaRMBG
 model_id = "Wan-AI/Wan2.1-VACE-14B-diffusers"
 vae = AutoencoderKLWan.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.float32)
 pipe = WanVACEPipeline.from_pretrained(model_id, vae=vae, torch_dtype=torch.bfloat16).to("cuda")
+# Initialize background removal model
+rmbg = BriaRMBG.from_pretrained("briaai/RMBG-1.4").to("cuda", dtype=torch.float32)
 pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config, flow_shift=2.0)
 pipe.load_lora_weights(
         return image
@torch.inference_mode()
def numpy2pytorch(imgs):
    """
    Stack a sequence of image arrays into one float tensor.
    Args:
        imgs: Sequence of equally-shaped numpy arrays; they are stacked along a new leading axis
    Returns:
        torch.Tensor: Float tensor whose last input axis has been moved to position 1,
            with values mapped through ``x / 127.0 - 1.0``
    """
    stacked = np.stack(imgs, axis=0)
    batch = torch.from_numpy(stacked).float()
    # Dividing by 127.0 (rather than 127.5) makes a pixel value of 127 map to exactly 0.0.
    batch = batch / 127.0 - 1.0
    # Move the trailing axis (channels, for H x W x C input) right after the batch axis.
    return batch.movedim(-1, 1)
@torch.inference_mode()
def pytorch2numpy(imgs, quant=True):
    """
    Convert an iterable of tensors into a list of numpy arrays.
    Args:
        imgs: Iterable of tensors; axis 0 of each item is moved to the last position
        quant (bool): If True, map values through ``x * 127.5 + 127.5``, clip to [0, 255]
            and return uint8; otherwise map through ``x * 0.5 + 0.5``, clip to [0, 1]
            and return float32
    Returns:
        list: One numpy array per input tensor
    """
    def _convert(tensor):
        # Leading axis goes last (channel-first -> channel-last for C x H x W input).
        channel_last = tensor.movedim(0, -1)
        if quant:
            scaled = channel_last * 127.5 + 127.5
            return scaled.detach().float().cpu().numpy().clip(0, 255).astype(np.uint8)
        scaled = channel_last * 0.5 + 0.5
        return scaled.detach().float().cpu().numpy().clip(0, 1).astype(np.float32)

    return [_convert(tensor) for tensor in imgs]
def resize_without_crop(image, target_width, target_height):
    """
    Resize an image array to exactly the requested size using Lanczos resampling.
    The aspect ratio is not preserved and nothing is cropped.
    Args:
        image (np.ndarray): Input image array accepted by ``Image.fromarray``
        target_width (int): Output width in pixels
        target_height (int): Output height in pixels
    Returns:
        np.ndarray: The resized image as a numpy array
    """
    target_size = (target_width, target_height)
    return np.array(Image.fromarray(image).resize(target_size, Image.LANCZOS))
@torch.inference_mode()
def run_rmbg(img, sigma=0.0):
    """
    Remove background from image using BriaRMBG model.

    The image is resized so that it covers roughly 1024 * 1024 pixels with both
    sides multiples of 64, run through the module-level ``rmbg`` network on
    CUDA, and the predicted matte is resized back to the input resolution.

    Args:
        img (np.ndarray): Input image as numpy array (H, W, 3)
        sigma (float): Offset added to the 127-centred pixel values before they
            are weighted by the alpha matte
    Returns:
        tuple: (result_image, alpha_mask) where result_image is a uint8 array in
            which pixels are pulled towards mid-gray (127) as alpha approaches 0,
            and alpha_mask is the float matte clipped to [0, 1] that keeps a
            trailing channel axis
    Raises:
        ValueError: If img is not a non-empty (H, W, 3) array
    """
    # Explicit validation instead of `assert`, which disappears under `python -O`.
    if img.ndim != 3 or img.shape[2] != 3:
        raise ValueError(
            f"run_rmbg expects an (H, W, 3) image array, got shape {tuple(img.shape)}"
        )
    H, W, _ = img.shape
    if H == 0 or W == 0:
        raise ValueError(f"run_rmbg received an empty image of shape {tuple(img.shape)}")

    # k scales the image so that (64 * W * k) * (64 * H * k) is about 1024 * 1024.
    k = (256.0 / float(H * W)) ** 0.5
    # Clamp to at least one 64-pixel step: for extreme aspect ratios round() can
    # yield 0, which would request a zero-sized resize.
    feed_w = 64 * max(1, int(round(W * k)))
    feed_h = 64 * max(1, int(round(H * k)))
    feed = resize_without_crop(img, feed_w, feed_h)
    feed = numpy2pytorch([feed]).to(device="cuda", dtype=torch.float32)

    # NOTE(review): takes the first element of the first output of the network;
    # presumably the full-resolution matte - confirm against BriaRMBG.forward.
    alpha = rmbg(feed)[0][0]
    alpha = torch.nn.functional.interpolate(alpha, size=(H, W), mode="bilinear")
    # (N, C, H, W) -> (H, W, C) for the single batch item, so it broadcasts against img.
    alpha = alpha.movedim(1, -1)[0]
    alpha = alpha.detach().float().cpu().numpy().clip(0, 1)

    result = 127 + (img.astype(np.float32) - 127 + sigma) * alpha
    return result.clip(0, 255).astype(np.uint8), alpha
def remove_background_from_image(image: Image.Image) -> Image.Image:
    """
    Remove background from PIL Image using RMBG model.
    Args:
        image (Image.Image): Input PIL image in any mode; it is converted to RGB
            before being passed to the model
    Returns:
        Image.Image: RGBA image whose alpha channel is the predicted matte
            (transparent background)
    """
    # run_rmbg only accepts 3-channel arrays, so normalise RGBA / grayscale /
    # palette uploads to RGB first instead of letting them crash.
    img_array = np.array(image.convert("RGB"))

    # Remove background using RMBG
    result_array, alpha_mask = run_rmbg(img_array)

    # The matte comes back with a trailing channel axis; PIL needs a plain 2-D
    # array to build a single-band "L" image.
    alpha_2d = np.asarray(alpha_mask, dtype=np.float32)
    if alpha_2d.ndim == 3:
        alpha_2d = alpha_2d[..., 0]
    alpha_pil = Image.fromarray((alpha_2d * 255).astype(np.uint8))

    # Convert back to PIL and attach the matte as the alpha channel
    result_image = Image.fromarray(result_array).convert("RGBA")
    result_image.putalpha(alpha_pil)
    return result_image
 def _calculate_new_dimensions_wan(pil_image, mod_val, calculation_max_area,
                                  min_slider_h, max_slider_h,
                                  min_slider_w, max_slider_w,
 def get_duration(gallery_images, mode, prompt, height, width,
                    negative_prompt, duration_seconds,
                    guidance_scale, steps,
+                   seed, randomize_seed, remove_bg,
                    progress):
+    # Add extra time if background removal is enabled
+    base_duration = 60
     if steps > 4 and duration_seconds > 2:
+        base_duration = 90
     elif steps > 4 or duration_seconds > 2:
+        base_duration = 75
+    # Add extra time for background removal processing
+    if mode == "Ref2V" and remove_bg:
+        base_duration += 30
+    return base_duration
 @spaces.GPU(duration=get_duration)
 def generate_video(gallery_images, mode, prompt, height, width,
                    negative_prompt=default_negative_prompt, duration_seconds = 2,
                    guidance_scale = 1, steps = 4,
+                   seed = 42, randomize_seed = False, remove_bg = False,
                    progress=gr.Progress(track_tqdm=True)):
     """
     Generate a video from gallery images using the selected mode.
         steps (int): Number of inference steps
         seed (int): Random seed for reproducible results
         randomize_seed (bool): Whether to use a random seed
+        remove_bg (bool): Whether to remove background from images (Ref2V mode only)
         progress (gr.Progress): Gradio progress tracker
     Returns:
     if gallery_images is None or len(gallery_images) == 0:
         raise gr.Error("Please upload at least one image to the gallery.")
     else:
+        # Process images: remove background if requested (Ref2V mode only), then remove alpha channels
+        processed_images = []
+        for img in gallery_images:
+            image = img[0]  # Extract PIL image from gallery format
+            # Apply background removal only for Ref2V mode if checkbox is checked
+            if mode == "Ref2V" and remove_bg:
+                image = remove_background_from_image(image)
+            # Always remove alpha channels to ensure RGB format
+            image = remove_alpha_channel(image)
+            processed_images.append(image)
+        gallery_images = processed_images
     if mode == "FLF2V" and len(gallery_images) >= 2:
         gallery_images = gallery_images[:2]
             with gr.Accordion("Advanced Settings", open=False):
                 negative_prompt_input = gr.Textbox(label="Negative Prompt", value=default_negative_prompt, lines=3)
+                # Background removal checkbox (only for Ref2V mode)
+                remove_bg_checkbox = gr.Checkbox(
+                    label="Remove Background (Ref2V mode only)",
+                    value=False,
+                    info="Automatically remove background from input images when using Ref2V mode"
+                )
                 seed_input = gr.Slider(label="Seed", minimum=0, maximum=MAX_SEED, step=1, value=42, interactive=True)
                 randomize_seed_checkbox = gr.Checkbox(label="Randomize seed", value=True, interactive=True)
                 with gr.Row():
         with gr.Column():
             video_output = gr.Video(label="Generated Video", autoplay=True, interactive=False)
+    # Function to update checkbox visibility based on mode
+    def update_bg_removal_visibility(mode):
+        return gr.update(visible=(mode == "Ref2V"))
     # Update prompt when mode changes
     mode_radio.change(
         fn=update_prompt_from_mode,
         outputs=[prompt_input]
     )
+    # Update background removal checkbox visibility when mode changes
+    mode_radio.change(
+        fn=update_bg_removal_visibility,
+        inputs=[mode_radio],
+        outputs=[remove_bg_checkbox]
+    )
     # Update dimensions when gallery changes
     gallery_component.change(
         fn=handle_gallery_upload_for_dims_wan,
     ui_inputs = [
         gallery_component, mode_radio, prompt_input, height_input, width_input,
         negative_prompt_input, duration_seconds_input,
+        guidance_scale_input, steps_slider, seed_input, randomize_seed_checkbox, remove_bg_checkbox
     ]
     generate_button.click(fn=generate_video, inputs=ui_inputs, outputs=[video_output, seed_input])