Update app.py
app.py CHANGED
@@ -21,12 +21,12 @@ pipe.load_lora_weights(
     weight_name="FusionX_LoRa/Phantom_Wan_14B_FusionX_LoRA.safetensors",
     adapter_name="phantom"
 )
-pipe.load_lora_weights(
-    "vrgamedevgirl84/Wan14BT2VFusioniX",
-    weight_name="OtherLoRa's/DetailEnhancerV1.safetensors", adapter_name="detailer"
-)
-pipe.set_adapters(["phantom","detailer"], adapter_weights=[1, .9])
-pipe.fuse_lora()
+# pipe.load_lora_weights(
+#     "vrgamedevgirl84/Wan14BT2VFusioniX",
+#     weight_name="OtherLoRa's/DetailEnhancerV1.safetensors", adapter_name="detailer"
+# )
+# pipe.set_adapters(["phantom","detailer"], adapter_weights=[1, .9])
+# pipe.fuse_lora()

 MOD_VALUE = 32
 DEFAULT_H_SLIDER_VALUE = 512
@@ -77,7 +77,7 @@ def handle_gallery_upload_for_dims_wan(gallery_images, current_h_val, current_w_
         return gr.update(value=DEFAULT_H_SLIDER_VALUE), gr.update(value=DEFAULT_W_SLIDER_VALUE)
     try:
         # Use the first image to calculate dimensions
-        first_image = gallery_images[0]
+        first_image = gallery_images[0][0]
         new_h, new_w = _calculate_new_dimensions_wan(
             first_image, MOD_VALUE, NEW_FORMULA_MAX_AREA,
             SLIDER_MIN_H, SLIDER_MAX_H, SLIDER_MIN_W, SLIDER_MAX_W,
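The switch to gallery_images[0][0] here, together with the matching img[0] unpacking added in generate_video further down, assumes the Gradio gallery delivers its value as a list of (image, caption) pairs rather than bare images. A minimal sketch of that assumed shape, using placeholder images only:

    from PIL import Image

    # Assumed gr.Gallery value shape: a list of (image, caption) pairs,
    # so value[0][0] is the first image itself rather than the pair.
    gallery_value = [
        (Image.new("RGB", (64, 64)), None),           # placeholder image, no caption
        (Image.new("RGB", (64, 64)), "second item"),  # placeholder image with caption
    ]
    first_image = gallery_value[0][0]
    print(first_image.size)  # (64, 64)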
@@ -118,6 +118,49 @@ def prepare_video_and_mask_FLF2V(first_img: Image.Image, last_img: Image.Image,
     mask = [mask_black, *[mask_white] * (num_frames - 2), mask_black]
     return frames, mask

+def calculate_random2v_frame_indices(num_images: int, num_frames: int) -> List[int]:
+    """
+    Calculate evenly spaced frame indices for Random2V mode.
+
+    Args:
+        num_images (int): Number of input images
+        num_frames (int): Total number of frames in the video
+
+    Returns:
+        List[int]: Frame indices where images should be placed
+    """
+    if num_images <= 0:
+        return []
+
+    if num_images == 1:
+        # Single image goes in the middle
+        return [num_frames // 2]
+
+    if num_images >= num_frames:
+        # More images than frames, use every frame
+        return list(range(num_frames))
+
+    # Calculate evenly spaced indices
+    # We want to distribute images across the full duration
+    indices = []
+    step = (num_frames - 1) / (num_images - 1)
+
+    for i in range(num_images):
+        frame_idx = int(round(i * step))
+        # Ensure we don't exceed num_frames - 1
+        frame_idx = min(frame_idx, num_frames - 1)
+        indices.append(frame_idx)
+
+    # Remove duplicates while preserving order
+    seen = set()
+    unique_indices = []
+    for idx in indices:
+        if idx not in seen:
+            seen.add(idx)
+            unique_indices.append(idx)
+
+    return unique_indices
+
 def prepare_video_and_mask_Random2V(images: List[Image.Image], frame_indices: List[int], height: int, width: int, num_frames: int):
     images = [img.resize((width, height)) for img in images]
     # Ideally, this should be 127.5 to match original code, but they perform computation on numpy arrays
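To sanity-check the even-spacing rule that calculate_random2v_frame_indices introduces, the snippet below re-derives the same formula in isolation; it does not import app.py, and the 81-frame count is only an illustrative value:

    # Standalone re-derivation of the spacing rule (same formula, trimmed down).
    def spaced_indices(num_images, num_frames):
        if num_images == 1:
            return [num_frames // 2]
        step = (num_frames - 1) / (num_images - 1)
        return sorted({min(round(i * step), num_frames - 1) for i in range(num_images)})

    print(spaced_indices(1, 81))  # [40]        -> single image lands on the middle frame
    print(spaced_indices(2, 81))  # [0, 80]     -> first and last frame
    print(spaced_indices(3, 81))  # [0, 40, 80] -> spread evenly across the clip

Duplicate indices only collapse when images outnumber the distinct frames available, which is what the dedup loop in the new function handles.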
@@ -176,6 +219,8 @@ def generate_video(gallery_images, mode, prompt, height, width,
     """
     if gallery_images is None or len(gallery_images) == 0:
         raise gr.Error("Please upload at least one image to the gallery.")
+    else:
+        gallery_images = [img[0] for img in gallery_images]

     if mode == "FLF2V" and len(gallery_images) >= 2:
         gallery_images = gallery_images[:2]
@@ -201,26 +246,14 @@ def generate_video(gallery_images, mode, prompt, height, width,
         reference_images = None
     elif mode == "Ref2V":
         frames, mask = prepare_video_and_mask_Ref2V(height=target_h, width=target_w, num_frames=num_frames)
-
-        reference_images = [img.resize((target_w, target_h)) for img in gallery_images]
+        reference_images = gallery_images
     else: # mode == "Random2V"
-        # Calculate
-
-        if num_images == 1:
-            frame_indices = [num_frames // 2] # Place single image in the middle
-        elif num_images == 2:
-            frame_indices = [0, num_frames - 1] # Place at start and end
-        else:
-            # Distribute images evenly across the video
-            # Ensure we don't exceed available frames
-            max_images = min(num_images, num_frames)
-            step = max(1, num_frames // max_images)
-            frame_indices = [min(i * step, num_frames - 1) for i in range(max_images)]
-            gallery_images = gallery_images[:max_images] # Limit images to what we can use
+        # Calculate dynamic frame indices based on number of images and frames
+        frame_indices = calculate_random2v_frame_indices(len(gallery_images), num_frames)

         frames, mask = prepare_video_and_mask_Random2V(
             images=gallery_images,
-            frame_indices=frame_indices,
+            frame_indices=frame_indices,
             height=target_h,
             width=target_w,
             num_frames=num_frames
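prepare_video_and_mask_Random2V itself is largely outside this diff, but by analogy with the FLF2V mask shown earlier (conditioned frames black, generated frames white), the placement driven by the new frame_indices can be pictured as below; this layout is an illustration of the idea, not code taken from app.py:

    # Assumed Random2V-style placement, by analogy with the FLF2V mask above:
    # "keep" at the chosen frame indices, "generate" everywhere else.
    num_frames = 17
    frame_indices = [0, 8, 16]
    layout = ["keep" if i in frame_indices else "generate" for i in range(num_frames)]
    print(layout[0], layout[1], layout[8], layout[16])  # keep generate keep keep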
@@ -247,9 +280,20 @@ def generate_video(gallery_images, mode, prompt, height, width,
     export_to_video(output_frames_list, video_path, fps=FIXED_FPS)
     return video_path, current_seed

+control_modes = """
+**3 control modes available:**
+
+**Ref2V (Reference-to-Video)** Generate a video incorporating elements from input reference images
+
+**FLF2V (First-Last Frame-to-Video)** Generate a video using first and last frame conditioning defined by input images
+
+**Random2V (Random-to-Video)** Generate a video with intermediate transitions between multiple input images
+"""
+
 with gr.Blocks() as demo:
-    gr.Markdown("# Wan 2.1 VACE (14B)")
-    gr.Markdown("Using [Wan2.1-VACE-14B](https://huggingface.co/Wan-AI/Wan2.1-VACE-14B-diffusers)
+    gr.Markdown("# Fast 6 step Wan 2.1 VACE (14B)")
+    gr.Markdown("Using [**Wan2.1-VACE-14B**](https://huggingface.co/Wan-AI/Wan2.1-VACE-14B-diffusers) + [**👻FusionX Phantom LoRA**](https://huggingface.co/vrgamedevgirl84/Wan14BT2VFusioniX) by [**vrgamedevgirl84**](https://huggingface.co/vrgamedevgirl84) with **🧨diffusers**, for fast video generation with multiple conditions 🏎️")
+    gr.Markdown(f"{control_modes}")

     with gr.Row():
         with gr.Column():
@@ -270,8 +314,8 @@ with gr.Blocks() as demo:
     mode_radio = gr.Radio(
         choices=["Ref2V", "FLF2V", "Random2V"],
         value="Ref2V",
-        label="
-        info="Ref2V: Reference to Video | FLF2V: First-Last Frame to Video | Random2V: Random
+        label="Control Mode",
+        info="Ref2V: Reference to Video | FLF2V: First-Last Frame to Video | Random2V: Random to Video"
     )

     prompt_input = gr.Textbox(label="Prompt", value=MODE_PROMPTS["Ref2V"])
@@ -279,7 +323,7 @@ with gr.Blocks() as demo:
         minimum=round(MIN_FRAMES_MODEL/FIXED_FPS,1),
         maximum=round(MAX_FRAMES_MODEL/FIXED_FPS,1),
         step=0.1,
-        value=2,
+        value=2.8,
         label="Duration (seconds)",
         info=f"Clamped to model's {MIN_FRAMES_MODEL}-{MAX_FRAMES_MODEL} frames at {FIXED_FPS}fps."
     )
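The duration slider works in seconds while the model works in frames, so the new 2.8 s default corresponds to roughly duration × FIXED_FPS frames after clamping. The sketch below uses placeholder constants; the real FIXED_FPS and frame limits are defined elsewhere in app.py, and exactly how the product is rounded or offset there is not shown in this diff:

    # Placeholder constants for illustration; real values live elsewhere in app.py.
    FIXED_FPS = 16
    MIN_FRAMES_MODEL, MAX_FRAMES_MODEL = 5, 81

    duration_seconds = 2.8
    num_frames = min(max(round(duration_seconds * FIXED_FPS), MIN_FRAMES_MODEL), MAX_FRAMES_MODEL)
    print(num_frames)  # 45 with these placeholder values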
@@ -291,22 +335,13 @@ with gr.Blocks() as demo:
     with gr.Row():
         height_input = gr.Slider(minimum=SLIDER_MIN_H, maximum=SLIDER_MAX_H, step=MOD_VALUE, value=DEFAULT_H_SLIDER_VALUE, label=f"Output Height (multiple of {MOD_VALUE})")
         width_input = gr.Slider(minimum=SLIDER_MIN_W, maximum=SLIDER_MAX_W, step=MOD_VALUE, value=DEFAULT_W_SLIDER_VALUE, label=f"Output Width (multiple of {MOD_VALUE})")
-        steps_slider = gr.Slider(minimum=1, maximum=
+        steps_slider = gr.Slider(minimum=1, maximum=10, step=1, value=6, label="Inference Steps")
         guidance_scale_input = gr.Slider(minimum=0.0, maximum=5.0, step=0.5, value=1.0, label="Guidance Scale", visible=False)

     generate_button = gr.Button("Generate Video", variant="primary")

     with gr.Column():
         video_output = gr.Video(label="Generated Video", autoplay=True, interactive=False)
-        with gr.Accordion("Mode Information", open=True):
-            gr.Markdown("""
-            **Processing Modes:**
-            - **Ref2V**: Uses uploaded images as style references for video generation. All frames are generated based on the reference images.
-            - **FLF2V**: First-Last Frame mode - uses first and last images as keyframes and generates the frames in between (requires exactly 2 images)
-            - **Random2V**: Places uploaded images at specific frames in the video and generates the rest. Images are distributed evenly across the video duration.
-
-            **Note**: VACE pipeline supports advanced conditioning with masks and reference images for more control over generation.
-            """)

     # Update prompt when mode changes
     mode_radio.change(
@@ -329,6 +364,13 @@ with gr.Blocks() as demo:
     ]

     generate_button.click(fn=generate_video, inputs=ui_inputs, outputs=[video_output, seed_input])
+    gr.Examples(
+        examples=[
+            [["reachy.png", "sunglasses.jpg", "gpu_hat.png"], "Ref2V", "the cute robot is wearing the sunglasses and the hat that reads 'GPU poor', and moves around playfully", 480, 832],
+            [["flf2v_input_first_frame.png", "flf2v_input_last_frame.png"], "FLF2V", "CG animation style, a small blue bird takes off from the ground, flapping its wings. The bird's feathers are delicate, with a unique pattern on its chest. The background shows a blue sky with white clouds under bright sunshine. The camera follows the bird upward, capturing its flight and the vastness of the sky from a close-up, low-angle perspective.", 512, 512],
+        ],
+        inputs=[gallery_component, mode_radio, prompt_input, height_input, width_input], outputs=[video_output, seed_input], fn=generate_video, cache_examples="lazy"
+    )

 if __name__ == "__main__":
     demo.queue().launch(mcp_server=True)