linoyts (HF Staff) committed on
Commit a298541 · verified · 1 parent: dc06ac2

Update app.py

Files changed (1): app.py (+80 -38)
app.py CHANGED
@@ -21,12 +21,12 @@ pipe.load_lora_weights(
     weight_name="FusionX_LoRa/Phantom_Wan_14B_FusionX_LoRA.safetensors",
     adapter_name="phantom"
 )
-pipe.load_lora_weights(
-    "vrgamedevgirl84/Wan14BT2VFusioniX",
-    weight_name="OtherLoRa's/DetailEnhancerV1.safetensors", adapter_name="detailer"
-)
-pipe.set_adapters(["phantom","detailer"], adapter_weights=[1, .9])
-pipe.fuse_lora()
+# pipe.load_lora_weights(
+#     "vrgamedevgirl84/Wan14BT2VFusioniX",
+#     weight_name="OtherLoRa's/DetailEnhancerV1.safetensors", adapter_name="detailer"
+# )
+# pipe.set_adapters(["phantom","detailer"], adapter_weights=[1, .9])
+# pipe.fuse_lora()
 
 MOD_VALUE = 32
 DEFAULT_H_SLIDER_VALUE = 512
@@ -77,7 +77,7 @@ def handle_gallery_upload_for_dims_wan(gallery_images, current_h_val, current_w_
         return gr.update(value=DEFAULT_H_SLIDER_VALUE), gr.update(value=DEFAULT_W_SLIDER_VALUE)
     try:
         # Use the first image to calculate dimensions
-        first_image = gallery_images[0]
+        first_image = gallery_images[0][0]
         new_h, new_w = _calculate_new_dimensions_wan(
             first_image, MOD_VALUE, NEW_FORMULA_MAX_AREA,
             SLIDER_MIN_H, SLIDER_MAX_H, SLIDER_MIN_W, SLIDER_MAX_W,
@@ -118,6 +118,49 @@ def prepare_video_and_mask_FLF2V(first_img: Image.Image, last_img: Image.Image,
     mask = [mask_black, *[mask_white] * (num_frames - 2), mask_black]
     return frames, mask
 
+def calculate_random2v_frame_indices(num_images: int, num_frames: int) -> List[int]:
+    """
+    Calculate evenly spaced frame indices for Random2V mode.
+
+    Args:
+        num_images (int): Number of input images
+        num_frames (int): Total number of frames in the video
+
+    Returns:
+        List[int]: Frame indices where images should be placed
+    """
+    if num_images <= 0:
+        return []
+
+    if num_images == 1:
+        # Single image goes in the middle
+        return [num_frames // 2]
+
+    if num_images >= num_frames:
+        # More images than frames, use every frame
+        return list(range(num_frames))
+
+    # Calculate evenly spaced indices
+    # We want to distribute images across the full duration
+    indices = []
+    step = (num_frames - 1) / (num_images - 1)
+
+    for i in range(num_images):
+        frame_idx = int(round(i * step))
+        # Ensure we don't exceed num_frames - 1
+        frame_idx = min(frame_idx, num_frames - 1)
+        indices.append(frame_idx)
+
+    # Remove duplicates while preserving order
+    seen = set()
+    unique_indices = []
+    for idx in indices:
+        if idx not in seen:
+            seen.add(idx)
+            unique_indices.append(idx)
+
+    return unique_indices
+
 def prepare_video_and_mask_Random2V(images: List[Image.Image], frame_indices: List[int], height: int, width: int, num_frames: int):
     images = [img.resize((width, height)) for img in images]
     # Ideally, this should be 127.5 to match original code, but they perform computation on numpy arrays
@@ -176,6 +219,8 @@ def generate_video(gallery_images, mode, prompt, height, width,
     """
     if gallery_images is None or len(gallery_images) == 0:
         raise gr.Error("Please upload at least one image to the gallery.")
+    else:
+        gallery_images = [img[0] for img in gallery_images]
 
     if mode == "FLF2V" and len(gallery_images) >= 2:
         gallery_images = gallery_images[:2]
@@ -201,26 +246,14 @@ def generate_video(gallery_images, mode, prompt, height, width,
         reference_images = None
     elif mode == "Ref2V":
         frames, mask = prepare_video_and_mask_Ref2V(height=target_h, width=target_w, num_frames=num_frames)
-        # Resize reference images to match target dimensions
-        reference_images = [img.resize((target_w, target_h)) for img in gallery_images]
+        reference_images = gallery_images
     else: # mode == "Random2V"
-        # Calculate appropriate frame indices based on number of images and frames
-        num_images = len(gallery_images)
-        if num_images == 1:
-            frame_indices = [num_frames // 2] # Place single image in the middle
-        elif num_images == 2:
-            frame_indices = [0, num_frames - 1] # Place at start and end
-        else:
-            # Distribute images evenly across the video
-            # Ensure we don't exceed available frames
-            max_images = min(num_images, num_frames)
-            step = max(1, num_frames // max_images)
-            frame_indices = [min(i * step, num_frames - 1) for i in range(max_images)]
-            gallery_images = gallery_images[:max_images] # Limit images to what we can use
+        # Calculate dynamic frame indices based on number of images and frames
+        frame_indices = calculate_random2v_frame_indices(len(gallery_images), num_frames)
 
         frames, mask = prepare_video_and_mask_Random2V(
             images=gallery_images,
-            frame_indices=frame_indices,
+            frame_indices=frame_indices,
             height=target_h,
             width=target_w,
             num_frames=num_frames
@@ -247,9 +280,20 @@ def generate_video(gallery_images, mode, prompt, height, width,
     export_to_video(output_frames_list, video_path, fps=FIXED_FPS)
     return video_path, current_seed
 
+control_modes = """
+**3 control modes available:**
+
+**Ref2V (Reference-to-Video)** Generate a video incorporating elements from input reference images
+
+**FLF2V (First-Last Frame-to-Video)** Generate a video using first and last frame conditioning defined by input images
+
+**Random2V (Random-to-Video)** Generate a video with intermediate transitions between multiple input images
+"""
+
 with gr.Blocks() as demo:
-    gr.Markdown("# Wan 2.1 VACE (14B) with Phantom & Detail Enhancer LoRAs - Multi-Image Gallery")
-    gr.Markdown("Using [Wan2.1-VACE-14B](https://huggingface.co/Wan-AI/Wan2.1-VACE-14B-diffusers) with Phantom FusionX and Detail Enhancer LoRAs for advanced video generation with multiple conditioning modes.")
+    gr.Markdown("# Fast 6 step Wan 2.1 VACE (14B)")
+    gr.Markdown("Using [**Wan2.1-VACE-14B**](https://huggingface.co/Wan-AI/Wan2.1-VACE-14B-diffusers) + [**👻FusionX Phantom LoRA**](https://huggingface.co/vrgamedevgirl84/Wan14BT2VFusioniX) by [**vrgamedevgirl84**](https://huggingface.co/vrgamedevgirl84) with **🧨diffusers**, for fast video generation with multiple conditions 🏎️")
+    gr.Markdown(f"{control_modes}")
 
     with gr.Row():
         with gr.Column():
@@ -270,8 +314,8 @@ with gr.Blocks() as demo:
             mode_radio = gr.Radio(
                 choices=["Ref2V", "FLF2V", "Random2V"],
                 value="Ref2V",
-                label="Processing Mode",
-                info="Ref2V: Reference to Video | FLF2V: First-Last Frame to Video | Random2V: Random frames to Video"
+                label="Control Mode",
+                info="Ref2V: Reference to Video | FLF2V: First-Last Frame to Video | Random2V: Random to Video"
             )
 
             prompt_input = gr.Textbox(label="Prompt", value=MODE_PROMPTS["Ref2V"])
@@ -279,7 +323,7 @@ with gr.Blocks() as demo:
                 minimum=round(MIN_FRAMES_MODEL/FIXED_FPS,1),
                 maximum=round(MAX_FRAMES_MODEL/FIXED_FPS,1),
                 step=0.1,
-                value=2,
+                value=2.8,
                 label="Duration (seconds)",
                 info=f"Clamped to model's {MIN_FRAMES_MODEL}-{MAX_FRAMES_MODEL} frames at {FIXED_FPS}fps."
             )
@@ -291,22 +335,13 @@ with gr.Blocks() as demo:
             with gr.Row():
                 height_input = gr.Slider(minimum=SLIDER_MIN_H, maximum=SLIDER_MAX_H, step=MOD_VALUE, value=DEFAULT_H_SLIDER_VALUE, label=f"Output Height (multiple of {MOD_VALUE})")
                 width_input = gr.Slider(minimum=SLIDER_MIN_W, maximum=SLIDER_MAX_W, step=MOD_VALUE, value=DEFAULT_W_SLIDER_VALUE, label=f"Output Width (multiple of {MOD_VALUE})")
-            steps_slider = gr.Slider(minimum=1, maximum=30, step=1, value=6, label="Inference Steps")
+            steps_slider = gr.Slider(minimum=1, maximum=10, step=1, value=6, label="Inference Steps")
             guidance_scale_input = gr.Slider(minimum=0.0, maximum=5.0, step=0.5, value=1.0, label="Guidance Scale", visible=False)
 
             generate_button = gr.Button("Generate Video", variant="primary")
 
         with gr.Column():
             video_output = gr.Video(label="Generated Video", autoplay=True, interactive=False)
-            with gr.Accordion("Mode Information", open=True):
-                gr.Markdown("""
-                **Processing Modes:**
-                - **Ref2V**: Uses uploaded images as style references for video generation. All frames are generated based on the reference images.
-                - **FLF2V**: First-Last Frame mode - uses first and last images as keyframes and generates the frames in between (requires exactly 2 images)
-                - **Random2V**: Places uploaded images at specific frames in the video and generates the rest. Images are distributed evenly across the video duration.
-
-                **Note**: VACE pipeline supports advanced conditioning with masks and reference images for more control over generation.
-                """)
 
     # Update prompt when mode changes
     mode_radio.change(
@@ -329,6 +364,13 @@ with gr.Blocks() as demo:
     ]
 
     generate_button.click(fn=generate_video, inputs=ui_inputs, outputs=[video_output, seed_input])
+    gr.Examples(
+        examples=[
+            [["reachy.png", "sunglasses.jpg", "gpu_hat.png"], "Ref2V", "the cute robot is wearing the sunglasses and the hat that reads 'GPU poor', and moves around playfully", 480, 832],
+            [["flf2v_input_first_frame.png", "flf2v_input_last_frame.png"], "FLF2V", "CG animation style, a small blue bird takes off from the ground, flapping its wings. The bird's feathers are delicate, with a unique pattern on its chest. The background shows a blue sky with white clouds under bright sunshine. The camera follows the bird upward, capturing its flight and the vastness of the sky from a close-up, low-angle perspective.", 512, 512],
+        ],
+        inputs=[gallery_component, mode_radio, prompt_input, height_input, width_input], outputs=[video_output, seed_input], fn=generate_video, cache_examples="lazy"
+    )
 
 if __name__ == "__main__":
     demo.queue().launch(mcp_server=True)
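
A few notes on the changes, with sketches where they help.

On the first hunk: the detail-enhancer LoRA is commented out rather than deleted, together with the set_adapters and fuse_lora calls that blended it with the Phantom adapter. For reference, this is the multi-adapter pattern those lines used (standard diffusers API, with the same repo and file names as the original code; it assumes pipe is the pipeline constructed earlier in app.py):

# Multi-adapter LoRA pattern this commit disables (diffusers API; assumes
# `pipe` is the Wan VACE pipeline already built earlier in app.py).
pipe.load_lora_weights(
    "vrgamedevgirl84/Wan14BT2VFusioniX",
    weight_name="OtherLoRa's/DetailEnhancerV1.safetensors",
    adapter_name="detailer",
)
pipe.set_adapters(["phantom", "detailer"], adapter_weights=[1.0, 0.9])  # blend the two adapters
pipe.fuse_lora()  # merge the blended LoRA weights into the base model for faster inference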
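On the new calculate_random2v_frame_indices helper: here is a quick standalone check of its behavior. The logic is copied from the hunk above; the 81-frame clip length below is illustrative only, not a value taken from the app.

from typing import List

def calculate_random2v_frame_indices(num_images: int, num_frames: int) -> List[int]:
    # Copied from this commit: evenly spaced frame indices for Random2V mode.
    if num_images <= 0:
        return []
    if num_images == 1:
        return [num_frames // 2]        # a single image goes in the middle
    if num_images >= num_frames:
        return list(range(num_frames))  # more images than frames: use every frame
    indices = []
    step = (num_frames - 1) / (num_images - 1)
    for i in range(num_images):
        indices.append(min(int(round(i * step)), num_frames - 1))
    # Remove duplicates while preserving order (rounding can collide).
    seen, unique = set(), []
    for idx in indices:
        if idx not in seen:
            seen.add(idx)
            unique.append(idx)
    return unique

print(calculate_random2v_frame_indices(1, 81))  # [40]
print(calculate_random2v_frame_indices(2, 81))  # [0, 80]
print(calculate_random2v_frame_indices(3, 81))  # [0, 40, 80]
print(calculate_random2v_frame_indices(4, 81))  # [0, 27, 53, 80]

Compared with the inline logic it replaces, which stepped by num_frames // max_images, the helper pins the last image to frame num_frames - 1, so three or more images now span the whole clip instead of clustering toward the start.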