Lemonator committed on
Commit 71c32c3 · verified · 1 Parent(s): 4c12131

Update app_lora.py


Fixed Wan2.1 I2V Gradio App
I've fixed several key issues in your code to properly support 240-frame video generation:

Key Changes Made:

Memory Management:
- Changed from bfloat16 to float16 for better compatibility
- Added enable_model_cpu_offload(), enable_vae_slicing(), and enable_vae_tiling() for memory efficiency
- Added GPU cache clearing before and after generation
- Used torch.autocast for automatic mixed precision

Dynamic Resolution Scaling:
- For videos longer than 120 frames (5 seconds), automatically reduces resolution when the longer side exceeds 768 px
- This prevents out-of-memory errors for long videos

Improved Video Export:
- Added support for imageio as the primary export method (better quality)
- Falls back to the existing export_to_video (OpenCV) path if imageio is not available
- Fixed FFmpeg optimization with proper error handling

Better Error Handling:
- Specific handling for CUDA out-of-memory errors
- Proper file-existence checks before FFmpeg optimization
- More informative error messages

UI Improvements:
- Added a queue size limit (max_size=3) to prevent overload
- Added tips for users about optimal settings for long videos
- Better progress indication

Frame Calculation:
- Kept the proper frame alignment (num_frames - 1 divisible by 4); see the sketch after this list
- Clear logging of requested vs. actual frames
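As a quick illustration of that alignment rule, here is a minimal, hypothetical sketch (the helper name aligned_frame_count is not part of app_lora.py; the constants mirror the ones defined in the file):

```python
import numpy as np

FIXED_FPS = 24
MIN_FRAMES_MODEL = 8    # ~0.33 s minimum
MAX_FRAMES_MODEL = 240  # 10 s at 24 fps

def aligned_frame_count(duration_seconds: float) -> int:
    """Snap a requested duration to a frame count where (num_frames - 1) % 4 == 0."""
    raw_frames = int(round(duration_seconds * FIXED_FPS))   # requested frames
    num_frames = ((raw_frames - 1) // 4) * 4 + 1            # align down to 4k + 1
    return int(np.clip(num_frames, MIN_FRAMES_MODEL, MAX_FRAMES_MODEL))

print(aligned_frame_count(3.0))   # 72 requested -> 69 generated
print(aligned_frame_count(10.0))  # 240 requested -> 237 generated
```

Because the alignment always rounds down, the 10-second maximum actually produces 237 frames rather than 240; the clamp to MAX_FRAMES_MODEL only acts as an upper bound.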

Files changed (1):
app_lora.py  +133 −51
app_lora.py CHANGED

@@ -5,6 +5,8 @@ from diffusers.utils import export_to_video
 from transformers import CLIPVisionModel
 import gradio as gr
 import tempfile
+import os
+import subprocess
 
 from huggingface_hub import hf_hub_download
 import numpy as np
@@ -19,15 +21,19 @@ MODEL_ID = "Wan-AI/Wan2.1-I2V-14B-480P-Diffusers"
 
 LORA_REPO_ID = "vrgamedevgirl84/Wan14BT2VFusioniX"
 LORA_FILENAME = "FusionX_LoRa/Wan2.1_I2V_14B_FusionX_LoRA.safetensors"
-#LORA_FILENAME = "Wan14Bi2vFusioniX_fp16.safetensors"
 
-image_encoder = CLIPVisionModel.from_pretrained(MODEL_ID, subfolder="image_encoder", torch_dtype=torch.float32)
-vae = AutoencoderKLWan.from_pretrained(MODEL_ID, subfolder="vae", torch_dtype=torch.float32)
+# Initialize models with proper dtype handling
+image_encoder = CLIPVisionModel.from_pretrained(MODEL_ID, subfolder="image_encoder", torch_dtype=torch.float16)
+vae = AutoencoderKLWan.from_pretrained(MODEL_ID, subfolder="vae", torch_dtype=torch.float16)
 pipe = WanImageToVideoPipeline.from_pretrained(
-    MODEL_ID, vae=vae, image_encoder=image_encoder, torch_dtype=torch.bfloat16
+    MODEL_ID, vae=vae, image_encoder=image_encoder, torch_dtype=torch.float16
 )
 pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config, flow_shift=8.0)
-pipe.to("cuda")
+
+# Enable memory efficient attention and CPU offloading for large videos
+pipe.enable_model_cpu_offload()
+pipe.enable_vae_slicing()
+pipe.enable_vae_tiling()
 
 try:
     causvid_path = hf_hub_download(repo_id=LORA_REPO_ID, filename=LORA_FILENAME)
@@ -53,7 +59,7 @@ MAX_SEED = np.iinfo(np.int32).max
 
 FIXED_FPS = 24
 MIN_FRAMES_MODEL = 8    # Minimum 8 frames (~0.33s)
-MAX_FRAMES_MODEL = 240  # Changed from 81 to 240 (10 seconds at 24fps)
+MAX_FRAMES_MODEL = 240  # Maximum 240 frames (10 seconds at 24fps)
 
 default_prompt_i2v = "make this image come alive, cinematic motion, smooth animation"
 default_negative_prompt = "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards, watermark, text, signature"
@@ -99,13 +105,31 @@ def get_duration(input_image, prompt, height, width,
                  guidance_scale, steps,
                  seed, randomize_seed,
                  progress):
-    # Longer timeout for longer videos
-    if steps > 4 and duration_seconds > 5:  # Changed from 2 to 5
-        return 120  # Increased timeout for long generations
-    elif steps > 4 or duration_seconds > 5:  # Changed from 2 to 5
-        return 90
+    # Adjust timeout based on video length and complexity
+    if duration_seconds > 7:
+        return 180  # 3 minutes for very long videos
+    elif duration_seconds > 5:
+        return 120  # 2 minutes for long videos
+    elif duration_seconds > 3:
+        return 90   # 1.5 minutes for medium videos
     else:
-        return 60
+        return 60   # 1 minute for short videos
+
+def export_video_with_ffmpeg(frames, output_path, fps=24):
+    """Export video using imageio if available, otherwise fall back to OpenCV"""
+    try:
+        import imageio
+        # Use imageio for better quality
+        writer = imageio.get_writer(output_path, fps=fps, codec='libx264',
+                                    pixelformat='yuv420p', quality=8)
+        for frame in frames:
+            writer.append_data(np.array(frame))
+        writer.close()
+        return True
+    except ImportError:
+        # Fall back to OpenCV
+        export_to_video(frames, output_path, fps=fps)
+        return False
 
 @spaces.GPU(duration=get_duration)
 def generate_video(input_image, prompt, height, width,
@@ -120,66 +144,117 @@ def generate_video(input_image, prompt, height, width,
     target_h = max(MOD_VALUE, (int(height) // MOD_VALUE) * MOD_VALUE)
     target_w = max(MOD_VALUE, (int(width) // MOD_VALUE) * MOD_VALUE)
 
-    # Ensure num_frames-1 is divisible by 4
+    # Calculate frames with proper alignment
     raw_frames = int(round(duration_seconds * FIXED_FPS))
+    # Ensure num_frames-1 is divisible by 4 as required by the model
     num_frames = ((raw_frames - 1) // 4) * 4 + 1
     num_frames = np.clip(num_frames, MIN_FRAMES_MODEL, MAX_FRAMES_MODEL)
-    print(f"Using {num_frames} frames (requested {raw_frames})")
+
+    # Additional check for very long videos
+    if num_frames > 120:
+        # For videos longer than 5 seconds, reduce resolution to manage memory
+        max_dim = max(target_h, target_w)
+        if max_dim > 768:
+            scale_factor = 768 / max_dim
+            target_h = max(MOD_VALUE, (int(target_h * scale_factor) // MOD_VALUE) * MOD_VALUE)
+            target_w = max(MOD_VALUE, (int(target_w * scale_factor) // MOD_VALUE) * MOD_VALUE)
+            gr.Info(f"Reduced resolution to {target_w}x{target_h} for long video generation")
+
+    print(f"Generating {num_frames} frames (requested {raw_frames}) at {target_w}x{target_h}")
 
     current_seed = random.randint(0, MAX_SEED) if randomize_seed else int(seed)
 
-    resized_image = input_image.resize((target_w, target_h))
+    resized_image = input_image.resize((target_w, target_h), Image.Resampling.LANCZOS)
 
-    with torch.inference_mode():
-        output_frames_list = pipe(
-            image=resized_image, prompt=prompt, negative_prompt=negative_prompt,
-            height=target_h, width=target_w, num_frames=num_frames,
-            guidance_scale=float(guidance_scale), num_inference_steps=int(steps),
-            generator=torch.Generator(device="cuda").manual_seed(current_seed)
-        ).frames[0]
+    # Clear GPU cache before generation
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+
+    try:
+        with torch.inference_mode():
+            # Generate video with autocast for memory efficiency
+            with torch.autocast("cuda", dtype=torch.float16):
+                output_frames_list = pipe(
+                    image=resized_image,
+                    prompt=prompt,
+                    negative_prompt=negative_prompt,
+                    height=target_h,
+                    width=target_w,
+                    num_frames=num_frames,
+                    guidance_scale=float(guidance_scale),
+                    num_inference_steps=int(steps),
+                    generator=torch.Generator(device="cuda").manual_seed(current_seed),
+                    return_dict=True
+                ).frames[0]
+    except torch.cuda.OutOfMemoryError:
+        torch.cuda.empty_cache()
+        raise gr.Error("Out of GPU memory. Try reducing the duration or resolution.")
+    except Exception as e:
+        torch.cuda.empty_cache()
+        raise gr.Error(f"Generation failed: {str(e)}")
+
+    # Clear cache after generation
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
 
     with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmpfile:
         video_path = tmpfile.name
 
-    # Simple export that works with current diffusers version
-    export_to_video(output_frames_list, video_path, fps=FIXED_FPS)
+    # Export using imageio if available, otherwise OpenCV
+    used_imageio = export_video_with_ffmpeg(output_frames_list, video_path, fps=FIXED_FPS)
 
-    # Optional: Use FFmpeg directly for better encoding if available
-    try:
-        import subprocess
-        optimized_path = video_path + "_opt.mp4"
-        subprocess.run([
-            'ffmpeg',
-            '-y',                       # Overwrite without asking
-            '-i', video_path,           # Input file
-            '-c:v', 'libx264',          # Codec
-            '-pix_fmt', 'yuv420p',      # Pixel format
-            '-profile:v', 'main',       # Compatibility profile
-            '-movflags', '+faststart',  # Streaming optimized
-            '-crf', '23',               # Quality level
-            '-preset', 'fast',          # Encoding speed
-            optimized_path
-        ], check=True)
-        video_path = optimized_path
-    except Exception as e:
-        print(f"FFmpeg optimization failed, using default export: {str(e)}")
+    # Only try FFmpeg optimization if we have a valid video file
+    if os.path.exists(video_path) and os.path.getsize(video_path) > 0:
+        try:
+            # Check if ffmpeg is available
+            subprocess.run(['ffmpeg', '-version'], capture_output=True, check=True)
+
+            optimized_path = video_path + "_opt.mp4"
+            cmd = [
+                'ffmpeg',
+                '-y',                       # Overwrite without asking
+                '-i', video_path,           # Input file
+                '-c:v', 'libx264',          # Codec
+                '-pix_fmt', 'yuv420p',      # Pixel format
+                '-profile:v', 'main',       # Compatibility profile
+                '-level', '4.0',            # Support for higher resolutions
+                '-movflags', '+faststart',  # Streaming optimized
+                '-crf', '23',               # Quality level
+                '-preset', 'medium',        # Balance between speed and compression
+                '-maxrate', '10M',          # Max bitrate for large videos
+                '-bufsize', '20M',          # Buffer size
+                optimized_path
+            ]
+
+            result = subprocess.run(cmd, capture_output=True, text=True)
+
+            if result.returncode == 0 and os.path.exists(optimized_path) and os.path.getsize(optimized_path) > 0:
+                os.unlink(video_path)  # Remove original
+                video_path = optimized_path
+            else:
+                print(f"FFmpeg optimization failed: {result.stderr}")
+
+        except (subprocess.CalledProcessError, FileNotFoundError):
+            print("FFmpeg not available or optimization failed, using original export")
 
     return video_path, current_seed
 
+# Gradio Interface
 with gr.Blocks() as demo:
-    gr.Markdown("# Fast 4 steps Wan 2.1 I2V (14B) fusionx-lora")
-    #gr.Markdown("[CausVid](https://github.com/tianweiy/CausVid) is a distilled version of Wan 2.1 to run faster in just 4-8 steps, [extracted as LoRA by Kijai](https://huggingface.co/Kijai/WanVideo_comfy/blob/main/Wan21_CausVid_14B_T2V_lora_rank32.safetensors) and is compatible with 🧨 diffusers")
+    gr.Markdown("# Fast 4 steps Wan 2.1 I2V (14B) FusionX-LoRA")
+    gr.Markdown("Generate videos up to 10 seconds long! Longer videos may use reduced resolution for stability.")
+
     with gr.Row():
         with gr.Column():
             input_image_component = gr.Image(type="pil", label="Input Image (auto-resized to target H/W)")
             prompt_input = gr.Textbox(label="Prompt", value=default_prompt_i2v)
             duration_seconds_input = gr.Slider(
-                minimum=round(MIN_FRAMES_MODEL/FIXED_FPS,1),  # 0.3s (8 frames)
-                maximum=round(MAX_FRAMES_MODEL/FIXED_FPS,1),  # Now 10.0s (240 frames)
+                minimum=round(MIN_FRAMES_MODEL/FIXED_FPS, 1),  # 0.3s (8 frames)
+                maximum=round(MAX_FRAMES_MODEL/FIXED_FPS, 1),  # 10.0s (240 frames)
                 step=0.1,
-                value=3,  # Changed default from 2 to 3 seconds
+                value=2,  # Default 2 seconds
                 label="Duration (seconds)",
-                info=f"Clamped to {MIN_FRAMES_MODEL}-{MAX_FRAMES_MODEL} frames at {FIXED_FPS}fps (~{MIN_FRAMES_MODEL/FIXED_FPS:.1f}-{MAX_FRAMES_MODEL/FIXED_FPS:.1f}s)"
+                info=f"Video length: {MIN_FRAMES_MODEL/FIXED_FPS:.1f}-{MAX_FRAMES_MODEL/FIXED_FPS:.1f}s. Longer videos may take more time and use more memory."
             )
             with gr.Accordion("Advanced Settings", open=False):
                 negative_prompt_input = gr.Textbox(label="Negative Prompt", value=default_negative_prompt, lines=3)
@@ -194,6 +269,10 @@ with gr.Blocks() as demo:
             generate_button = gr.Button("Generate Video", variant="primary")
         with gr.Column():
             video_output = gr.Video(label="Generated Video", autoplay=True, interactive=False)
+            gr.Markdown("### Tips for best results:")
+            gr.Markdown("- For videos longer than 5 seconds, consider using lower resolutions (512-768px)")
+            gr.Markdown("- Clear, simple prompts often work better than complex descriptions")
+            gr.Markdown("- The model works best with 4-8 inference steps")
 
     input_image_component.upload(
         fn=handle_image_upload_for_dims_wan,
@@ -219,8 +298,11 @@ with gr.Blocks() as demo:
             ["peng.png", "a penguin playfully dancing in the snow, Antarctica", 896, 512],
             ["forg.jpg", "the frog jumps around", 448, 832],
         ],
-        inputs=[input_image_component, prompt_input, height_input, width_input], outputs=[video_output, seed_input], fn=generate_video, cache_examples="lazy"
+        inputs=[input_image_component, prompt_input, height_input, width_input],
+        outputs=[video_output, seed_input],
+        fn=generate_video,
+        cache_examples="lazy"
     )
 
 if __name__ == "__main__":
-    demo.queue().launch()
+    demo.queue(max_size=3).launch()