Spaces:
Running
on
Zero
Running
on
Zero
File size: 8,650 Bytes
6934c92 c19ca3d cecea50 6934c92 cecea50 64076fa 136da9a a0f4ef6 6934c92 c19ca3d a0f4ef6 398c57e c19ca3d 64076fa c19ca3d cecea50 c19ca3d 3ab7af1 ec8e06b 19fb882 3b76114 a0f4ef6 c19ca3d a0f4ef6 77feb08 c19ca3d 64076fa a0f4ef6 64076fa c19ca3d a0f4ef6 c19ca3d a0f4ef6 c19ca3d 64076fa 77feb08 136da9a 64076fa a0f4ef6 64076fa a0f4ef6 64076fa c19ca3d ba5d0dc dacf550 24b78e9 c19ca3d ba5d0dc c19ca3d ba5d0dc 4dce6a9 a0f4ef6 c19ca3d a0f4ef6 136da9a c19ca3d 64076fa c19ca3d 6934c92 3b76114 05463dd 6934c92 c1dedc3 c19ca3d 6934c92 cecea50 c19ca3d a0f4ef6 c19ca3d 64076fa a0f4ef6 64076fa c19ca3d 3aca9f1 6934c92 cecea50 a0f4ef6 64076fa b45c25c cecea50 3ab7af1 cecea50 c19ca3d 64076fa 136da9a 64076fa c19ca3d a0f4ef6 77feb08 a0f4ef6 64076fa 05463dd 6934c92 a0f4ef6 ec8e06b bfce642 c19ca3d 3aca9f1 6934c92 3aca9f1 05463dd 6934c92 c19ca3d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 |
import torch
from diffusers import AutoencoderKLWan, WanPipeline, WanImageToVideoPipeline, UniPCMultistepScheduler
from diffusers.utils import export_to_video
import gradio as gr
import tempfile
import spaces
import numpy as np
from PIL import Image
import random
# Hugging Face repo id that supplies the VAE and both pipelines below.
MODEL_ID = "FastVideo/FastWan2.2-TI2V-5B-FullAttn-Diffusers"
# The VAE is loaded on its own in float32 and handed to both pipelines,
# which are themselves loaded in bfloat16.
vae = AutoencoderKLWan.from_pretrained(MODEL_ID, subfolder="vae", torch_dtype=torch.float32)
# Initialize pipelines
# NOTE(review): each from_pretrained call presumably loads its own copy of the
# non-VAE weights -- confirm whether the components could be shared instead.
text_to_video_pipe = WanPipeline.from_pretrained(MODEL_ID, vae=vae, torch_dtype=torch.bfloat16)
image_to_video_pipe = WanImageToVideoPipeline.from_pretrained(MODEL_ID, vae=vae, torch_dtype=torch.bfloat16)
for pipe in [text_to_video_pipe, image_to_video_pipe]:
    # Replace the scheduler with a UniPC multistep scheduler built from the
    # pipeline's existing scheduler config, overriding flow_shift to 8.0.
    pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config, flow_shift=8.0)
    # Move the pipeline to the CUDA device at import time.
    pipe.to("cuda")
# Constants
# Heights/widths are floored to a multiple of this value before generation;
# it is also the step of the height/width sliders.
MOD_VALUE = 32
# Initial height/width slider values, also used as the fallback when no image
# is present or the dimension calculation fails.
DEFAULT_H_SLIDER_VALUE = 896
DEFAULT_W_SLIDER_VALUE = 896
# Target pixel area used when deriving dimensions from an uploaded image.
NEW_FORMULA_MAX_AREA = 720 * 1024
# Lower/upper bounds of the height and width sliders.
SLIDER_MIN_H, SLIDER_MAX_H = 256, 1024
SLIDER_MIN_W, SLIDER_MAX_W = 256, 1024
# Largest signed 32-bit integer: upper bound of the seed slider and of
# randomly drawn seeds.
MAX_SEED = np.iinfo(np.int32).max
# Frame rate used both to turn a duration into a frame count and to export
# the resulting video.
FIXED_FPS = 24
# Range the requested frame count is clamped to.
MIN_FRAMES_MODEL = 25
MAX_FRAMES_MODEL = 193
# Initial text of the prompt box.
default_prompt_i2v = "make this image come alive, cinematic motion, smooth animation"
# Default negative prompt (function default and initial textbox content).
default_negative_prompt = "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards, watermark, text, signature"
def _calculate_new_dimensions_wan(pil_image, mod_val, calculation_max_area, min_slider_h, max_slider_h, min_slider_w, max_slider_w, default_h, default_w):
orig_w, orig_h = pil_image.size
if orig_w <= 0 or orig_h <= 0:
return default_h, default_w
aspect_ratio = orig_h / orig_w
calc_h = round(np.sqrt(calculation_max_area * aspect_ratio))
calc_w = round(np.sqrt(calculation_max_area / aspect_ratio))
calc_h = max(mod_val, (calc_h // mod_val) * mod_val)
calc_w = max(mod_val, (calc_w // mod_val) * mod_val)
new_h = int(np.clip(calc_h, min_slider_h, (max_slider_h // mod_val) * mod_val))
new_w = int(np.clip(calc_w, min_slider_w, (max_slider_w // mod_val) * mod_val))
return new_h, new_w
def handle_image_upload_for_dims_wan(uploaded_pil_image, current_h_val, current_w_val):
    """Produce height/width slider updates for an uploaded (or cleared) image.

    Args:
        uploaded_pil_image: The image from the input component, or ``None``.
        current_h_val: Current height slider value (not used here).
        current_w_val: Current width slider value (not used here).

    Returns:
        A pair of ``gr.update`` objects for the height and width sliders.
        The sliders go back to their defaults when there is no image or
        when the dimension calculation raises.
    """

    def _default_updates():
        return gr.update(value=DEFAULT_H_SLIDER_VALUE), gr.update(value=DEFAULT_W_SLIDER_VALUE)

    if uploaded_pil_image is None:
        return _default_updates()

    try:
        fitted_h, fitted_w = _calculate_new_dimensions_wan(
            uploaded_pil_image,
            MOD_VALUE,
            NEW_FORMULA_MAX_AREA,
            SLIDER_MIN_H,
            SLIDER_MAX_H,
            SLIDER_MIN_W,
            SLIDER_MAX_W,
            DEFAULT_H_SLIDER_VALUE,
            DEFAULT_W_SLIDER_VALUE,
        )
        return gr.update(value=fitted_h), gr.update(value=fitted_w)
    except Exception:
        # Best effort: tell the user and fall back to the default size.
        gr.Warning("Error attempting to calculate new dimensions")
        return _default_updates()
def get_duration(input_image, prompt, height, width,
                 negative_prompt, duration_seconds,
                 guidance_scale, steps,
                 seed, randomize_seed,
                 progress):
    """Return the duration value handed to ``spaces.GPU`` for one request.

    Only ``steps`` and ``duration_seconds`` influence the result; the other
    parameters are accepted but not read.

    Returns:
        90 when both ``steps`` and ``duration_seconds`` exceed 4,
        75 when exactly one of them does, otherwise 60.
    """
    many_steps = steps > 4
    long_clip = duration_seconds > 4

    if many_steps and long_clip:
        return 90
    if many_steps or long_clip:
        return 75
    return 60
@spaces.GPU(duration=get_duration)
def generate_video(input_image, prompt, height, width, negative_prompt=default_negative_prompt, duration_seconds=2, guidance_scale=0, steps=4, seed=44, randomize_seed=False, progress=gr.Progress(track_tqdm=True)):
    """Generate a video from a prompt, optionally conditioned on an image.

    Args:
        input_image: PIL image to animate, or ``None`` for text-to-video.
        prompt: Text prompt passed to the pipeline.
        height: Requested height; floored to a multiple of ``MOD_VALUE``.
        width: Requested width; floored to a multiple of ``MOD_VALUE``.
        negative_prompt: Negative prompt passed to the pipeline.
        duration_seconds: Clip length; converted to frames at ``FIXED_FPS``
            and clamped to ``MIN_FRAMES_MODEL``..``MAX_FRAMES_MODEL``.
        guidance_scale: Passed to the pipeline as a float.
        steps: Passed to the pipeline as ``num_inference_steps`` (int).
        seed: Seed used when ``randomize_seed`` is false.
        randomize_seed: When true, draw a seed from ``[0, MAX_SEED]``.
        progress: Gradio progress object; not referenced in the body.

    Returns:
        ``(video_path, current_seed)``: path of the exported ``.mp4`` and
        the seed that was actually used.
    """
    target_h = max(MOD_VALUE, (int(height) // MOD_VALUE) * MOD_VALUE)
    target_w = max(MOD_VALUE, (int(width) // MOD_VALUE) * MOD_VALUE)

    # Clamp with builtins so the pipeline receives a plain int rather than
    # the NumPy scalar that np.clip would return.
    requested_frames = int(round(duration_seconds * FIXED_FPS))
    num_frames = min(max(requested_frames, MIN_FRAMES_MODEL), MAX_FRAMES_MODEL)

    current_seed = random.randint(0, MAX_SEED) if randomize_seed else int(seed)

    # Arguments common to both pipelines.
    call_kwargs = dict(
        prompt=prompt,
        negative_prompt=negative_prompt,
        height=target_h,
        width=target_w,
        num_frames=num_frames,
        guidance_scale=float(guidance_scale),
        num_inference_steps=int(steps),
        generator=torch.Generator(device="cuda").manual_seed(current_seed),
    )

    if input_image is not None:
        active_pipe = image_to_video_pipe
        # The conditioning image is resized to exactly the output resolution.
        call_kwargs["image"] = input_image.resize((target_w, target_h))
    else:
        active_pipe = text_to_video_pipe

    with torch.inference_mode():
        output_frames_list = active_pipe(**call_kwargs).frames[0]

    # delete=False: the file must still exist after this function returns so
    # the caller can serve it. Only the name is taken from the handle; the
    # video is written after the handle has been closed.
    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmpfile:
        video_path = tmpfile.name
    export_to_video(output_frames_list, video_path, fps=FIXED_FPS)

    return video_path, current_seed
# Gradio UI: inputs on the left, generated video on the right, examples below.
with gr.Blocks() as demo:
    gr.Markdown("# Fast Wan 2.2 TI2V 5B Demo")
    gr.Markdown("""This Demo is using [FastWan2.2-TI2V-5B](https://huggingface.co/FastVideo/FastWan2.2-TI2V-5B-FullAttn-Diffusers) which is fine-tuned with Sparse-distill method which allows wan to generate high quality videos in 3-5 steps.""")
    with gr.Row():
        with gr.Column():
            input_image_component = gr.Image(type="pil", label="Input Image (optional, auto-resized to target H/W)")
            prompt_input = gr.Textbox(label="Prompt", value=default_prompt_i2v)
            # Slider range mirrors the frame-count limits expressed in seconds.
            duration_seconds_input = gr.Slider(
                minimum=round(MIN_FRAMES_MODEL/FIXED_FPS,1),
                maximum=round(MAX_FRAMES_MODEL/FIXED_FPS,1),
                step=0.1,
                value=2,
                label="Duration (seconds)",
                info=f"Clamped to model's {MIN_FRAMES_MODEL}-{MAX_FRAMES_MODEL} frames at {FIXED_FPS}fps.",
            )
            with gr.Accordion("Advanced Settings", open=False):
                negative_prompt_input = gr.Textbox(label="Negative Prompt", value=default_negative_prompt, lines=3)
                seed_input = gr.Slider(label="Seed", minimum=0, maximum=MAX_SEED, step=1, value=42, interactive=True)
                randomize_seed_checkbox = gr.Checkbox(label="Randomize seed", value=True, interactive=True)
                with gr.Row():
                    height_input = gr.Slider(minimum=SLIDER_MIN_H, maximum=SLIDER_MAX_H, step=MOD_VALUE, value=DEFAULT_H_SLIDER_VALUE, label=f"Output Height (multiple of {MOD_VALUE})")
                    width_input = gr.Slider(minimum=SLIDER_MIN_W, maximum=SLIDER_MAX_W, step=MOD_VALUE, value=DEFAULT_W_SLIDER_VALUE, label=f"Output Width (multiple of {MOD_VALUE})")
                steps_slider = gr.Slider(minimum=1, maximum=8, step=1, value=4, label="Inference Steps")
                guidance_scale_input = gr.Slider(minimum=0.0, maximum=5.0, step=0.01, value=0.0, label="Guidance Scale")
            generate_button = gr.Button("Generate Video", variant="primary")
        with gr.Column():
            video_output = gr.Video(label="Generated Video", autoplay=True, interactive=False)

    # Uploading or clearing the image re-derives the height/width sliders.
    input_image_component.upload(
        fn=handle_image_upload_for_dims_wan,
        inputs=[input_image_component, height_input, width_input],
        outputs=[height_input, width_input]
    )
    input_image_component.clear(
        fn=handle_image_upload_for_dims_wan,
        inputs=[input_image_component, height_input, width_input],
        outputs=[height_input, width_input]
    )

    # Order must match generate_video's positional parameters.
    ui_inputs = [
        input_image_component, prompt_input, height_input, width_input,
        negative_prompt_input, duration_seconds_input,
        guidance_scale_input, steps_slider, seed_input, randomize_seed_checkbox
    ]
    # The seed slider is also an output so it shows the seed actually used.
    generate_button.click(fn=generate_video, inputs=ui_inputs, outputs=[video_output, seed_input])

    # Example columns: image, prompt, height, width. Heights/widths are kept
    # inside the slider bounds (SLIDER_MIN_*..SLIDER_MAX_*) and on the
    # MOD_VALUE grid, so the sliders show the size generate_video renders
    # (it floors both sides to a multiple of MOD_VALUE).
    gr.Examples(
        examples=[
            [None, "A person eating spaghetti", 1024, 704],
            ["cat.png", "The cat removes the glasses from its eyes.", 1024, 768],
            [None, "a penguin playfully dancing in the snow, Antarctica", 1024, 704],
            ["peng.png", "a penguin running towards camera joyfully, Antarctica", 896, 512],
        ],
        inputs=[input_image_component, prompt_input, height_input, width_input], outputs=[video_output, seed_input], fn=generate_video, cache_examples="lazy"
    )
if __name__ == "__main__":
    # Enable request queuing on the Blocks app, then start the server.
    demo.queue().launch()