Spaces: Running on Zero
import spaces
import torch
from diffusers import AutoencoderKLWan, WanPipeline, WanImageToVideoPipeline, UniPCMultistepScheduler
from diffusers.utils import export_to_video
import gradio as gr
import tempfile
import numpy as np
import random
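# FastWan 2.2 TI2V 5B: a sparse-distilled Wan 2.2 checkpoint that generates in a
# handful of steps. The VAE is kept in float32 for decode stability, while the
# rest of the pipeline runs in bfloat16 to reduce memory use.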
MODEL_ID = "FastVideo/FastWan2.2-TI2V-5B-FullAttn-Diffusers"
vae = AutoencoderKLWan.from_pretrained(MODEL_ID, subfolder="vae", torch_dtype=torch.float32)
# Initialize pipelines
text_to_video_pipe = WanPipeline.from_pretrained(MODEL_ID, vae=vae, torch_dtype=torch.bfloat16)
image_to_video_pipe = WanImageToVideoPipeline.from_pretrained(MODEL_ID, vae=vae, torch_dtype=torch.bfloat16)
for pipe in [text_to_video_pipe, image_to_video_pipe]:
    pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config, flow_shift=8.0)
    pipe.to("cuda")
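# A flow_shift of 8.0 skews UniPC's timestep schedule toward the high-noise end,
# which tends to suit few-step distilled checkpoints; treat the exact value as a
# tuning choice for this demo rather than a requirement.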
# Constants
MOD_VALUE = 32
DEFAULT_H_SLIDER_VALUE = 896
DEFAULT_W_SLIDER_VALUE = 896
NEW_FORMULA_MAX_AREA = 720 * 1024
SLIDER_MIN_H, SLIDER_MAX_H = 256, 1024
SLIDER_MIN_W, SLIDER_MAX_W = 256, 1024
MAX_SEED = np.iinfo(np.int32).max
FIXED_FPS = 24
MIN_FRAMES_MODEL = 25
MAX_FRAMES_MODEL = 193
default_prompt_i2v = "make this image come alive, cinematic motion, smooth animation"
default_negative_prompt = "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards, watermark, text, signature"
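# Frame counts map to duration as num_frames = round(duration_seconds * FIXED_FPS),
# clamped to [MIN_FRAMES_MODEL, MAX_FRAMES_MODEL]. Both bounds are of the 4k+1
# form (25 = 4*6+1, 193 = 4*48+1) that Wan-family models expect, presumably
# because of the VAE's 4x temporal compression.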
def _calculate_new_dimensions_wan(pil_image, mod_val, calculation_max_area, min_slider_h, max_slider_h, min_slider_w, max_slider_w, default_h, default_w):
    orig_w, orig_h = pil_image.size
    if orig_w <= 0 or orig_h <= 0:
        return default_h, default_w
    aspect_ratio = orig_h / orig_w
    calc_h = round(np.sqrt(calculation_max_area * aspect_ratio))
    calc_w = round(np.sqrt(calculation_max_area / aspect_ratio))
    calc_h = max(mod_val, (calc_h // mod_val) * mod_val)
    calc_w = max(mod_val, (calc_w // mod_val) * mod_val)
    new_h = int(np.clip(calc_h, min_slider_h, (max_slider_h // mod_val) * mod_val))
    new_w = int(np.clip(calc_w, min_slider_w, (max_slider_w // mod_val) * mod_val))
    return new_h, new_w
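# Worked example (illustrative numbers, assuming PIL is available):
#   from PIL import Image
#   img = Image.new("RGB", (1920, 1080))   # aspect ratio 0.5625
#   _calculate_new_dimensions_wan(img, 32, 737280, 256, 1024, 256, 1024, 896, 896)
# calc_h = sqrt(737280 * 0.5625) ~ 644 -> 640, calc_w = sqrt(737280 / 0.5625) ~ 1145 -> 1120,
# and the final clip caps the width at 1024, so the call returns (640, 1024).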
def handle_image_upload_for_dims_wan(uploaded_pil_image, current_h_val, current_w_val):
    if uploaded_pil_image is None:
        return gr.update(value=DEFAULT_H_SLIDER_VALUE), gr.update(value=DEFAULT_W_SLIDER_VALUE)
    try:
        new_h, new_w = _calculate_new_dimensions_wan(
            uploaded_pil_image, MOD_VALUE, NEW_FORMULA_MAX_AREA,
            SLIDER_MIN_H, SLIDER_MAX_H, SLIDER_MIN_W, SLIDER_MAX_W,
            DEFAULT_H_SLIDER_VALUE, DEFAULT_W_SLIDER_VALUE
        )
        return gr.update(value=new_h), gr.update(value=new_w)
    except Exception as e:
        gr.Warning(f"Error attempting to calculate new dimensions: {e}")
        return gr.update(value=DEFAULT_H_SLIDER_VALUE), gr.update(value=DEFAULT_W_SLIDER_VALUE)
def get_duration_video(prompt, height, width, input_image,
                       negative_prompt, duration_seconds,
                       guidance_scale, steps,
                       seed, randomize_seed,
                       progress):
    # Signature must mirror generate_video so positional arguments bind correctly.
    return steps * 2 * duration_seconds

def get_duration_image(prompt, height, width, negative_prompt, guidance_scale, steps, seed, randomize_seed, progress):
    return steps
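# ZeroGPU invokes these duration callables with the same arguments as the
# decorated function to reserve GPU time per request. The returns are rough
# heuristics: ~2 GPU-seconds per step per second of video, ~1 per step for images.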
@spaces.GPU(duration=get_duration_video)
def generate_video(prompt, height, width, input_image=None, negative_prompt=default_negative_prompt, duration_seconds=2, guidance_scale=0, steps=4, seed=44, randomize_seed=False, progress=gr.Progress(track_tqdm=True)):
    """Generates a video from a prompt, optionally conditioned on an uploaded image."""
    target_h = max(MOD_VALUE, (int(height) // MOD_VALUE) * MOD_VALUE)
    target_w = max(MOD_VALUE, (int(width) // MOD_VALUE) * MOD_VALUE)
    num_frames = np.clip(int(round(duration_seconds * FIXED_FPS)), MIN_FRAMES_MODEL, MAX_FRAMES_MODEL)
    current_seed = random.randint(0, MAX_SEED) if randomize_seed else int(seed)
    if input_image is not None:
        resized_image = input_image.resize((target_w, target_h))
        with torch.inference_mode():
            output_frames_list = image_to_video_pipe(
                image=resized_image, prompt=prompt, negative_prompt=negative_prompt,
                height=target_h, width=target_w, num_frames=num_frames,
                guidance_scale=float(guidance_scale), num_inference_steps=int(steps),
                generator=torch.Generator(device="cuda").manual_seed(current_seed)
            ).frames[0]
    else:
        with torch.inference_mode():
            output_frames_list = text_to_video_pipe(
                prompt=prompt, negative_prompt=negative_prompt,
                height=target_h, width=target_w, num_frames=num_frames,
                guidance_scale=float(guidance_scale), num_inference_steps=int(steps),
                generator=torch.Generator(device="cuda").manual_seed(current_seed)
            ).frames[0]
    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmpfile:
        video_path = tmpfile.name
    export_to_video(output_frames_list, video_path, fps=FIXED_FPS)
    return video_path, current_seed
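# Image generation below reuses the text-to-video pipeline with num_frames=1;
# .frames[0][0] picks the single frame of the first (and only) video in the batch.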
@spaces.GPU(duration=get_duration_image)
def generate_image(prompt, height, width, negative_prompt=default_negative_prompt, guidance_scale=0, steps=4, seed=44, randomize_seed=False, progress=gr.Progress(track_tqdm=True)):
    """Generates a single image using the text-to-video pipeline by requesting only one frame."""
    target_h = max(MOD_VALUE, (int(height) // MOD_VALUE) * MOD_VALUE)
    target_w = max(MOD_VALUE, (int(width) // MOD_VALUE) * MOD_VALUE)
    current_seed = random.randint(0, MAX_SEED) if randomize_seed else int(seed)
    with torch.inference_mode():
        output_frame = text_to_video_pipe(
            prompt=prompt,
            negative_prompt=negative_prompt,
            height=target_h,
            width=target_w,
            num_frames=1,
            guidance_scale=float(guidance_scale),
            num_inference_steps=int(steps),
            generator=torch.Generator(device="cuda").manual_seed(current_seed)
        ).frames[0][0]
    return output_frame, current_seed
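# Gradio UI: one tab for text/image-to-video, one for text-to-image.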
with gr.Blocks() as demo:
    gr.Markdown("# Fast Wan 2.2 T2V I2V T2I 5B")
    gr.Markdown("""This demo uses [FastWan2.2-TI2V-5B](https://huggingface.co/FastVideo/FastWan2.2-TI2V-5B-FullAttn-Diffusers), fine-tuned with the sparse-distill method, which lets Wan generate high-quality videos in 3-5 steps.""")
    with gr.Tabs():
        with gr.TabItem("Text/Image-to-Video"):
            with gr.Row():
                with gr.Column():
                    input_image_component = gr.Image(type="pil", label="Input Image (optional, auto-resized to target H/W)")
                    prompt_input = gr.Textbox(label="Prompt", value=default_prompt_i2v)
                    duration_seconds_input = gr.Slider(minimum=round(MIN_FRAMES_MODEL/FIXED_FPS, 1), maximum=round(MAX_FRAMES_MODEL/FIXED_FPS, 1), step=0.1, value=2, label="Duration (seconds)", info=f"Clamped to model's {MIN_FRAMES_MODEL}-{MAX_FRAMES_MODEL} frames at {FIXED_FPS}fps.")
                    with gr.Accordion("Advanced Settings", open=False):
                        negative_prompt_input = gr.Textbox(label="Negative Prompt", value=default_negative_prompt, lines=3)
                        seed_input = gr.Slider(label="Seed", minimum=0, maximum=MAX_SEED, step=1, value=42, interactive=True)
                        randomize_seed_checkbox = gr.Checkbox(label="Randomize seed", value=True, interactive=True)
                        with gr.Row():
                            height_input = gr.Slider(minimum=SLIDER_MIN_H, maximum=SLIDER_MAX_H, step=MOD_VALUE, value=DEFAULT_H_SLIDER_VALUE, label=f"Output Height (multiple of {MOD_VALUE})")
                            width_input = gr.Slider(minimum=SLIDER_MIN_W, maximum=SLIDER_MAX_W, step=MOD_VALUE, value=DEFAULT_W_SLIDER_VALUE, label=f"Output Width (multiple of {MOD_VALUE})")
                        steps_slider = gr.Slider(minimum=1, maximum=8, step=1, value=4, label="Inference Steps")
                        guidance_scale_input = gr.Slider(minimum=0.0, maximum=5.0, step=0.01, value=0.0, label="Guidance Scale")
                    generate_button = gr.Button("Generate Video", variant="primary")
                with gr.Column():
                    video_output = gr.Video(label="Generated Video", autoplay=True, interactive=False)
            input_image_component.upload(fn=handle_image_upload_for_dims_wan, inputs=[input_image_component, height_input, width_input], outputs=[height_input, width_input])
            input_image_component.clear(fn=handle_image_upload_for_dims_wan, inputs=[input_image_component, height_input, width_input], outputs=[height_input, width_input])
            ui_inputs_video = [prompt_input, height_input, width_input, input_image_component, negative_prompt_input, duration_seconds_input, guidance_scale_input, steps_slider, seed_input, randomize_seed_checkbox]
            generate_button.click(fn=generate_video, inputs=ui_inputs_video, outputs=[video_output, seed_input])
        with gr.TabItem("Text-to-Image"):
            with gr.Row():
                with gr.Column():
                    prompt_input_img = gr.Textbox(label="Prompt", value="An American man")
                    with gr.Accordion("Advanced Settings", open=False):
                        negative_prompt_input_img = gr.Textbox(label="Negative Prompt", value=default_negative_prompt, lines=3)
                        seed_input_img = gr.Slider(label="Seed", minimum=0, maximum=MAX_SEED, step=1, value=42, interactive=True)
                        randomize_seed_checkbox_img = gr.Checkbox(label="Randomize seed", value=True, interactive=True)
                        with gr.Row():
                            height_input_img = gr.Slider(minimum=SLIDER_MIN_H, maximum=SLIDER_MAX_H, step=MOD_VALUE, value=DEFAULT_H_SLIDER_VALUE, label=f"Output Height (multiple of {MOD_VALUE})")
                            width_input_img = gr.Slider(minimum=SLIDER_MIN_W, maximum=SLIDER_MAX_W, step=MOD_VALUE, value=DEFAULT_W_SLIDER_VALUE, label=f"Output Width (multiple of {MOD_VALUE})")
                        steps_slider_img = gr.Slider(minimum=1, maximum=20, step=1, value=10, label="Inference Steps")
                        guidance_scale_input_img = gr.Slider(minimum=0.0, maximum=5.0, step=0.01, value=0.0, label="Guidance Scale")
                    generate_button_img = gr.Button("Generate Image", variant="primary")
                with gr.Column():
                    image_output = gr.Image(label="Generated Image", interactive=False)
            ui_inputs_img = [prompt_input_img, height_input_img, width_input_img, negative_prompt_input_img, guidance_scale_input_img, steps_slider_img, seed_input_img, randomize_seed_checkbox_img]
            generate_button_img.click(fn=generate_image, inputs=ui_inputs_img, outputs=[image_output, seed_input_img])

if __name__ == "__main__":
    demo.queue().launch()