import gradio as gr import torch import torch.nn as nn import os import tempfile import shutil import imageio import pandas as pd import numpy as np from diffsynth import ModelManager, WanVideoReCamMasterPipeline, save_video import json from torchvision.transforms import v2 from einops import rearrange import torchvision from PIL import Image import logging from pathlib import Path from huggingface_hub import hf_hub_download logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) # Camera transformation types CAMERA_TRANSFORMATIONS = { "1": "Pan Right", "2": "Pan Left", "3": "Tilt Up", "4": "Tilt Down", "5": "Zoom In", "6": "Zoom Out", "7": "Translate Up (with rotation)", "8": "Translate Down (with rotation)", "9": "Arc Left (with rotation)", "10": "Arc Right (with rotation)" } # Global variables for model model_manager = None pipe = None is_model_loaded = False def download_recammaster_checkpoint(): """Download ReCamMaster checkpoint from HuggingFace using huggingface_hub""" # Define paths repo_id = "KwaiVGI/ReCamMaster-Wan2.1" filename = "step20000.ckpt" checkpoint_dir = Path("models/ReCamMaster/checkpoints") checkpoint_path = checkpoint_dir / filename # Check if already exists if checkpoint_path.exists(): logger.info(f"✓ ReCamMaster checkpoint already exists at {checkpoint_path}") return checkpoint_path # Create directory if it doesn't exist checkpoint_dir.mkdir(parents=True, exist_ok=True) # Download the checkpoint logger.info("Downloading ReCamMaster checkpoint from HuggingFace...") logger.info(f"Repository: {repo_id}") logger.info(f"File: {filename}") logger.info(f"Destination: {checkpoint_path}") try: # Download using huggingface_hub downloaded_path = hf_hub_download( repo_id=repo_id, filename=filename, local_dir=checkpoint_dir, local_dir_use_symlinks=False ) logger.info(f"✓ Successfully downloaded ReCamMaster checkpoint to {downloaded_path}!") return downloaded_path except Exception as e: logger.error(f"✗ Error downloading checkpoint: {e}") raise class Camera(object): def __init__(self, c2w): c2w_mat = np.array(c2w).reshape(4, 4) self.c2w_mat = c2w_mat self.w2c_mat = np.linalg.inv(c2w_mat) def parse_matrix(matrix_str): """Parse camera matrix string from JSON format""" rows = matrix_str.strip().split('] [') matrix = [] for row in rows: row = row.replace('[', '').replace(']', '') matrix.append(list(map(float, row.split()))) return np.array(matrix) def get_relative_pose(cam_params): """Calculate relative camera poses""" abs_w2cs = [cam_param.w2c_mat for cam_param in cam_params] abs_c2ws = [cam_param.c2w_mat for cam_param in cam_params] cam_to_origin = 0 target_cam_c2w = np.array([ [1, 0, 0, 0], [0, 1, 0, -cam_to_origin], [0, 0, 1, 0], [0, 0, 0, 1] ]) abs2rel = target_cam_c2w @ abs_w2cs[0] ret_poses = [target_cam_c2w, ] + [abs2rel @ abs_c2w for abs_c2w in abs_c2ws[1:]] ret_poses = np.array(ret_poses, dtype=np.float32) return ret_poses def load_models(progress_callback=None): """Load the ReCamMaster models""" global model_manager, pipe, is_model_loaded if is_model_loaded: return "Models already loaded!" try: logger.info("Starting model loading...") # First ensure the checkpoint is downloaded if progress_callback: progress_callback(0.05, desc="Checking for ReCamMaster checkpoint...") try: ckpt_path = download_recammaster_checkpoint() logger.info(f"Using checkpoint at {ckpt_path}") except Exception as e: error_msg = f"Error downloading ReCamMaster checkpoint: {str(e)}" logger.error(error_msg) return error_msg if progress_callback: progress_callback(0.1, desc="Loading model manager...") # Load Wan2.1 pre-trained models model_manager = ModelManager(torch_dtype=torch.bfloat16, device="cpu") if progress_callback: progress_callback(0.3, desc="Loading Wan2.1 models...") model_manager.load_models([ "models/Wan-AI/Wan2.1-T2V-1.3B/diffusion_pytorch_model.safetensors", "models/Wan-AI/Wan2.1-T2V-1.3B/models_t5_umt5-xxl-enc-bf16.pth", "models/Wan-AI/Wan2.1-T2V-1.3B/Wan2.1_VAE.pth", ]) if progress_callback: progress_callback(0.5, desc="Creating pipeline...") pipe = WanVideoReCamMasterPipeline.from_model_manager(model_manager, device="cuda") if progress_callback: progress_callback(0.7, desc="Initializing ReCamMaster modules...") # Initialize additional modules introduced in ReCamMaster dim = pipe.dit.blocks[0].self_attn.q.weight.shape[0] for block in pipe.dit.blocks: block.cam_encoder = nn.Linear(12, dim) block.projector = nn.Linear(dim, dim) block.cam_encoder.weight.data.zero_() block.cam_encoder.bias.data.zero_() block.projector.weight = nn.Parameter(torch.eye(dim)) block.projector.bias = nn.Parameter(torch.zeros(dim)) if progress_callback: progress_callback(0.9, desc="Loading ReCamMaster checkpoint...") # Load ReCamMaster checkpoint if not os.path.exists(ckpt_path): error_msg = f"Error: ReCamMaster checkpoint not found at {ckpt_path} even after download attempt." logger.error(error_msg) return error_msg state_dict = torch.load(ckpt_path, map_location="cpu") pipe.dit.load_state_dict(state_dict, strict=True) pipe.to("cuda") pipe.to(dtype=torch.bfloat16) is_model_loaded = True if progress_callback: progress_callback(1.0, desc="Models loaded successfully!") logger.info("Models loaded successfully!") return "Models loaded successfully!" except Exception as e: logger.error(f"Error loading models: {str(e)}") return f"Error loading models: {str(e)}" def extract_frames_from_video(video_path, output_dir, max_frames=81): """Extract frames from video and ensure we have at least 81 frames""" os.makedirs(output_dir, exist_ok=True) reader = imageio.get_reader(video_path) fps = reader.get_meta_data()['fps'] total_frames = reader.count_frames() frames = [] for i, frame in enumerate(reader): frames.append(frame) reader.close() # If we have fewer than required frames, repeat the last frame if len(frames) < max_frames: logger.info(f"Video has {len(frames)} frames, padding to {max_frames} frames") last_frame = frames[-1] while len(frames) < max_frames: frames.append(last_frame) # Save frames for i, frame in enumerate(frames[:max_frames]): frame_path = os.path.join(output_dir, f"frame_{i:04d}.png") imageio.imwrite(frame_path, frame) return len(frames[:max_frames]), fps def process_video_for_recammaster(video_path, text_prompt, cam_type, height=480, width=832): """Process video through ReCamMaster model""" global pipe # Create frame processor frame_process = v2.Compose([ v2.CenterCrop(size=(height, width)), v2.Resize(size=(height, width), antialias=True), v2.ToTensor(), v2.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]), ]) def crop_and_resize(image): width_img, height_img = image.size scale = max(width / width_img, height / height_img) image = torchvision.transforms.functional.resize( image, (round(height_img*scale), round(width_img*scale)), interpolation=torchvision.transforms.InterpolationMode.BILINEAR ) return image # Load video frames reader = imageio.get_reader(video_path) frames = [] for i in range(81): # ReCamMaster needs exactly 81 frames try: frame = reader.get_data(i) frame = Image.fromarray(frame) frame = crop_and_resize(frame) frame = frame_process(frame) frames.append(frame) except: # If we run out of frames, repeat the last one if frames: frames.append(frames[-1]) else: raise ValueError("Video is too short!") reader.close() frames = torch.stack(frames, dim=0) frames = rearrange(frames, "T C H W -> C T H W") video_tensor = frames.unsqueeze(0) # Add batch dimension # Load camera trajectory tgt_camera_path = "./example_test_data/cameras/camera_extrinsics.json" with open(tgt_camera_path, 'r') as file: cam_data = json.load(file) # Get camera trajectory for selected type cam_idx = list(range(81))[::4] # Sample every 4 frames traj = [parse_matrix(cam_data[f"frame{idx}"][f"cam{int(cam_type):02d}"]) for idx in cam_idx] traj = np.stack(traj).transpose(0, 2, 1) c2ws = [] for c2w in traj: c2w = c2w[:, [1, 2, 0, 3]] c2w[:3, 1] *= -1. c2w[:3, 3] /= 100 c2ws.append(c2w) tgt_cam_params = [Camera(cam_param) for cam_param in c2ws] relative_poses = [] for i in range(len(tgt_cam_params)): relative_pose = get_relative_pose([tgt_cam_params[0], tgt_cam_params[i]]) relative_poses.append(torch.as_tensor(relative_pose)[:,:3,:][1]) pose_embedding = torch.stack(relative_poses, dim=0) # 21x3x4 pose_embedding = rearrange(pose_embedding, 'b c d -> b (c d)') camera_tensor = pose_embedding.to(torch.bfloat16).unsqueeze(0) # Add batch dimension # Generate video with ReCamMaster video = pipe( prompt=[text_prompt], negative_prompt=["worst quality, low quality, blurry, jittery, distorted"], source_video=video_tensor, target_camera=camera_tensor, cfg_scale=5.0, num_inference_steps=50, seed=0, tiled=True ) return video def generate_recammaster_video( video_file, text_prompt, camera_type, progress=gr.Progress() ): """Main function to generate video with ReCamMaster""" global pipe, is_model_loaded if not is_model_loaded: return None, "Error: Models not loaded! Please load models first." if video_file is None: return None, "Please upload a video file." try: # Create temporary directory for processing with tempfile.TemporaryDirectory() as temp_dir: progress(0.1, desc="Processing input video...") # Copy uploaded video to temp directory input_video_path = os.path.join(temp_dir, "input.mp4") shutil.copy(video_file.name, input_video_path) # Extract frames progress(0.2, desc="Extracting video frames...") num_frames, fps = extract_frames_from_video(input_video_path, os.path.join(temp_dir, "frames")) logger.info(f"Extracted {num_frames} frames at {fps} fps") # Process with ReCamMaster progress(0.3, desc="Processing with ReCamMaster...") output_video = process_video_for_recammaster( input_video_path, text_prompt, camera_type ) # Save output video progress(0.9, desc="Saving output video...") output_path = os.path.join(temp_dir, "output.mp4") save_video(output_video, output_path, fps=30, quality=5) # Copy to persistent location final_output_path = tempfile.NamedTemporaryFile(suffix='.mp4', delete=False).name shutil.copy(output_path, final_output_path) progress(1.0, desc="Done!") transformation_name = CAMERA_TRANSFORMATIONS.get(str(camera_type), "Unknown") status_msg = f"Successfully generated video with '{transformation_name}' camera movement!" return final_output_path, status_msg except Exception as e: logger.error(f"Error generating video: {str(e)}") return None, f"Error: {str(e)}" # Create Gradio interface with gr.Blocks(title="ReCamMaster Demo") as demo: # Show loading status loading_status = gr.Textbox( label="Model Loading Status", value="Loading models, please wait...", interactive=False, visible=True ) gr.Markdown(""" # 🎥 ReCamMaster Demo ReCamMaster allows you to re-capture videos with novel camera trajectories. Upload a video and select a camera transformation to see the magic! **Note:** The ReCamMaster checkpoint will be automatically downloaded from HuggingFace when you start the app. You still need to download Wan2.1 models using `python download_wan2.1.py` before running this demo. """) with gr.Row(): with gr.Column(): # Video input section with gr.Group(): gr.Markdown("### Step 1: Upload Video") video_input = gr.Video(label="Input Video") text_prompt = gr.Textbox( label="Text Prompt (describe your video)", placeholder="A person walking in the street", value="A dynamic scene" ) # Camera selection with gr.Group(): gr.Markdown("### Step 2: Select Camera Movement") camera_type = gr.Radio( choices=[(v, k) for k, v in CAMERA_TRANSFORMATIONS.items()], label="Camera Transformation", value="1" ) # Generate button generate_btn = gr.Button("Generate Video", variant="primary") with gr.Column(): # Output section output_video = gr.Video(label="Output Video") status_output = gr.Textbox(label="Generation Status", interactive=False) # Example videos gr.Markdown("### Example Videos") gr.Examples( examples=[ ["example_test_data/videos/case0.mp4", "A person dancing", "1"], ["example_test_data/videos/case1.mp4", "A scenic view", "5"], ], inputs=[video_input, text_prompt, camera_type], ) # Load models automatically when the interface loads def on_load(): status = load_models() return gr.update(value=status, visible=True if "Error" in status else False) demo.load(on_load, outputs=[loading_status]) # Event handlers generate_btn.click( fn=generate_recammaster_video, inputs=[video_input, text_prompt, camera_type], outputs=[output_video, status_output] ) if __name__ == "__main__": demo.launch(share=True)