import copy | |
import json | |
import os | |
import os.path as osp | |
import queue | |
import secrets | |
import threading | |
import time | |
from datetime import datetime | |
from glob import glob | |
from pathlib import Path | |
from typing import Literal | |
import gradio as gr | |
import httpx | |
import imageio.v3 as iio | |
import numpy as np | |
import torch | |
import torch.nn.functional as F | |
import tyro | |
import viser | |
import viser.transforms as vt | |
from einops import rearrange | |
from gradio import networking | |
from gradio.context import LocalContext | |
from gradio.tunneling import CERTIFICATE_PATH, Tunnel | |
from seva.eval import ( | |
IS_TORCH_NIGHTLY, | |
chunk_input_and_test, | |
create_transforms_simple, | |
infer_prior_stats, | |
run_one_scene, | |
transform_img_and_K, | |
) | |
from seva.geometry import ( | |
DEFAULT_FOV_RAD, | |
get_default_intrinsics, | |
get_preset_pose_fov, | |
normalize_scene, | |
) | |
from seva.gui import define_gui | |
from seva.model import SGMWrapper | |
from seva.modules.autoencoder import AutoEncoder | |
from seva.modules.conditioner import CLIPConditioner | |
from seva.modules.preprocessor import Dust3rPipeline | |
from seva.sampling import DDPMDiscretization, DiscreteDenoiser | |
from seva.utils import load_model | |
device = "cuda:0" | |
# Constants. | |
WORK_DIR = "work_dirs/demo_gr" | |
MAX_SESSIONS = 1 | |
ADVANCE_EXAMPLE_MAP = [ | |
( | |
"assets/advance/blue-car.jpg", | |
["assets/advance/blue-car.jpg"], | |
), | |
( | |
"assets/advance/garden-4_0.jpg", | |
[ | |
"assets/advance/garden-4_0.jpg", | |
"assets/advance/garden-4_1.jpg", | |
"assets/advance/garden-4_2.jpg", | |
"assets/advance/garden-4_3.jpg", | |
], | |
), | |
( | |
"assets/advance/vgg-lab-4_0.png", | |
[ | |
"assets/advance/vgg-lab-4_0.png", | |
"assets/advance/vgg-lab-4_1.png", | |
"assets/advance/vgg-lab-4_2.png", | |
"assets/advance/vgg-lab-4_3.png", | |
], | |
), | |
( | |
"assets/advance/telebooth-2_0.jpg", | |
[ | |
"assets/advance/telebooth-2_0.jpg", | |
"assets/advance/telebooth-2_1.jpg", | |
], | |
), | |
( | |
"assets/advance/backyard-7_0.jpg", | |
[ | |
"assets/advance/backyard-7_0.jpg", | |
"assets/advance/backyard-7_1.jpg", | |
"assets/advance/backyard-7_2.jpg", | |
"assets/advance/backyard-7_3.jpg", | |
"assets/advance/backyard-7_4.jpg", | |
"assets/advance/backyard-7_5.jpg", | |
"assets/advance/backyard-7_6.jpg", | |
], | |
), | |
] | |
if IS_TORCH_NIGHTLY: | |
COMPILE = True | |
os.environ["TORCHINDUCTOR_AUTOGRAD_CACHE"] = "1" | |
os.environ["TORCHINDUCTOR_FX_GRAPH_CACHE"] = "1" | |
else: | |
COMPILE = False | |
# Shared global variables across sessions. | |
DUST3R = Dust3rPipeline(device=device) # type: ignore | |
MODEL = SGMWrapper(load_model(device="cpu", verbose=True).eval()).to(device) | |
AE = AutoEncoder(chunk_size=1).to(device) | |
CONDITIONER = CLIPConditioner().to(device) | |
DISCRETIZATION = DDPMDiscretization() | |
DENOISER = DiscreteDenoiser(discretization=DISCRETIZATION, num_idx=1000, device=device) | |
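# A rough guide to the fields below (our reading of the code, not an official
# spec): H/W are the working resolution, T is the number of frames the model
# processes per forward pass, C is the latent channel count, and f is the
# autoencoder's spatial downsampling factor, so a 576x576 frame maps to a
# 72x72x4 latent. "options" is filled in per render call.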
VERSION_DICT = { | |
"H": 576, | |
"W": 576, | |
"T": 21, | |
"C": 4, | |
"f": 8, | |
"options": {}, | |
} | |
SERVERS = {} | |
ABORT_EVENTS = {} | |
if COMPILE: | |
MODEL = torch.compile(MODEL) | |
CONDITIONER = torch.compile(CONDITIONER) | |
AE = torch.compile(AE) | |
class SevaRenderer(object): | |
def __init__(self, server: viser.ViserServer): | |
self.server = server | |
self.gui_state = None | |
def preprocess( | |
self, input_img_path_or_tuples: list[tuple[str, None]] | str | |
) -> tuple[dict, dict, dict]: | |
        # Hardcode these so that the aspect ratio is always kept and the
        # shorter side is resized to 576. This is only to keep the number of
        # GUI options small; changing it still works.
shorter: int = 576 | |
        # Has to be a multiple of 64 for the network.
shorter = round(shorter / 64) * 64 | |
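        # The stride of 64 matches the autoencoder's 8x spatial downsampling
        # plus further downsampling inside the denoiser. `transform_img_and_K`
        # is expected to resize the shorter side to `shorter` and round both
        # sides to this stride; e.g. (illustrative, not a verified output) a
        # 1920x1080 input would come out as 1024x576.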
if isinstance(input_img_path_or_tuples, str): | |
# Assume `Basic` demo mode: just hardcode the camera parameters and ignore points. | |
input_imgs = torch.as_tensor( | |
iio.imread(input_img_path_or_tuples) / 255.0, dtype=torch.float32 | |
)[None, ..., :3] | |
input_imgs = transform_img_and_K( | |
input_imgs.permute(0, 3, 1, 2), | |
shorter, | |
K=None, | |
size_stride=64, | |
)[0].permute(0, 2, 3, 1) | |
input_Ks = get_default_intrinsics( | |
aspect_ratio=input_imgs.shape[2] / input_imgs.shape[1] | |
) | |
input_c2ws = torch.eye(4)[None] | |
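            # Single-image ("Basic") inputs carry no pose information, so the
            # camera is placed at the identity pose and `get_default_intrinsics`
            # appears to return a default-FOV K normalized by the image size
            # (fx, fy, cx, cy in [0, 1] units); intrinsics are scaled back to
            # pixels later in `render`.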
            # Sleep for a short interval so that gradio can update the
            # progress bar properly.
time.sleep(0.1) | |
return ( | |
{ | |
"input_imgs": input_imgs, | |
"input_Ks": input_Ks, | |
"input_c2ws": input_c2ws, | |
"input_wh": (input_imgs.shape[2], input_imgs.shape[1]), | |
"points": [np.zeros((0, 3))], | |
"point_colors": [np.zeros((0, 3))], | |
"scene_scale": 1.0, | |
}, | |
gr.update(visible=False), | |
gr.update(), | |
) | |
else: | |
            # Assume `Advanced` demo mode: use dust3r to extract camera parameters and points.
img_paths = [p for (p, _) in input_img_path_or_tuples] | |
( | |
input_imgs, | |
input_Ks, | |
input_c2ws, | |
points, | |
point_colors, | |
) = DUST3R.infer_cameras_and_points(img_paths) | |
num_inputs = len(img_paths) | |
if num_inputs == 1: | |
input_imgs, input_Ks, input_c2ws, points, point_colors = ( | |
input_imgs[:1], | |
input_Ks[:1], | |
input_c2ws[:1], | |
points[:1], | |
point_colors[:1], | |
) | |
input_imgs = [img[..., :3] for img in input_imgs] | |
# Normalize the scene. | |
point_chunks = [p.shape[0] for p in points] | |
point_indices = np.cumsum(point_chunks)[:-1] | |
input_c2ws, points, _ = normalize_scene( # type: ignore | |
input_c2ws, | |
np.concatenate(points, 0), | |
camera_center_method="poses", | |
) | |
points = np.split(points, point_indices, 0) | |
# Scale camera and points for viewport visualization. | |
scene_scale = np.median( | |
np.ptp(np.concatenate([input_c2ws[:, :3, 3], *points], 0), -1) | |
) | |
input_c2ws[:, :3, 3] /= scene_scale | |
points = [point / scene_scale for point in points] | |
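            # The scene is re-centered by `normalize_scene` and then divided by a
            # robust scale derived from the spread of camera centers and points,
            # so the viser viewport shows it at a sensible size. `scene_scale` is
            # kept to size camera frustums and point markers later.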
input_imgs = [ | |
torch.as_tensor(img / 255.0, dtype=torch.float32) for img in input_imgs | |
] | |
input_Ks = torch.as_tensor(input_Ks) | |
input_c2ws = torch.as_tensor(input_c2ws) | |
new_input_imgs, new_input_Ks = [], [] | |
for img, K in zip(input_imgs, input_Ks): | |
img = rearrange(img, "h w c -> 1 c h w") | |
# If you don't want to keep aspect ratio and want to always center crop, use this: | |
# img, K = transform_img_and_K(img, (shorter, shorter), K=K[None]) | |
img, K = transform_img_and_K(img, shorter, K=K[None], size_stride=64) | |
assert isinstance(K, torch.Tensor) | |
K = K / K.new_tensor([img.shape[-1], img.shape[-2], 1])[:, None] | |
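                # Normalize intrinsics by the new image size so they become
                # resolution-independent: row 0 of K is divided by W, row 1 by H.
                # Illustrative numbers only: K = [[500, 0, 320], [0, 500, 240],
                # [0, 0, 1]] for a 640x480 image becomes approximately
                # [[0.78, 0, 0.5], [0, 1.04, 0.5], [0, 0, 1]].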
new_input_imgs.append(img) | |
new_input_Ks.append(K) | |
input_imgs = torch.cat(new_input_imgs, 0) | |
input_imgs = rearrange(input_imgs, "b c h w -> b h w c")[..., :3] | |
input_Ks = torch.cat(new_input_Ks, 0) | |
return ( | |
{ | |
"input_imgs": input_imgs, | |
"input_Ks": input_Ks, | |
"input_c2ws": input_c2ws, | |
"input_wh": (input_imgs.shape[2], input_imgs.shape[1]), | |
"points": points, | |
"point_colors": point_colors, | |
"scene_scale": scene_scale, | |
}, | |
gr.update(visible=False), | |
gr.update() | |
if num_inputs <= 10 | |
else gr.update(choices=["interp"], value="interp"), | |
) | |
def visualize_scene(self, preprocessed: dict): | |
server = self.server | |
server.scene.reset() | |
server.gui.reset() | |
set_bkgd_color(server) | |
( | |
input_imgs, | |
input_Ks, | |
input_c2ws, | |
input_wh, | |
points, | |
point_colors, | |
scene_scale, | |
) = ( | |
preprocessed["input_imgs"], | |
preprocessed["input_Ks"], | |
preprocessed["input_c2ws"], | |
preprocessed["input_wh"], | |
preprocessed["points"], | |
preprocessed["point_colors"], | |
preprocessed["scene_scale"], | |
) | |
W, H = input_wh | |
server.scene.set_up_direction(-input_c2ws[..., :3, 1].mean(0).numpy()) | |
# Use first image as default fov. | |
assert input_imgs[0].shape[:2] == (H, W) | |
if H > W: | |
init_fov = 2 * np.arctan(1 / (2 * input_Ks[0, 0, 0].item())) | |
else: | |
init_fov = 2 * np.arctan(1 / (2 * input_Ks[0, 1, 1].item())) | |
init_fov_deg = float(init_fov / np.pi * 180.0) | |
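        # With normalized intrinsics, fov = 2 * arctan(1 / (2 * f)), where f is
        # the focal length normalized by the image size along the chosen axis.
        # Illustrative numbers: a normalized focal of 0.9 gives
        # 2 * arctan(0.556) ~= 58 degrees.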
frustum_nodes, pcd_nodes = [], [] | |
for i in range(len(input_imgs)): | |
K = input_Ks[i] | |
frustum = server.scene.add_camera_frustum( | |
f"/scene_assets/cameras/{i}", | |
fov=2 * np.arctan(1 / (2 * K[1, 1].item())), | |
aspect=W / H, | |
scale=0.1 * scene_scale, | |
image=(input_imgs[i].numpy() * 255.0).astype(np.uint8), | |
wxyz=vt.SO3.from_matrix(input_c2ws[i, :3, :3].numpy()).wxyz, | |
position=input_c2ws[i, :3, 3].numpy(), | |
) | |
def get_handler(frustum): | |
def handler(event: viser.GuiEvent) -> None: | |
assert event.client_id is not None | |
client = server.get_clients()[event.client_id] | |
with client.atomic(): | |
client.camera.position = frustum.position | |
client.camera.wxyz = frustum.wxyz | |
# Set look_at as the projected origin onto the | |
# frustum's forward direction. | |
look_direction = vt.SO3(frustum.wxyz).as_matrix()[:, 2] | |
position_origin = -frustum.position | |
client.camera.look_at = ( | |
frustum.position | |
+ np.dot(look_direction, position_origin) | |
/ np.linalg.norm(position_origin) | |
* look_direction | |
) | |
return handler | |
frustum.on_click(get_handler(frustum)) # type: ignore | |
frustum_nodes.append(frustum) | |
pcd = server.scene.add_point_cloud( | |
f"/scene_assets/points/{i}", | |
points[i], | |
point_colors[i], | |
point_size=0.01 * scene_scale, | |
point_shape="circle", | |
) | |
pcd_nodes.append(pcd) | |
with server.gui.add_folder("Scene scale", expand_by_default=False, order=200): | |
camera_scale_slider = server.gui.add_slider( | |
"Log camera scale", initial_value=0.0, min=-2.0, max=2.0, step=0.1 | |
) | |
            @camera_scale_slider.on_update
            def _(_) -> None:
for i in range(len(frustum_nodes)): | |
frustum_nodes[i].scale = ( | |
0.1 * scene_scale * 10**camera_scale_slider.value | |
) | |
point_scale_slider = server.gui.add_slider( | |
"Log point scale", initial_value=0.0, min=-2.0, max=2.0, step=0.1 | |
) | |
            @point_scale_slider.on_update
            def _(_) -> None:
for i in range(len(pcd_nodes)): | |
pcd_nodes[i].point_size = ( | |
0.01 * scene_scale * 10**point_scale_slider.value | |
) | |
self.gui_state = define_gui( | |
server, | |
init_fov=init_fov_deg, | |
img_wh=input_wh, | |
scene_scale=scene_scale, | |
) | |
def get_target_c2ws_and_Ks_from_gui(self, preprocessed: dict): | |
input_wh = preprocessed["input_wh"] | |
W, H = input_wh | |
gui_state = self.gui_state | |
assert gui_state is not None and gui_state.camera_traj_list is not None | |
target_c2ws, target_Ks = [], [] | |
for item in gui_state.camera_traj_list: | |
target_c2ws.append(item["w2c"]) | |
assert item["img_wh"] == input_wh | |
K = np.array(item["K"]).reshape(3, 3) / np.array([W, H, 1])[:, None] | |
target_Ks.append(K) | |
target_c2ws = torch.as_tensor( | |
np.linalg.inv(np.array(target_c2ws).reshape(-1, 4, 4)) | |
) | |
target_Ks = torch.as_tensor(np.array(target_Ks).reshape(-1, 3, 3)) | |
return target_c2ws, target_Ks | |
def get_target_c2ws_and_Ks_from_preset( | |
self, | |
preprocessed: dict, | |
preset_traj: Literal[ | |
"orbit", | |
"spiral", | |
"lemniscate", | |
"zoom-in", | |
"zoom-out", | |
"dolly zoom-in", | |
"dolly zoom-out", | |
"move-forward", | |
"move-backward", | |
"move-up", | |
"move-down", | |
"move-left", | |
"move-right", | |
], | |
num_frames: int, | |
zoom_factor: float | None, | |
): | |
img_wh = preprocessed["input_wh"] | |
start_c2w = preprocessed["input_c2ws"][0] | |
start_w2c = torch.linalg.inv(start_c2w) | |
look_at = torch.tensor([0, 0, 10]) | |
start_fov = DEFAULT_FOV_RAD | |
target_c2ws, target_fovs = get_preset_pose_fov( | |
preset_traj, | |
num_frames, | |
start_w2c, | |
look_at, | |
-start_c2w[:3, 1], | |
start_fov, | |
spiral_radii=[1.0, 1.0, 0.5], | |
zoom_factor=zoom_factor, | |
) | |
target_c2ws = torch.as_tensor(target_c2ws) | |
target_fovs = torch.as_tensor(target_fovs) | |
target_Ks = get_default_intrinsics( | |
target_fovs, # type: ignore | |
aspect_ratio=img_wh[0] / img_wh[1], | |
) | |
return target_c2ws, target_Ks | |
def export_output_data(self, preprocessed: dict, output_dir: str): | |
input_imgs, input_Ks, input_c2ws, input_wh = ( | |
preprocessed["input_imgs"], | |
preprocessed["input_Ks"], | |
preprocessed["input_c2ws"], | |
preprocessed["input_wh"], | |
) | |
target_c2ws, target_Ks = self.get_target_c2ws_and_Ks_from_gui(preprocessed) | |
num_inputs = len(input_imgs) | |
num_targets = len(target_c2ws) | |
input_imgs = (input_imgs.cpu().numpy() * 255.0).astype(np.uint8) | |
input_c2ws = input_c2ws.cpu().numpy() | |
input_Ks = input_Ks.cpu().numpy() | |
target_c2ws = target_c2ws.cpu().numpy() | |
target_Ks = target_Ks.cpu().numpy() | |
img_whs = np.array(input_wh)[None].repeat(len(input_imgs) + len(target_Ks), 0) | |
os.makedirs(output_dir, exist_ok=True) | |
img_paths = [] | |
for i, img in enumerate(input_imgs): | |
iio.imwrite(img_path := osp.join(output_dir, f"{i:03d}.png"), img) | |
img_paths.append(img_path) | |
for i in range(num_targets): | |
iio.imwrite( | |
img_path := osp.join(output_dir, f"{i + num_inputs:03d}.png"), | |
np.zeros((input_wh[1], input_wh[0], 3), dtype=np.uint8), | |
) | |
img_paths.append(img_path) | |
# Convert from OpenCV to OpenGL camera format. | |
all_c2ws = np.concatenate([input_c2ws, target_c2ws]) | |
all_Ks = np.concatenate([input_Ks, target_Ks]) | |
all_c2ws = all_c2ws @ np.diag([1, -1, -1, 1]) | |
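        # OpenCV cameras are x-right / y-down / z-forward while OpenGL cameras
        # are x-right / y-up / z-backward, so right-multiplying by
        # diag([1, -1, -1, 1]) flips the sign of each camera's y and z axes.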
create_transforms_simple(output_dir, img_paths, img_whs, all_c2ws, all_Ks) | |
split_dict = { | |
"train_ids": list(range(num_inputs)), | |
"test_ids": list(range(num_inputs, num_inputs + num_targets)), | |
} | |
with open( | |
osp.join(output_dir, f"train_test_split_{num_inputs}.json"), "w" | |
) as f: | |
json.dump(split_dict, f, indent=4) | |
gr.Info(f"Output data saved to {output_dir}", duration=1) | |
def render( | |
self, | |
preprocessed: dict, | |
session_hash: str, | |
seed: int, | |
chunk_strategy: str, | |
cfg: float, | |
preset_traj: Literal[ | |
"orbit", | |
"spiral", | |
"lemniscate", | |
"zoom-in", | |
"zoom-out", | |
"dolly zoom-in", | |
"dolly zoom-out", | |
"move-forward", | |
"move-backward", | |
"move-up", | |
"move-down", | |
"move-left", | |
"move-right", | |
] | |
| None, | |
num_frames: int | None, | |
zoom_factor: float | None, | |
camera_scale: float, | |
): | |
render_name = datetime.now().strftime("%Y%m%d_%H%M%S") | |
render_dir = osp.join(WORK_DIR, render_name) | |
input_imgs, input_Ks, input_c2ws, (W, H) = ( | |
preprocessed["input_imgs"], | |
preprocessed["input_Ks"], | |
preprocessed["input_c2ws"], | |
preprocessed["input_wh"], | |
) | |
num_inputs = len(input_imgs) | |
if preset_traj is None: | |
target_c2ws, target_Ks = self.get_target_c2ws_and_Ks_from_gui(preprocessed) | |
else: | |
assert num_frames is not None | |
assert num_inputs == 1 | |
input_c2ws = torch.eye(4)[None].to(dtype=input_c2ws.dtype) | |
target_c2ws, target_Ks = self.get_target_c2ws_and_Ks_from_preset( | |
preprocessed, preset_traj, num_frames, zoom_factor | |
) | |
all_c2ws = torch.cat([input_c2ws, target_c2ws], 0) | |
all_Ks = ( | |
torch.cat([input_Ks, target_Ks], 0) | |
* input_Ks.new_tensor([W, H, 1])[:, None] | |
) | |
num_targets = len(target_c2ws) | |
input_indices = list(range(num_inputs)) | |
target_indices = np.arange(num_inputs, num_inputs + num_targets).tolist() | |
# Get anchor cameras. | |
T = VERSION_DICT["T"] | |
version_dict = copy.deepcopy(VERSION_DICT) | |
num_anchors = infer_prior_stats( | |
T, | |
num_inputs, | |
num_total_frames=num_targets, | |
version_dict=version_dict, | |
) | |
# infer_prior_stats modifies T in-place. | |
T = version_dict["T"] | |
assert isinstance(num_anchors, int) | |
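        # Anchor cameras are a sparse, evenly spaced subset of the target
        # trajectory (picked by the linspace below). As far as we can tell, the
        # first sampling pass generates these anchor frames from the input
        # views, and the second pass fills in the remaining targets between them.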
anchor_indices = np.linspace( | |
num_inputs, | |
num_inputs + num_targets - 1, | |
num_anchors, | |
).tolist() | |
anchor_c2ws = all_c2ws[[round(ind) for ind in anchor_indices]] | |
anchor_Ks = all_Ks[[round(ind) for ind in anchor_indices]] | |
# Create image conditioning. | |
all_imgs_np = ( | |
F.pad(input_imgs, (0, 0, 0, 0, 0, 0, 0, num_targets), value=0.0).numpy() | |
* 255.0 | |
).astype(np.uint8) | |
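        # `F.pad` consumes (left, right) pairs starting from the last dimension,
        # so the tuple above pads only the first (frame) dimension: images of
        # shape (num_inputs, H, W, 3) become (num_inputs + num_targets, H, W, 3),
        # with all-zero placeholders for the frames to be generated.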
image_cond = { | |
"img": all_imgs_np, | |
"input_indices": input_indices, | |
"prior_indices": anchor_indices, | |
} | |
# Create camera conditioning (K is unnormalized). | |
camera_cond = { | |
"c2w": all_c2ws, | |
"K": all_Ks, | |
"input_indices": list(range(num_inputs + num_targets)), | |
} | |
# Run rendering. | |
num_steps = 50 | |
options_ori = VERSION_DICT["options"] | |
options = copy.deepcopy(options_ori) | |
options["chunk_strategy"] = chunk_strategy | |
options["video_save_fps"] = 30.0 | |
options["beta_linear_start"] = 5e-6 | |
options["log_snr_shift"] = 2.4 | |
options["guider_types"] = [1, 2] | |
options["cfg"] = [ | |
float(cfg), | |
3.0 if num_inputs >= 9 else 2.0, | |
        ]  # We define the semi-dense-view regime as having at least 9 input views.
options["camera_scale"] = camera_scale | |
options["num_steps"] = num_steps | |
options["cfg_min"] = 1.2 | |
options["encoding_t"] = 1 | |
options["decoding_t"] = 1 | |
assert session_hash in ABORT_EVENTS | |
abort_event = ABORT_EVENTS[session_hash] | |
abort_event.clear() | |
options["abort_event"] = abort_event | |
task = "img2trajvid" | |
# Get number of first pass chunks. | |
T_first_pass = T[0] if isinstance(T, (list, tuple)) else T | |
chunk_strategy_first_pass = options.get( | |
"chunk_strategy_first_pass", "gt-nearest" | |
) | |
num_chunks_0 = len( | |
chunk_input_and_test( | |
T_first_pass, | |
input_c2ws, | |
anchor_c2ws, | |
input_indices, | |
image_cond["prior_indices"], | |
options={**options, "sampler_verbose": False}, | |
task=task, | |
chunk_strategy=chunk_strategy_first_pass, | |
gt_input_inds=list(range(input_c2ws.shape[0])), | |
)[1] | |
) | |
# Get number of second pass chunks. | |
anchor_argsort = np.argsort(input_indices + anchor_indices).tolist() | |
anchor_indices = np.array(input_indices + anchor_indices)[ | |
anchor_argsort | |
].tolist() | |
gt_input_inds = [anchor_argsort.index(i) for i in range(input_c2ws.shape[0])] | |
anchor_c2ws_second_pass = torch.cat([input_c2ws, anchor_c2ws], dim=0)[ | |
anchor_argsort | |
] | |
T_second_pass = T[1] if isinstance(T, (list, tuple)) else T | |
chunk_strategy = options.get("chunk_strategy", "nearest") | |
num_chunks_1 = len( | |
chunk_input_and_test( | |
T_second_pass, | |
anchor_c2ws_second_pass, | |
target_c2ws, | |
anchor_indices, | |
target_indices, | |
options={**options, "sampler_verbose": False}, | |
task=task, | |
chunk_strategy=chunk_strategy, | |
gt_input_inds=gt_input_inds, | |
)[1] | |
) | |
second_pass_pbar = gr.Progress().tqdm( | |
iterable=None, | |
desc="Second pass sampling", | |
total=num_chunks_1 * num_steps, | |
) | |
first_pass_pbar = gr.Progress().tqdm( | |
iterable=None, | |
desc="First pass sampling", | |
total=num_chunks_0 * num_steps, | |
) | |
video_path_generator = run_one_scene( | |
task=task, | |
version_dict={ | |
"H": H, | |
"W": W, | |
"T": T, | |
"C": VERSION_DICT["C"], | |
"f": VERSION_DICT["f"], | |
"options": options, | |
}, | |
model=MODEL, | |
ae=AE, | |
conditioner=CONDITIONER, | |
denoiser=DENOISER, | |
image_cond=image_cond, | |
camera_cond=camera_cond, | |
save_path=render_dir, | |
use_traj_prior=True, | |
traj_prior_c2ws=anchor_c2ws, | |
traj_prior_Ks=anchor_Ks, | |
seed=seed, | |
gradio=True, | |
first_pass_pbar=first_pass_pbar, | |
second_pass_pbar=second_pass_pbar, | |
abort_event=abort_event, | |
) | |
output_queue = queue.Queue() | |
blocks = LocalContext.blocks.get() | |
event_id = LocalContext.event_id.get() | |
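        # `run_one_scene` returns a generator that yields a video path after each
        # pass. It is drained in a worker thread so this gradio generator can keep
        # polling the abort event and yielding UI updates; the gradio
        # `LocalContext` is captured here and restored inside the thread so that
        # `gr.Progress` still resolves to the right event.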
def worker(): | |
            # gradio intentionally doesn't support progress tracking from worker
            # threads, so we copy the request-local context into the thread as a
            # workaround.
LocalContext.blocks.set(blocks) | |
LocalContext.event_id.set(event_id) | |
for i, video_path in enumerate(video_path_generator): | |
if i == 0: | |
output_queue.put( | |
( | |
video_path, | |
gr.update(), | |
gr.update(), | |
gr.update(), | |
) | |
) | |
elif i == 1: | |
output_queue.put( | |
( | |
video_path, | |
gr.update(visible=True), | |
gr.update(visible=False), | |
gr.update(visible=False), | |
) | |
) | |
else: | |
gr.Error("More than two passes during rendering.") | |
thread = threading.Thread(target=worker, daemon=True) | |
thread.start() | |
while thread.is_alive() or not output_queue.empty(): | |
if abort_event.is_set(): | |
thread.join() | |
abort_event.clear() | |
yield ( | |
gr.update(), | |
gr.update(visible=True), | |
gr.update(visible=False), | |
gr.update(visible=False), | |
) | |
time.sleep(0.1) | |
while not output_queue.empty(): | |
yield output_queue.get() | |
# This is basically a copy of the original `networking.setup_tunnel` function, | |
# but it also returns the tunnel object for proper cleanup. | |
def setup_tunnel( | |
local_host: str, local_port: int, share_token: str, share_server_address: str | None | |
) -> tuple[str, Tunnel]: | |
share_server_address = ( | |
networking.GRADIO_SHARE_SERVER_ADDRESS | |
if share_server_address is None | |
else share_server_address | |
) | |
if share_server_address is None: | |
try: | |
response = httpx.get(networking.GRADIO_API_SERVER, timeout=30) | |
payload = response.json()[0] | |
remote_host, remote_port = payload["host"], int(payload["port"]) | |
certificate = payload["root_ca"] | |
Path(CERTIFICATE_PATH).parent.mkdir(parents=True, exist_ok=True) | |
with open(CERTIFICATE_PATH, "w") as f: | |
f.write(certificate) | |
except Exception as e: | |
raise RuntimeError( | |
"Could not get share link from Gradio API Server." | |
) from e | |
else: | |
remote_host, remote_port = share_server_address.split(":") | |
remote_port = int(remote_port) | |
tunnel = Tunnel(remote_host, remote_port, local_host, local_port, share_token) | |
address = tunnel.start_tunnel() | |
return address, tunnel | |
def set_bkgd_color(server: viser.ViserServer | viser.ClientHandle): | |
server.scene.set_background_image(np.array([[[39, 39, 42]]], dtype=np.uint8)) | |
def start_server_and_abort_event(request: gr.Request): | |
server = viser.ViserServer() | |
    @server.on_client_connect
    def _(client: viser.ClientHandle):
        # Force dark mode so that the viser UI blends well with gradio's dark theme.
client.gui.configure_theme( | |
dark_mode=True, | |
show_share_button=False, | |
control_layout="collapsible", | |
) | |
set_bkgd_color(client) | |
print(f"Starting server {server.get_port()}") | |
server_url, tunnel = setup_tunnel( | |
local_host=server.get_host(), | |
local_port=server.get_port(), | |
share_token=secrets.token_urlsafe(32), | |
share_server_address=None, | |
) | |
SERVERS[request.session_hash] = (server, tunnel) | |
if server_url is None: | |
raise gr.Error( | |
"Failed to get a viewport URL. Please check your network connection." | |
) | |
# Give it enough time to start. | |
time.sleep(1) | |
ABORT_EVENTS[request.session_hash] = threading.Event() | |
return ( | |
SevaRenderer(server), | |
gr.HTML( | |
f'<iframe src="{server_url}" style="display: block; margin: auto; width: 100%; height: min(60vh, 600px);" frameborder="0"></iframe>', | |
container=True, | |
), | |
request.session_hash, | |
) | |
def stop_server_and_abort_event(request: gr.Request): | |
if request.session_hash in SERVERS: | |
print(f"Stopping server {request.session_hash}") | |
server, tunnel = SERVERS.pop(request.session_hash) | |
server.stop() | |
tunnel.kill() | |
if request.session_hash in ABORT_EVENTS: | |
print(f"Setting abort event {request.session_hash}") | |
ABORT_EVENTS[request.session_hash].set() | |
# Give it enough time to abort jobs. | |
time.sleep(5) | |
ABORT_EVENTS.pop(request.session_hash) | |
def set_abort_event(request: gr.Request): | |
if request.session_hash in ABORT_EVENTS: | |
print(f"Setting abort event {request.session_hash}") | |
ABORT_EVENTS[request.session_hash].set() | |
def get_advance_examples(selection: gr.SelectData): | |
index = selection.index | |
return ( | |
gr.Gallery(ADVANCE_EXAMPLE_MAP[index][1], visible=True), | |
gr.update(visible=True), | |
gr.update(visible=True), | |
gr.Gallery(visible=False), | |
) | |
def get_preamble(): | |
gr.Markdown(""" | |
# Stable Virtual Camera | |
<span style="display: flex; flex-wrap: wrap; gap: 5px;"> | |
<a href="https://stable-virtual-camera.github.io"><img src="https://img.shields.io/badge/%F0%9F%8F%A0%20Project%20Page-gray.svg"></a> | |
<a href="http://arxiv.org/abs/2503.14489"><img src="https://img.shields.io/badge/%F0%9F%93%84%20arXiv-2503.14489-B31B1B.svg"></a> | |
<a href="https://stability.ai/news/introducing-stable-virtual-camera-multi-view-video-generation-with-3d-camera-control"><img src="https://img.shields.io/badge/%F0%9F%93%83%20Blog-Stability%20AI-orange.svg"></a> | |
<a href="https://huggingface.co/stabilityai/stable-virtual-camera"><img src="https://img.shields.io/badge/%F0%9F%A4%97%20Model_Card-Huggingface-orange"></a> | |
<a href="https://huggingface.co/spaces/stabilityai/stable-virtual-camera"><img src="https://img.shields.io/badge/%F0%9F%9A%80%20Gradio%20Demo-Huggingface-orange"></a> | |
<a href="https://www.youtube.com/channel/UCLLlVDcS7nNenT_zzO3OPxQ"><img src="https://img.shields.io/badge/%F0%9F%8E%AC%20Video-YouTube-orange"></a> | |
</span> | |
Welcome to the demo of <strong>Stable Virtual Camera (Seva)</strong>! Given any number of input views and their cameras, this demo will allow you to generate novel views of a scene at any target camera of interest. | |
We provide two ways to use our demo (selected by the tab below, documented [here](https://github.com/Stability-AI/stable-virtual-camera/blob/main/docs/GR_USAGE.md)): | |
1. **[Basic](https://github.com/user-attachments/assets/4d965fa6-d8eb-452c-b773-6e09c88ca705)**: Given a single image, you can generate a video following one of our preset camera trajectories. | |
2. **[Advanced](https://github.com/user-attachments/assets/dcec1be0-bd10-441e-879c-d1c2b63091ba)**: Given any number of input images, you can generate a video following any camera trajectory of your choice by our key-frame-based interface. | |
> This is a research preview and comes with a few [limitations](https://stable-virtual-camera.github.io/#limitations): | |
> - Limited quality in certain subjects due to training data, including humans, animals, and dynamic textures. | |
> - Limited quality in some highly ambiguous scenes and camera trajectories, including extreme views and collision into objects. | |
""") | |
# Make sure that gradio uses the dark theme.
_APP_JS = """
function refresh() {
    const url = new URL(window.location);
    if (url.searchParams.get('__theme') !== 'dark') {
        url.searchParams.set('__theme', 'dark');
        window.location.href = url.href;
    }
}
"""
def main(server_port: int | None = None, share: bool = True): | |
with gr.Blocks(js=_APP_JS) as app: | |
renderer = gr.State() | |
session_hash = gr.State() | |
_ = get_preamble() | |
with gr.Tabs(): | |
with gr.Tab("Basic"): | |
render_btn = gr.Button("Render video", interactive=False, render=False) | |
with gr.Row(): | |
with gr.Column(): | |
with gr.Group(): | |
# Initially disable the Preprocess Images button until an image is selected. | |
preprocess_btn = gr.Button("Preprocess images", interactive=False) | |
preprocess_progress = gr.Textbox( | |
label="", | |
visible=False, | |
interactive=False, | |
) | |
with gr.Group(): | |
input_imgs = gr.Image( | |
type="filepath", | |
label="Input", | |
height=200, | |
) | |
_ = gr.Examples( | |
examples=sorted(glob("assets/basic/*")), | |
inputs=[input_imgs], | |
label="Example", | |
) | |
chunk_strategy = gr.Dropdown( | |
["interp", "interp-gt"], | |
label="Chunk strategy", | |
render=False, | |
) | |
preprocessed = gr.State() | |
# Enable the Preprocess Images button only if an image is selected. | |
input_imgs.change( | |
lambda img: gr.update(interactive=bool(img)), | |
inputs=input_imgs, | |
outputs=preprocess_btn, | |
) | |
preprocess_btn.click( | |
lambda r, *args: [ | |
*r.preprocess(*args), | |
gr.update(interactive=True), | |
], | |
inputs=[renderer, input_imgs], | |
outputs=[ | |
preprocessed, | |
preprocess_progress, | |
chunk_strategy, | |
render_btn, | |
], | |
show_progress_on=[preprocess_progress], | |
concurrency_limit=1, | |
concurrency_id="gpu_queue", | |
) | |
preprocess_btn.click( | |
lambda: gr.update(visible=True), | |
outputs=[preprocess_progress], | |
) | |
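                        # Note that two handlers are registered above on the same
                        # button: the first runs the actual preprocessing on the
                        # shared "gpu_queue" (concurrency_limit=1, so GPU work
                        # from all sessions is, in effect, serialized), while the
                        # second just reveals the progress textbox without waiting
                        # in that queue.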
with gr.Row(): | |
preset_traj = gr.Dropdown( | |
choices=[ | |
"orbit", | |
"spiral", | |
"lemniscate", | |
"zoom-in", | |
"zoom-out", | |
"dolly zoom-in", | |
"dolly zoom-out", | |
"move-forward", | |
"move-backward", | |
"move-up", | |
"move-down", | |
"move-left", | |
"move-right", | |
], | |
label="Preset trajectory", | |
value="orbit", | |
) | |
num_frames = gr.Slider(30, 150, 80, label="#Frames") | |
zoom_factor = gr.Slider( | |
step=0.01, label="Zoom factor", visible=False | |
) | |
with gr.Row(): | |
seed = gr.Number(value=23, label="Random seed") | |
chunk_strategy.render() | |
cfg = gr.Slider(1.0, 7.0, value=4.0, label="CFG value") | |
with gr.Row(): | |
camera_scale = gr.Slider( | |
0.1, | |
15.0, | |
value=2.0, | |
label="Camera scale", | |
) | |
def default_cfg_preset_traj(traj): | |
                            # These are hand-tuned values that we found to work
                            # best for each preset trajectory.
if traj in ["zoom-out", "move-down"]: | |
value = 5.0 | |
elif traj in [ | |
"orbit", | |
"dolly zoom-out", | |
"move-backward", | |
"move-up", | |
"move-left", | |
"move-right", | |
]: | |
value = 4.0 | |
else: | |
value = 3.0 | |
return value | |
preset_traj.change( | |
default_cfg_preset_traj, | |
inputs=[preset_traj], | |
outputs=[cfg], | |
) | |
preset_traj.change( | |
lambda traj: gr.update( | |
value=( | |
10.0 if "dolly" in traj or "pan" in traj else 2.0 | |
) | |
), | |
inputs=[preset_traj], | |
outputs=[camera_scale], | |
) | |
def zoom_factor_preset_traj(traj): | |
visible = traj in [ | |
"zoom-in", | |
"zoom-out", | |
"dolly zoom-in", | |
"dolly zoom-out", | |
] | |
is_zoomin = traj.endswith("zoom-in") | |
if is_zoomin: | |
minimum = 0.1 | |
maximum = 0.5 | |
value = 0.28 | |
else: | |
minimum = 1.2 | |
maximum = 3 | |
value = 1.5 | |
return gr.update( | |
visible=visible, | |
minimum=minimum, | |
maximum=maximum, | |
value=value, | |
) | |
preset_traj.change( | |
zoom_factor_preset_traj, | |
inputs=[preset_traj], | |
outputs=[zoom_factor], | |
) | |
with gr.Column(): | |
with gr.Group(): | |
abort_btn = gr.Button("Abort rendering", visible=False) | |
render_btn.render() | |
render_progress = gr.Textbox( | |
label="", visible=False, interactive=False | |
) | |
output_video = gr.Video( | |
label="Output", interactive=False, autoplay=True, loop=True | |
) | |
render_btn.click( | |
lambda r, *args: (yield from r.render(*args)), | |
inputs=[ | |
renderer, | |
preprocessed, | |
session_hash, | |
seed, | |
chunk_strategy, | |
cfg, | |
preset_traj, | |
num_frames, | |
zoom_factor, | |
camera_scale, | |
], | |
outputs=[ | |
output_video, | |
render_btn, | |
abort_btn, | |
render_progress, | |
], | |
show_progress_on=[render_progress], | |
concurrency_id="gpu_queue", | |
) | |
render_btn.click( | |
lambda: [ | |
gr.update(visible=False), | |
gr.update(visible=True), | |
gr.update(visible=True), | |
], | |
outputs=[render_btn, abort_btn, render_progress], | |
) | |
abort_btn.click(set_abort_event) | |
with gr.Tab("Advanced"): | |
render_btn = gr.Button("Render video", interactive=False, render=False) | |
viewport = gr.HTML(container=True, render=False) | |
gr.Timer(0.1).tick( | |
lambda renderer: gr.update( | |
interactive=renderer is not None | |
and renderer.gui_state is not None | |
and renderer.gui_state.camera_traj_list is not None | |
), | |
inputs=[renderer], | |
outputs=[render_btn], | |
) | |
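                    # The camera trajectory is authored inside the viser viewport,
                    # which gradio cannot observe directly, so the 10 Hz timer
                    # above polls the renderer's GUI state and enables the render
                    # button once a trajectory exists.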
with gr.Row(): | |
viewport.render() | |
with gr.Row(): | |
with gr.Column(): | |
with gr.Group(): | |
# Initially disable the Preprocess Images button until images are selected. | |
preprocess_btn = gr.Button("Preprocess images", interactive=False) | |
preprocess_progress = gr.Textbox( | |
label="", | |
visible=False, | |
interactive=False, | |
) | |
with gr.Group(): | |
input_imgs = gr.Gallery( | |
interactive=True, | |
label="Input", | |
columns=4, | |
height=200, | |
) | |
                        # Define example images (gradio doesn't support
                        # variable-length examples, so we work around it).
example_imgs = gr.Gallery( | |
[e[0] for e in ADVANCE_EXAMPLE_MAP], | |
allow_preview=False, | |
preview=False, | |
label="Example", | |
columns=20, | |
rows=1, | |
height=115, | |
) | |
example_imgs_expander = gr.Gallery( | |
visible=False, | |
interactive=False, | |
label="Example", | |
preview=True, | |
columns=20, | |
rows=1, | |
) | |
chunk_strategy = gr.Dropdown( | |
["interp-gt", "interp"], | |
label="Chunk strategy", | |
value="interp-gt", | |
render=False, | |
) | |
with gr.Row(): | |
example_imgs_backer = gr.Button( | |
"Go back", visible=False | |
) | |
example_imgs_confirmer = gr.Button( | |
"Confirm", visible=False | |
) | |
example_imgs.select( | |
get_advance_examples, | |
outputs=[ | |
example_imgs_expander, | |
example_imgs_confirmer, | |
example_imgs_backer, | |
example_imgs, | |
], | |
) | |
example_imgs_confirmer.click( | |
lambda x: ( | |
x, | |
gr.update(visible=False), | |
gr.update(visible=False), | |
gr.update(visible=False), | |
gr.update(visible=True), | |
gr.update(interactive=bool(x)) | |
), | |
inputs=[example_imgs_expander], | |
outputs=[ | |
input_imgs, | |
example_imgs_expander, | |
example_imgs_confirmer, | |
example_imgs_backer, | |
example_imgs, | |
preprocess_btn | |
], | |
) | |
example_imgs_backer.click( | |
lambda: ( | |
gr.update(visible=False), | |
gr.update(visible=False), | |
gr.update(visible=False), | |
gr.update(visible=True), | |
), | |
outputs=[ | |
example_imgs_expander, | |
example_imgs_confirmer, | |
example_imgs_backer, | |
example_imgs, | |
], | |
) | |
preprocessed = gr.State() | |
preprocess_btn.click( | |
lambda r, *args: r.preprocess(*args), | |
inputs=[renderer, input_imgs], | |
outputs=[ | |
preprocessed, | |
preprocess_progress, | |
chunk_strategy, | |
], | |
show_progress_on=[preprocess_progress], | |
concurrency_id="gpu_queue", | |
) | |
preprocess_btn.click( | |
lambda: gr.update(visible=True), | |
outputs=[preprocess_progress], | |
) | |
preprocessed.change( | |
lambda r, *args: r.visualize_scene(*args), | |
inputs=[renderer, preprocessed], | |
) | |
with gr.Row(): | |
seed = gr.Number(value=23, label="Random seed") | |
chunk_strategy.render() | |
cfg = gr.Slider(1.0, 7.0, value=3.0, label="CFG value") | |
with gr.Row(): | |
camera_scale = gr.Slider( | |
0.1, | |
15.0, | |
value=2.0, | |
label="Camera scale (useful for single-view input)", | |
) | |
with gr.Group(): | |
output_data_dir = gr.Textbox(label="Output data directory") | |
output_data_btn = gr.Button("Export output data") | |
output_data_btn.click( | |
lambda r, *args: r.export_output_data(*args), | |
inputs=[renderer, preprocessed, output_data_dir], | |
) | |
with gr.Column(): | |
with gr.Group(): | |
abort_btn = gr.Button("Abort rendering", visible=False) | |
render_btn.render() | |
render_progress = gr.Textbox( | |
label="", visible=False, interactive=False | |
) | |
output_video = gr.Video( | |
label="Output", interactive=False, autoplay=True, loop=True | |
) | |
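                        # Unlike the Basic tab, the trajectory here comes from the
                        # viser GUI: the three bare `gr.State()` placeholders below
                        # stand in for preset_traj / num_frames / zoom_factor and
                        # default to None, which makes `render` fall back to
                        # `get_target_c2ws_and_Ks_from_gui`.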
render_btn.click( | |
lambda r, *args: (yield from r.render(*args)), | |
inputs=[ | |
renderer, | |
preprocessed, | |
session_hash, | |
seed, | |
chunk_strategy, | |
cfg, | |
gr.State(), | |
gr.State(), | |
gr.State(), | |
camera_scale, | |
], | |
outputs=[ | |
output_video, | |
render_btn, | |
abort_btn, | |
render_progress, | |
], | |
show_progress_on=[render_progress], | |
concurrency_id="gpu_queue", | |
) | |
render_btn.click( | |
lambda: [ | |
gr.update(visible=False), | |
gr.update(visible=True), | |
gr.update(visible=True), | |
], | |
outputs=[render_btn, abort_btn, render_progress], | |
) | |
abort_btn.click(set_abort_event) | |
# Register the session initialization and cleanup functions. | |
app.load( | |
start_server_and_abort_event, | |
outputs=[renderer, viewport, session_hash], | |
) | |
app.unload(stop_server_and_abort_event) | |
app.queue(max_size=5).launch( | |
share=share, | |
server_port=server_port, | |
show_error=True, | |
allowed_paths=[WORK_DIR], | |
        # Badge rendering will be broken otherwise.
ssr_mode=False, | |
) | |
if __name__ == "__main__": | |
tyro.cli(main) | |