import gradio as gr
# import spaces
import os
import subprocess
import torch
print(torch.__version__)
print(torch.version.cuda)
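
# Install a standalone CUDA toolkit at startup so that nvcc is available when the
# DROID-SLAM / lietorch CUDA extensions below are compiled.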
def install_cuda_toolkit():
# CUDA_TOOLKIT_URL = "https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda_11.8.0_520.61.05_linux.run"
# CUDA_TOOLKIT_URL = "https://developer.download.nvidia.com/compute/cuda/12.2.0/local_installers/cuda_12.2.0_535.54.03_linux.run"
CUDA_TOOLKIT_URL = "https://developer.download.nvidia.com/compute/cuda/12.1.0/local_installers/cuda_12.1.0_530.30.02_linux.run"
CUDA_TOOLKIT_FILE = "/tmp/%s" % os.path.basename(CUDA_TOOLKIT_URL)
subprocess.call(["wget", "-q", CUDA_TOOLKIT_URL, "-O", CUDA_TOOLKIT_FILE])
subprocess.call(["chmod", "+x", CUDA_TOOLKIT_FILE])
subprocess.call([CUDA_TOOLKIT_FILE, "--silent", "--toolkit"])
os.environ["CUDA_HOME"] = "/usr/local/cuda"
os.environ["PATH"] = "%s/bin:%s" % (os.environ["CUDA_HOME"], os.environ["PATH"])
os.environ["LD_LIBRARY_PATH"] = "%s/lib:%s" % (
os.environ["CUDA_HOME"],
"" if "LD_LIBRARY_PATH" not in os.environ else os.environ["LD_LIBRARY_PATH"],
)
# Fix: arch_list[-1] += '+PTX'; IndexError: list index out of range
os.environ["TORCH_CUDA_ARCH_LIST"] = "8.0;8.6"
install_cuda_toolkit()
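
# Build and install the DROID-SLAM and lietorch CUDA extensions against the toolkit
# installed above.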
os.system('pip install ./thirdparty/DROID-SLAM')
os.system('pip install ./thirdparty/DROID-SLAM/thirdparty/lietorch')
import numpy as np
from easydict import EasyDict
from scripts.scripts_test_video.detect_track_video import detect_track_video
from scripts.scripts_test_video.hawor_video import hawor_motion_estimation, hawor_infiller
from scripts.scripts_test_video.hawor_slam import hawor_slam
from hawor.utils.process import get_mano_faces, run_mano, run_mano_left
from lib.eval_utils.custom_utils import load_slam_cam
from lib.vis.run_vis2 import run_vis2_on_video, run_vis2_on_video_cam
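
# End-to-end reconstruction: detect and track hands, estimate hand motion, recover the
# camera trajectory with SLAM, infill missing frames, then render the result in world
# space with aitviewer.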
def render_reconstruction(input_video, img_focal):
    args = EasyDict()
    args.video_path = input_video
    args.input_type = 'file'
    args.checkpoint = './weights/hawor/checkpoints/hawor.ckpt'
    args.infiller_weight = './weights/hawor/checkpoints/infiller.pt'
    args.vis_mode = 'world'
    args.img_focal = img_focal
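
    # Detect and track hands, then estimate per-frame hand motion for the tracked range.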
    start_idx, end_idx, seq_folder, imgfiles = detect_track_video(args)
    frame_chunks_all, img_focal = hawor_motion_estimation(args, start_idx, end_idx, seq_folder)
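
    # Recover the (scaled) camera trajectory with SLAM and load the camera-to-world /
    # world-to-camera transforms it produced.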
    hawor_slam(args, start_idx, end_idx)
    slam_path = os.path.join(seq_folder, f"SLAM/hawor_slam_w_scale_{start_idx}_{end_idx}.npz")
    R_w2c_sla_all, t_w2c_sla_all, R_c2w_sla_all, t_c2w_sla_all = load_slam_cam(slam_path)
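
    # Run the motion infiller to obtain per-frame hand translation, rotation, pose and shape.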
    pred_trans, pred_rot, pred_hand_pose, pred_betas, pred_valid = hawor_infiller(args, start_idx, end_idx, frame_chunks_all)

    # vis sequence for this video
    hand2idx = {
        "right": 1,
        "left": 0
    }
    vis_start = 0
    vis_end = pred_trans.shape[1] - 1

    # get faces
    faces = get_mano_faces()
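    # Extra triangles appended to the MANO topology, presumably to close the mesh at the
    # wrist so the rendered hand has no visible hole.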
    faces_new = np.array([[92, 38, 234],
                          [234, 38, 239],
                          [38, 122, 239],
                          [239, 122, 279],
                          [122, 118, 279],
                          [279, 118, 215],
                          [118, 117, 215],
                          [215, 117, 214],
                          [117, 119, 214],
                          [214, 119, 121],
                          [119, 120, 121],
                          [121, 120, 78],
                          [120, 108, 78],
                          [78, 108, 79]])
    faces_right = np.concatenate([faces, faces_new], axis=0)

    # get right hand vertices
    hand = 'right'
    hand_idx = hand2idx[hand]
    pred_glob_r = run_mano(pred_trans[hand_idx:hand_idx+1, vis_start:vis_end], pred_rot[hand_idx:hand_idx+1, vis_start:vis_end], pred_hand_pose[hand_idx:hand_idx+1, vis_start:vis_end], betas=pred_betas[hand_idx:hand_idx+1, vis_start:vis_end])
    right_verts = pred_glob_r['vertices'][0]
    right_dict = {
        'vertices': right_verts.unsqueeze(0),
        'faces': faces_right,
    }

    # get left hand vertices
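    # The left hand mirrors the right, so reverse the triangle winding order to keep the
    # face normals pointing outward.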
    faces_left = faces_right[:, [0, 2, 1]]
    hand = 'left'
    hand_idx = hand2idx[hand]
    pred_glob_l = run_mano_left(pred_trans[hand_idx:hand_idx+1, vis_start:vis_end], pred_rot[hand_idx:hand_idx+1, vis_start:vis_end], pred_hand_pose[hand_idx:hand_idx+1, vis_start:vis_end], betas=pred_betas[hand_idx:hand_idx+1, vis_start:vis_end])
    left_verts = pred_glob_l['vertices'][0]
    left_dict = {
        'vertices': left_verts.unsqueeze(0),
        'faces': faces_left,
    }
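
    # R_x is a 180-degree rotation about the x-axis (flips y and z). It is applied to the
    # camera poses and hand vertices, presumably to convert the SLAM world convention to
    # the viewer's convention; the world-to-camera transforms are then recomputed.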
    R_x = torch.tensor([[1, 0, 0],
                        [0, -1, 0],
                        [0, 0, -1]]).float()
    R_c2w_sla_all = torch.einsum('ij,njk->nik', R_x, R_c2w_sla_all)
    t_c2w_sla_all = torch.einsum('ij,nj->ni', R_x, t_c2w_sla_all)
    R_w2c_sla_all = R_c2w_sla_all.transpose(-1, -2)
    t_w2c_sla_all = -torch.einsum("bij,bj->bi", R_w2c_sla_all, t_c2w_sla_all)
    left_dict['vertices'] = torch.einsum('ij,btnj->btni', R_x, left_dict['vertices'].cpu())
    right_dict['vertices'] = torch.einsum('ij,btnj->btni', R_x, right_dict['vertices'].cpu())

    # Here we use aitviewer (https://github.com/eth-ait/aitviewer) for simple visualization.
    if args.vis_mode == 'world':
        output_pth = os.path.join(seq_folder, f"vis_{vis_start}_{vis_end}")
        if not os.path.exists(output_pth):
            os.makedirs(output_pth)
        image_names = imgfiles[vis_start:vis_end]
        print(f"vis {vis_start} to {vis_end}")
        vis_video_path = run_vis2_on_video(left_dict, right_dict, output_pth, img_focal, image_names, R_c2w=R_c2w_sla_all[vis_start:vis_end], t_c2w=t_c2w_sla_all[vis_start:vis_end], interactive=False)
    elif args.vis_mode == 'cam':
        # Camera-space visualization (run_vis2_on_video_cam) is not exposed in this demo.
        # output_pth = os.path.join(seq_folder, f"vis_{vis_start}_{vis_end}")
        # if not os.path.exists(output_pth):
        #     os.makedirs(output_pth)
        # image_names = imgfiles[vis_start:vis_end]
        # print(f"vis {vis_start} to {vis_end}")
        # run_vis2_on_video_cam(left_dict, right_dict, output_pth, img_focal, image_names, R_w2c=R_w2c_sla_all[vis_start:vis_end], t_w2c=t_w2c_sla_all[vis_start:vis_end])
        raise NotImplementedError
    return vis_video_path
# @spaces.GPU()
def run_wilow_model(image, conf, IoU_threshold=0.5):
    # NOTE: unused leftover stub; the detector that would produce `img_vis` and
    # `detections` is never loaded in this Space, so the original body is kept only
    # as a reference and the function raises if called.
    # img_cv2 = image[..., ::-1]
    # return img_vis.astype(np.float32) / 255.0, len(detections), None
    raise NotImplementedError
header = ('''
<div class="embed_hidden" style="text-align: center;">
<h1> <b>HaWoR</b>: World-Space Hand Motion Reconstruction from Egocentric Videos</h1>
<h3>
<a href="" target="_blank" rel="noopener noreferrer">Jinglei Zhang</a><sup>1</sup>,
<a href="https://jiankangdeng.github.io/" target="_blank" rel="noopener noreferrer">Jiankang Deng</a><sup>2</sup>,
<br>
<a href="https://scholar.google.com/citations?user=syoPhv8AAAAJ&hl=en" target="_blank" rel="noopener noreferrer">Chao Ma</a><sup>1</sup>,
<a href="https://rolpotamias.github.io" target="_blank" rel="noopener noreferrer">Rolandos Alexandros Potamias</a><sup>2</sup>
</h3>
<h3>
<sup>1</sup>Shanghai Jiao Tong University;
<sup>2</sup>Imperial College London
</h3>
</div>
<div style="display:flex; gap: 0.3rem; justify-content: center; align-items: center;" align="center">
<a href='https://arxiv.org/abs/xxxx.xxxxx'><img src='https://img.shields.io/badge/Arxiv-xxxx.xxxxx-A42C25?style=flat&logo=arXiv&logoColor=A42C25'></a>
<a href=''><img src='https://img.shields.io/badge/Paper-PDF-yellow?style=flat&logo=arXiv&logoColor=yellow'></a>
<a href='https://hawor-project.github.io/'><img src='https://img.shields.io/badge/Project-Page-%23df5b46?style=flat&logo=Google%20chrome&logoColor=%23df5b46'></a>
<a href='https://github.com/ThunderVVV/HaWoR'><img src='https://img.shields.io/badge/GitHub-Code-black?style=flat&logo=github&logoColor=white'></a>
</div>
''')
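
# Minimal Gradio interface: upload an egocentric video, set the camera focal length,
# and receive the rendered world-space hand reconstruction.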
with gr.Blocks(title="HaWoR: World-Space Hand Motion Reconstruction from Egocentric Videos", css=".gradio-container") as demo:
    gr.Markdown(header)

    with gr.Row():
        with gr.Column():
            input_video = gr.Video(label="Input video", sources=["upload"])
            img_focal = gr.Number(label="Focal Length", value=600)
            # threshold = gr.Slider(value=0.3, minimum=0.05, maximum=0.95, step=0.05, label='Detection Confidence Threshold')
            # nms = gr.Slider(value=0.5, minimum=0.05, maximum=0.95, step=0.05, label='IoU NMS Threshold')
            submit = gr.Button("Submit", variant="primary")
        with gr.Column():
            reconstruction = gr.Video(label="Reconstruction", show_download_button=True)
            # hands_detected = gr.Textbox(label="Hands Detected")

    submit.click(fn=render_reconstruction, inputs=[input_video, img_focal], outputs=[reconstruction])

    with gr.Row():
        example_images = gr.Examples([
            ['./example/video_0.mp4']
        ], inputs=input_video)
demo.launch(debug=True)