RoyYang0714 committed on
Commit 8fc06b5 · 1 Parent(s): c8fc399

feat: Add 3D-MOOD gradio demo.

Files changed (1):
    app.py +165 -6
app.py CHANGED
 @@ -1,16 +1,175 @@
+"""Gradio Demo for 3D-MOOD."""
+
 import gradio as gr
 import spaces
+import gc
+import os
+
+import numpy as np
 import torch
+from PIL import Image
+
+from vis4d.data.transforms.base import compose
+from vis4d.data.transforms.normalize import NormalizeImages
+from vis4d.data.transforms.resize import ResizeImages, ResizeIntrinsics
+from vis4d.data.transforms.to_tensor import ToTensor
+from vis4d.common.ckpt import load_model_checkpoint
+from vis4d.op.fpp.fpn import FPN
+from vis4d.vis.image.functional import imshow_bboxes3d
+
+from opendet3d.data.transforms.pad import CenterPadImages, CenterPadIntrinsics
+from opendet3d.data.transforms.resize import GenResizeParameters
+from opendet3d.model.detect3d.grounding_dino_3d import GroundingDINO3D
+from opendet3d.op.base.swin import SwinTransformer
+from opendet3d.op.detect3d.grounding_dino_3d import (
+    GroundingDINO3DCoder,
+    GroundingDINO3DHead,
+    RoI2Det3D,
+    UniDepthHead,
+)
+from opendet3d.op.fpp.channel_mapper import ChannelMapper
+
+
+def get_3d_mood_swin_base(
+    max_per_image: int = 100, score_thres: float = 0.1
+) -> GroundingDINO3D:
+    """Build the 3D-MOOD model with the Swin-Base backbone."""
+    basemodel = SwinTransformer(
+        convert_weights=True,
+        pretrain_img_size=384,
+        embed_dims=128,
+        depths=[2, 2, 18, 2],
+        num_heads=[4, 8, 16, 32],
+        window_size=12,
+        drop_path_rate=0.3,
+        out_indices=(0, 1, 2, 3),
+    )
+
+    neck = ChannelMapper(
+        in_channels=[256, 512, 1024],
+        out_channels=256,
+        num_outs=4,
+        kernel_size=1,
+        norm="GroupNorm",
+        num_groups=32,
+        activation=None,
+        bias=True,
+    )
 
-zero = torch.Tensor([0]).cuda()
-print(zero.device) # <-- 'cpu' 🤔
+    depth_fpn = FPN(
+        in_channels_list=[128, 256, 512, 1024],
+        out_channels=256,
+        extra_blocks=None,
+        start_index=0,
+    )
+
+    depth_head = UniDepthHead(input_dims=[256, 256, 256, 256])
+
+    box_coder = GroundingDINO3DCoder()
+
+    bbox3d_head = GroundingDINO3DHead(box_coder=box_coder)
+
+    roi2det3d = RoI2Det3D(max_per_img=max_per_image, score_threshold=score_thres)
+
+    return GroundingDINO3D(
+        basemodel=basemodel,
+        neck=neck,
+        bbox3d_head=bbox3d_head,
+        roi2det3d=roi2det3d,
+        fpn=depth_fpn,
+        depth_head=depth_head,
+    )
 
 
 @spaces.GPU
-def greet(n):
-    print(zero.device) # <-- 'cuda:0' 🤗
-    return f"Hello {zero + n} Tensor"
+def run_3d_mood(image, fx, fy, cx, cy):
+    """Run the 3D-MOOD demo on a single image."""
+
+    gc.collect()
+
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+
+    # Data
+    images = image.astype(np.float32)[None, ...]
+    intrinsics = np.array([[fx, 0, cx], [0, fy, cy], [0, 0, 1]])
+
+    data_dict = {
+        "images": images,
+        "original_images": images,
+        "input_hw": (images.shape[1], images.shape[2]),
+        "original_hw": (images.shape[1], images.shape[2]),
+        "intrinsics": intrinsics,
+        "original_intrinsics": intrinsics,
+    }
+    # Transform
+    preprocess_transforms = compose(
+        transforms=[
+            GenResizeParameters(shape=(800, 1333)),
+            ResizeImages(),
+            ResizeIntrinsics(),
+            NormalizeImages(),
+            CenterPadImages(stride=1, shape=(800, 1333), update_input_hw=True),
+            CenterPadIntrinsics(),
+        ]
+    )
+
+    data = preprocess_transforms([data_dict])[0]
+
+    # Convert to Tensor
+    to_tensor = ToTensor()
+    data = to_tensor([data])[0]
+
+    # Model
+    model = get_3d_mood_swin_base().to(device)
+
+    load_model_checkpoint(
+        model,
+        weights="https://huggingface.co/RoyYang0714/3D-MOOD/resolve/main/gdino3d_swin-b_120e_omni3d_834c97.pt",
+        rev_keys=[(r"^model\.", ""), (r"^module\.", "")],
+    )
+
+    model.eval()
+
+    # Run prediction
+    with torch.no_grad():
+        boxes, boxes3d, scores, class_ids, depth_maps, categories = model(
+            images=data["images"].to(device),
+            input_hw=[data["input_hw"]],
+            original_hw=[data["original_hw"]],
+            intrinsics=data["intrinsics"].to(device)[None],
+            padding=[data["padding"]],
+            input_texts=["sofa"],
+        )
+
+    # Save the prediction for visualization
+    imshow_bboxes3d(
+        image=data["original_images"].cpu(),
+        boxes3d=[b.cpu() for b in boxes3d],
+        intrinsics=data["original_intrinsics"].cpu().numpy(),
+        scores=[s.cpu() for s in scores],
+        class_ids=[c.cpu() for c in class_ids],
+        class_id_mapping={0: "sofa"},
+        file_path="./output.png",
+    )
+
+    output = Image.open("./output.png")
+    os.remove("./output.png")
+
+    return output
 
 
-demo = gr.Interface(fn=greet, inputs=gr.Number(), outputs=gr.Text())
+demo = gr.Interface(
+    fn=run_3d_mood,
+    inputs=[
+        gr.Image(),
+        gr.Number(label="fx"),
+        gr.Number(label="fy"),
+        gr.Number(label="cx"),
+        gr.Number(label="cy"),
+    ],
+    examples=[["rgb.png", 518.8579, 519.4696, 325.58246, 253.73616]],
+    outputs="image",
+    title="3D-MOOD with Swin-B",
+    description="3D-MOOD: Lifting 2D to 3D for Monocular Open-Set Object Detection",
+)
 demo.launch()
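
Note on the numeric inputs: run_3d_mood assembles fx, fy, cx, cy into the standard 3x3 pinhole camera intrinsics matrix. A minimal sketch of what the example values mean, using a made-up camera-space point for illustration:

import numpy as np

# fx, fy, cx, cy from the demo's examples row.
fx, fy, cx, cy = 518.8579, 519.4696, 325.58246, 253.73616

# The same pinhole intrinsics matrix that run_3d_mood builds.
K = np.array([[fx, 0.0, cx],
              [0.0, fy, cy],
              [0.0, 0.0, 1.0]])

# Project a hypothetical camera-space point (X, Y, Z), in meters, to pixels.
X, Y, Z = 0.5, -0.2, 2.0
u, v, w = K @ np.array([X, Y, Z])
print(u / w, v / w)  # pixel coordinates; w equals the depth Z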
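
Once the Space is running, the endpoint can also be exercised programmatically with gradio_client. This is a sketch under stated assumptions: the Space id RoyYang0714/3D-MOOD is a guess (substitute the actual Space hosting this app.py), rgb.png is the sample image referenced by the demo's examples row, and /predict is the default endpoint name a gr.Interface exposes:

from gradio_client import Client, handle_file

client = Client("RoyYang0714/3D-MOOD")  # assumed Space id

# Same inputs as the demo's examples row: image plus fx, fy, cx, cy.
result = client.predict(
    handle_file("rgb.png"),
    518.8579,
    519.4696,
    325.58246,
    253.73616,
    api_name="/predict",
)
print(result)  # local path to the rendered prediction image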