Spaces: ipekoztas (Runtime error)

Commit b7c5eaf · committed by ipekoztas
1 Parent(s): 7860b91
Code upload.

Browse files:
- README.md +1 -1
- app.py +93 -78
- requirements.txt +11 -3
- src/data/objaverse.py +17 -21
- src/data/objaverse_zero123plus.py +124 -0
- src/model.py +2 -9
- src/model_mesh.py +2 -2
- src/models/decoder/transformer.py +55 -16
- src/models/lrm.py +37 -4
- src/models/lrm_mesh.py +26 -9
- src/utils/infer_util.py +14 -1
README.md
CHANGED

@@ -1,5 +1,5 @@
 ---
-title: …
+title: 3D Stylization LRM
 emoji: π
 colorFrom: indigo
 colorTo: green
app.py
CHANGED

@@ -30,6 +30,7 @@ from huggingface_hub import hf_hub_download
 import gradio as gr
 
 
+
 def get_render_cameras(batch_size=1, M=120, radius=2.5, elevation=10.0, is_flexicubes=False):
     """
     Get the rendering camera parameters.
@@ -90,7 +91,7 @@ if cuda_path:
 else:
     print("CUDA installation not found")
 
-config_path = 'configs/instant-…
+config_path = 'configs/instant-nerf-large.yaml'
 config = OmegaConf.load(config_path)
 config_name = os.path.basename(config_path).replace('.yaml', '')
 model_config = config.model_config
@@ -120,7 +121,7 @@ pipeline = pipeline.to(device)
 
 # load reconstruction model
 print('Loading reconstruction model ...')
-model_ckpt_path = hf_hub_download(repo_id="TencentARC/InstantMesh", filename="…
+model_ckpt_path = hf_hub_download(repo_id="TencentARC/InstantMesh", filename="instant_nerf_large.ckpt", repo_type="model")
 model = instantiate_from_config(model_config)
 state_dict = torch.load(model_ckpt_path, map_location='cpu')['state_dict']
 state_dict = {k[14:]: v for k, v in state_dict.items() if k.startswith('lrm_generator.') and 'source_camera' not in k}
@@ -134,6 +135,10 @@ print('Loading Finished!')
 def check_input_image(input_image):
     if input_image is None:
         raise gr.Error("No image uploaded!")
+
+def check_style_image(style_image):
+    if style_image is None:
+        raise gr.Error("No style image uploaded!")
 
 
 def preprocess(input_image, do_remove_background):
@@ -158,7 +163,7 @@ def generate_mvs(input_image, sample_steps, sample_seed):
         num_inference_steps=sample_steps
     ).images[0]
 
-    show_image = np.asarray(z123_image, dtype=np.uint8)
+    show_image = np.asarray(z123_image, dtype=np.uint8).copy()
     show_image = torch.from_numpy(show_image)  # (960, 640, 3)
     show_image = rearrange(show_image, '(n h) (m w) c -> (n m) h w c', n=3, m=2)
     show_image = rearrange(show_image, '(n m) h w c -> (n h) (m w) c', n=2, m=3)
@@ -166,66 +171,53 @@ def generate_mvs(input_image, sample_steps, sample_seed):
 
     return z123_image, show_image
 
-
 @spaces.GPU
-def make3d(…
-…
+def make3d(mv_images, style_image, alpha, style_layers):
+    """
+    mv_images: single multi-view image (pil or numpy)
+    style_image: PIL image
+    alpha: float
+    style_layers: int
+    """
     global model
+
+    # Save the uploaded style image to a temporary file, so the model can read it from disk
+    style_path = tempfile.NamedTemporaryFile(suffix=".png", delete=False).name
+    style_image.save(style_path)
+
     if IS_FLEXICUBES:
         model.init_flexicubes_geometry(device, use_renderer=False)
-    model = model.eval()
 
-    images = np.asarray(…
-    images = torch.from_numpy(images).permute(2, 0, 1).contiguous().float()
-    images = rearrange(images, 'c (n h) (m w) -> (n m) c h w', n=3, m=2)
+    images = np.asarray(mv_images, dtype=np.float32) / 255.0
+    images = torch.from_numpy(images).permute(2, 0, 1).contiguous().float()  # (3, 960, 640)
+    images = rearrange(images, 'c (n h) (m w) -> (n m) c h w', n=3, m=2)     # (6, 3, 320, 320)
 
     input_cameras = get_zero123plus_input_cameras(batch_size=1, radius=4.0).to(device)
-    render_cameras = get_render_cameras(…
+    render_cameras = get_render_cameras(
+        batch_size=1,
+        radius=2.5,
+        is_flexicubes=IS_FLEXICUBES
+    ).to(device)
 
     images = images.unsqueeze(0).to(device)
     images = v2.functional.resize(images, (320, 320), interpolation=3, antialias=True).clamp(0, 1)
 
-    mesh_fpath = tempfile.NamedTemporaryFile(suffix=…
-    print(mesh_fpath)
+    mesh_fpath = tempfile.NamedTemporaryFile(suffix=".obj", delete=False).name
     mesh_basename = os.path.basename(mesh_fpath).split('.')[0]
     mesh_dirname = os.path.dirname(mesh_fpath)
-    video_fpath = os.path.join(mesh_dirname, f"{mesh_basename}.mp4")
     mesh_glb_fpath = os.path.join(mesh_dirname, f"{mesh_basename}.glb")
 
     with torch.no_grad():
-        # get triplane
-        planes = model.forward_planes(…
-        …
-        # frame = model.forward_geometry(
-        #     planes,
-        #     render_cameras[:, i:i+chunk_size],
-        #     render_size=render_size,
-        # )['img']
-        # else:
-        #     frame = model.synthesizer(
-        #         planes,
-        #         cameras=render_cameras[:, i:i+chunk_size],
-        #         render_size=render_size,
-        #     )['images_rgb']
-        #     frames.append(frame)
-        # frames = torch.cat(frames, dim=1)
-
-        # images_to_video(
-        #     frames[0],
-        #     video_fpath,
-        #     fps=30,
-        # )
-
-        # print(f"Video saved to {video_fpath}")
-
-        # get mesh
+        # get triplane, now passing style_path, alpha, style_layers
+        planes = model.forward_planes(
+            images,
+            input_cameras,
+            style_path,
+            float(alpha),
+            int(style_layers),
+        )
+
+        # extract mesh
         mesh_out = model.extract_mesh(
             planes,
             use_texture_map=False,
@@ -234,52 +226,40 @@ def make3d(images):
 
     vertices, faces, vertex_colors = mesh_out
     vertices = vertices[:, [1, 2, 0]]
 
     save_glb(vertices, faces, vertex_colors, mesh_glb_fpath)
     save_obj(vertices, faces, vertex_colors, mesh_fpath)
-
     print(f"Mesh saved to {mesh_fpath}")
-
     return mesh_fpath, mesh_glb_fpath
 
-
 _HEADER_ = '''
-<h2><b>…
-…
-**InstantMesh** is a feed-forward framework for efficient 3D mesh generation from a single image based on the LRM/Instant3D architecture.
+<h2><b>3DStylizationLRM</b></h2>
+This demo lets you provide a content image, a style image, an alpha blending value, and the number of style layers to inject. It will generate 3D geometry stylized accordingly.
 
-…
-- The 3D mesh generation results highly depend on the quality of generated multi-view images. Please try a different **seed value** if the result is unsatisfying (Default: 42).
+βοΈβοΈβοΈ **Notes:**
+- Content image background can be removed automatically.
+- Adjust the **Alpha** slider to control style blending strength.
+- Adjust **Style Layers** to choose how many layers of style to inject.
 '''
 
 _CITE_ = r"""
-If …
+If 3D Stylization LRM is helpful, please help to β the <a href='https://github.com/ipekoztas/3D-Stylization-LRM' target='_blank'>Github Repo</a>. Thanks!
 ---
 π **Citation**
 
 If you find our work useful for your research or applications, please cite using this bibtex:
 ```bibtex
-@article{…
-…
+@article{oztas20253dstylizationlargereconstruction,
+  title={3D Stylization via Large Reconstruction Model},
+  author={Ipek Oztas and Duygu Ceylan and Aysegul Dundar},
+  journal={https://arxiv.org/abs/2504.21836},
+  year={2025}
 }
 ```
-
-Apache-2.0 LICENSE. Please refer to the [LICENSE file](https://huggingface.co/spaces/TencentARC/InstantMesh/blob/main/LICENSE) for details.
-
-π§ **Contact**
-
-If you have any questions, feel free to open a discussion or contact us at <b>[email protected]</b>.
+π **License**
+Apache-2.0 LICENSE.
 """
 
-
 with gr.Blocks() as demo:
     gr.Markdown(_HEADER_)
     with gr.Row(variant="panel"):
@@ -294,6 +274,13 @@ with gr.Blocks() as demo:
                     type="pil",
                     elem_id="content_image",
                 )
+                # Style Image Upload
+                style_image = gr.Image(
+                    label="Style Image",
+                    image_mode="RGB",
+                    type="pil",
+                    elem_id="style_image",
+                )
                 processed_image = gr.Image(
                     label="Processed Image",
                     image_mode="RGBA",
@@ -317,6 +304,22 @@ with gr.Blocks() as demo:
                     step=5
                 )
 
+            with gr.Row():
+                alpha = gr.Slider(
+                    label="Alpha Value",
+                    minimum=0.0,
+                    maximum=1.0,
+                    value=0.7,
+                    step=0.01,
+                )
+                style_layers = gr.Slider(
+                    label="Style Layers",
+                    minimum=1,
+                    maximum=10,
+                    value=4,
+                    step=1,
+                )
+
             with gr.Row():
                 submit = gr.Button("Generate", elem_id="generate", variant="primary")
 
@@ -330,6 +333,16 @@ with gr.Blocks() as demo:
                    cache_examples=False,
                    examples_per_page=16
                )
+            with gr.Row(variant="panel"):
+                gr.Examples(
+                    examples=[
+                        os.path.join("styles", img_name) for img_name in sorted(os.listdir("styles"))
+                    ],
+                    inputs=[input_image],
+                    label="Styles",
+                    cache_examples=False,
+                    examples_per_page=16
+                )
 
        with gr.Column():
 
@@ -372,19 +385,21 @@ with gr.Blocks() as demo:
 
    mv_images = gr.State()
 
+    # Chain of actions:
    submit.click(fn=check_input_image, inputs=[input_image]).success(
+        fn=check_style_image, inputs=[style_image]
+    ).success(
        fn=preprocess,
        inputs=[input_image, do_remove_background],
        outputs=[processed_image],
    ).success(
        fn=generate_mvs,
        inputs=[processed_image, sample_steps, sample_seed],
-        outputs=[mv_images, mv_show_images]
-
+        outputs=[mv_images, mv_show_images],
    ).success(
        fn=make3d,
-        inputs=[mv_images],
-        outputs=[output_model_obj, output_model_glb]
+        inputs=[mv_images, style_image, alpha, style_layers],
+        outputs=[output_model_obj, output_model_glb],
    )
 
demo.launch()
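For reference, a minimal sketch of how the updated `make3d` signature is exercised by the chain above, written as a plain script. It assumes the module-level globals from app.py (`pipeline`, `model`, `device`, `IS_FLEXICUBES`) are already initialized; the file paths and the step/seed/alpha/style-layer values below are placeholders, not values fixed by this commit.

```python
from PIL import Image

# Hypothetical standalone driver mirroring the Gradio chain in app.py:
# check inputs -> preprocess -> generate_mvs -> make3d(mv_images, style_image, alpha, style_layers)
content = Image.open("examples/content.png")  # placeholder path
style = Image.open("styles/style.png")        # placeholder path

check_input_image(content)
check_style_image(style)

processed = preprocess(content, do_remove_background=True)
mv_images, mv_show = generate_mvs(processed, sample_steps=75, sample_seed=42)

# alpha in [0, 1] controls how strongly style cross-attention is blended in;
# style_layers picks how many decoder layers receive the style features.
obj_path, glb_path = make3d(mv_images, style, alpha=0.7, style_layers=4)
print(obj_path, glb_path)
```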
requirements.txt
CHANGED

@@ -1,7 +1,12 @@
+pydantic==2.10.6
+gradio==4.44.1
+gradio-client==1.3.0
+huggingface-hub==0.25.2
 torch==2.1.0
 torchvision==0.16.0
 torchaudio==2.1.0
 pytorch-lightning==2.1.2
+
 einops
 omegaconf
 deepspeed
@@ -12,12 +17,15 @@ tensorboard
 PyMCubes
 trimesh
 rembg
-transformers==4.…
-diffusers==0.…
+transformers==4.39.3
+diffusers==0.27.0
+tokenizers==0.15.2
+
 bitsandbytes
 imageio[ffmpeg]
 xatlas
 plyfile
+
 xformers==0.0.22.post7
 git+https://github.com/NVlabs/nvdiffrast/
-…
+onnxruntime
src/data/objaverse.py
CHANGED

@@ -22,7 +22,7 @@ from src.utils.train_util import instantiate_from_config
 from src.utils.camera_util import (
     FOV_to_intrinsics,
     center_looking_at_camera_pose,
-    …
+    get_circular_camera_poses,
 )
 
 
@@ -78,7 +78,7 @@ class ObjaverseData(Dataset):
         input_image_dir='rendering_random_32views',
         target_image_dir='rendering_random_32views',
         input_view_num=6,
-        target_view_num=…
+        target_view_num=4,
         total_view_n=32,
         fov=50,
         camera_rotation=True,
@@ -99,7 +99,7 @@ class ObjaverseData(Dataset):
         paths = filtered_dict['good_objs']
         self.paths = paths
 
-        self.depth_scale = …
+        self.depth_scale = 6.0
 
         total_objects = len(self.paths)
         print('============= length of dataset %d =============' % len(self.paths))
@@ -122,7 +122,6 @@ class ObjaverseData(Dataset):
         return image, alpha
 
     def __getitem__(self, index):
-        # load data
         while True:
             input_image_path = os.path.join(self.root_dir, self.input_image_dir, self.paths[index])
             target_image_path = os.path.join(self.root_dir, self.target_image_dir, self.paths[index])
@@ -212,7 +211,7 @@ class ObjaverseData(Dataset):
 
         # random scaling
         if np.random.rand() < 0.5:
-            scale = np.random.uniform(0.…
+            scale = np.random.uniform(0.7, 1.1)
             c2ws[:, :3, 3] *= scale
             depths *= scale
 
@@ -221,11 +220,11 @@ class ObjaverseData(Dataset):
         Ks = K.unsqueeze(0).repeat(self.input_view_num + self.target_view_num, 1, 1).float()
 
         data = {
-            'input_images': images[:self.input_view_num],
+            'input_images': images[:self.input_view_num],      # (6, 3, H, W)
             'input_alphas': alphas[:self.input_view_num],      # (6, 1, H, W)
             'input_depths': depths[:self.input_view_num],      # (6, 1, H, W)
             'input_normals': normals[:self.input_view_num],    # (6, 3, H, W)
-            'input_c2ws': …
+            'input_c2ws': c2ws[:self.input_view_num],           # (6, 4, 4)
             'input_Ks': Ks[:self.input_view_num],               # (6, 3, 3)
 
             # lrm generator input and supervision
@@ -235,8 +234,6 @@ class ObjaverseData(Dataset):
             'target_normals': normals[self.input_view_num:],    # (V, 3, H, W)
             'target_c2ws': c2ws[self.input_view_num:],          # (V, 4, 4)
             'target_Ks': Ks[self.input_view_num:],              # (V, 3, 3)
-
-            'depth_available': 1,
         }
         return data
 
@@ -245,8 +242,8 @@ class ValidationData(Dataset):
     def __init__(self,
         root_dir='objaverse/',
         input_view_num=6,
-        input_image_size=…
-        fov=…
+        input_image_size=320,
+        fov=30,
     ):
         self.root_dir = Path(root_dir)
         self.input_view_num = input_view_num
@@ -256,9 +253,9 @@ class ValidationData(Dataset):
         self.paths = sorted(os.listdir(self.root_dir))
         print('============= length of dataset %d =============' % len(self.paths))
 
-        cam_distance = …
+        cam_distance = 4.0
         azimuths = np.array([30, 90, 150, 210, 270, 330])
-        elevations = np.array([…
+        elevations = np.array([20, -10, 20, -10, 20, -10])
         azimuths = np.deg2rad(azimuths)
         elevations = np.deg2rad(elevations)
 
@@ -272,7 +269,7 @@ class ValidationData(Dataset):
         self.c2ws = c2ws.float()
         self.Ks = FOV_to_intrinsics(self.fov).unsqueeze(0).repeat(6, 1, 1).float()
 
-        render_c2ws = …
+        render_c2ws = get_circular_camera_poses(M=8, radius=cam_distance, elevation=20.0)
         render_Ks = FOV_to_intrinsics(self.fov).unsqueeze(0).repeat(render_c2ws.shape[0], 1, 1)
         self.render_c2ws = render_c2ws.float()
         self.render_Ks = render_Ks.float()
@@ -303,7 +300,6 @@ class ValidationData(Dataset):
         input_image_path = os.path.join(self.root_dir, self.paths[index])
 
         '''background color, default: white'''
-        # color = np.random.uniform(0.48, 0.52)
         bkg_color = [1.0, 1.0, 1.0]
 
         image_list = []
@@ -314,14 +310,14 @@ class ValidationData(Dataset):
             image_list.append(image)
             alpha_list.append(alpha)
 
-        images = torch.stack(image_list, dim=0).float()
-        alphas = torch.stack(alpha_list, dim=0).float()
+        images = torch.stack(image_list, dim=0).float()
+        alphas = torch.stack(alpha_list, dim=0).float()
 
         data = {
-            'input_images': images,
-            'input_alphas': alphas,
-            'input_c2ws': self.c2ws,
-            'input_Ks': self.Ks,
+            'input_images': images,
+            'input_alphas': alphas,
+            'input_c2ws': self.c2ws,
+            'input_Ks': self.Ks,
 
             'render_c2ws': self.render_c2ws,
             'render_Ks': self.render_Ks,
src/data/objaverse_zero123plus.py
ADDED

@@ -0,0 +1,124 @@
+import os
+import json
+import numpy as np
+import webdataset as wds
+import pytorch_lightning as pl
+import torch
+from torch.utils.data import Dataset
+from torch.utils.data.distributed import DistributedSampler
+from PIL import Image
+from pathlib import Path
+
+from src.utils.train_util import instantiate_from_config
+
+
+class DataModuleFromConfig(pl.LightningDataModule):
+    def __init__(
+        self,
+        batch_size=8,
+        num_workers=4,
+        train=None,
+        validation=None,
+        test=None,
+        **kwargs,
+    ):
+        super().__init__()
+
+        self.batch_size = batch_size
+        self.num_workers = num_workers
+
+        self.dataset_configs = dict()
+        if train is not None:
+            self.dataset_configs['train'] = train
+        if validation is not None:
+            self.dataset_configs['validation'] = validation
+        if test is not None:
+            self.dataset_configs['test'] = test
+
+    def setup(self, stage):
+
+        if stage in ['fit']:
+            self.datasets = dict((k, instantiate_from_config(self.dataset_configs[k])) for k in self.dataset_configs)
+        else:
+            raise NotImplementedError
+
+    def train_dataloader(self):
+
+        sampler = DistributedSampler(self.datasets['train'])
+        return wds.WebLoader(self.datasets['train'], batch_size=self.batch_size, num_workers=self.num_workers, shuffle=False, sampler=sampler)
+
+    def val_dataloader(self):
+
+        sampler = DistributedSampler(self.datasets['validation'])
+        return wds.WebLoader(self.datasets['validation'], batch_size=4, num_workers=self.num_workers, shuffle=False, sampler=sampler)
+
+    def test_dataloader(self):
+
+        return wds.WebLoader(self.datasets['test'], batch_size=self.batch_size, num_workers=self.num_workers, shuffle=False)
+
+
+class ObjaverseData(Dataset):
+    def __init__(self,
+        root_dir='objaverse/',
+        meta_fname='valid_paths.json',
+        image_dir='rendering_zero123plus',
+        validation=False,
+    ):
+        self.root_dir = Path(root_dir)
+        self.image_dir = image_dir
+
+        with open(os.path.join(root_dir, meta_fname)) as f:
+            lvis_dict = json.load(f)
+        paths = []
+        for k in lvis_dict.keys():
+            paths.extend(lvis_dict[k])
+        self.paths = paths
+
+        total_objects = len(self.paths)
+        if validation:
+            self.paths = self.paths[-16:]  # used last 16 as validation
+        else:
+            self.paths = self.paths[:-16]
+        print('============= length of dataset %d =============' % len(self.paths))
+
+    def __len__(self):
+        return len(self.paths)
+
+    def load_im(self, path, color):
+        pil_img = Image.open(path)
+
+        image = np.asarray(pil_img, dtype=np.float32) / 255.
+        alpha = image[:, :, 3:]
+        image = image[:, :, :3] * alpha + color * (1 - alpha)
+
+        image = torch.from_numpy(image).permute(2, 0, 1).contiguous().float()
+        alpha = torch.from_numpy(alpha).permute(2, 0, 1).contiguous().float()
+        return image, alpha
+
+    def __getitem__(self, index):
+        while True:
+            image_path = os.path.join(self.root_dir, self.image_dir, self.paths[index])
+
+            '''background color, default: white'''
+            bkg_color = [1., 1., 1.]
+
+            img_list = []
+            try:
+                for idx in range(7):
+                    img, alpha = self.load_im(os.path.join(image_path, '%03d.png' % idx), bkg_color)
+                    img_list.append(img)
+
+            except Exception as e:
+                print(e)
+                index = np.random.randint(0, len(self.paths))
+                continue
+
+            break
+
+        imgs = torch.stack(img_list, dim=0).float()
+
+        data = {
+            'cond_imgs': imgs[0],      # (3, H, W)
+            'target_imgs': imgs[1:],   # (6, 3, H, W)
+        }
+        return data
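A minimal usage sketch for the added dataset, assuming a local `objaverse/` tree with `valid_paths.json` and `rendering_zero123plus/<object>/000.png ... 006.png` as the class expects (the data itself is not part of this Space):

```python
from torch.utils.data import DataLoader
from src.data.objaverse_zero123plus import ObjaverseData

# Plain DataLoader here; the DataModule above wraps the same dataset in wds.WebLoader
# with a DistributedSampler for multi-GPU training.
dataset = ObjaverseData(root_dir='objaverse/', validation=False)
loader = DataLoader(dataset, batch_size=2, shuffle=True, num_workers=0)

batch = next(iter(loader))
print(batch['cond_imgs'].shape)    # (B, 3, H, W): the conditioning view (000.png)
print(batch['target_imgs'].shape)  # (B, 6, 3, H, W): the six target views
```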
src/model.py
CHANGED

@@ -295,16 +295,9 @@ class MVRecon(pl.LightningModule):
 
         params = []
 
-
-        for n, p in self.lrm_generator.named_parameters():
-            if 'adaLN_modulation' in n or 'camera_embedder' in n:
-                lrm_params_fast.append(p)
-            else:
-                lrm_params_slow.append(p)
-        params.append({"params": lrm_params_fast, "lr": lr, "weight_decay": 0.01 })
-        params.append({"params": lrm_params_slow, "lr": lr / 10.0, "weight_decay": 0.01 })
+        params.append({"params": self.lrm_generator.parameters(), "lr": lr, "weight_decay": 0.01 })
 
         optimizer = torch.optim.AdamW(params, lr=lr, betas=(0.90, 0.95))
-        scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, 3000, eta_min=lr/…
+        scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, 3000, eta_min=lr/10)
 
         return {'optimizer': optimizer, 'lr_scheduler': scheduler}
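The change above drops the fast/slow parameter split and trains the whole generator as one AdamW group. A self-contained sketch of the resulting optimizer/scheduler setup on a stand-in module (the learning rate is a placeholder; the real value comes from the training config):

```python
import torch
import torch.nn as nn

lrm_generator = nn.Linear(16, 16)  # stand-in for the instantiated LRM generator
lr = 4e-4                          # placeholder; taken from the config in practice

params = [{"params": lrm_generator.parameters(), "lr": lr, "weight_decay": 0.01}]
optimizer = torch.optim.AdamW(params, lr=lr, betas=(0.90, 0.95))
scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, 3000, eta_min=lr / 10)

for step in range(3):
    optimizer.zero_grad()
    lrm_generator(torch.randn(2, 16)).sum().backward()
    optimizer.step()
    scheduler.step()
    print(step, scheduler.get_last_lr())
```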
src/model_mesh.py
CHANGED

@@ -56,7 +56,7 @@ class MVRecon(pl.LightningModule):
                 if 'weight' in k:
                     sd_fc[k.replace('net.', 'net_sdf.')] = -v[0:1]
                 else:
-                    sd_fc[k.replace('net.', 'net_sdf.')] = …
+                    sd_fc[k.replace('net.', 'net_sdf.')] = 10.0 - v[0:1]
                 sd_fc[k.replace('net.', 'net_rgb.')] = v[1:4]
             else:
                 sd_fc[k.replace('net.', 'net_sdf.')] = v
@@ -274,7 +274,7 @@ class MVRecon(pl.LightningModule):
 
         loss_reg = sdf_reg_loss_entropy + flexicubes_surface_reg + flexicubes_weights_reg
 
-        loss = loss_mse + loss_lpips + loss_mask + loss_normal + loss_reg
+        loss = loss_mse + loss_lpips + loss_mask + loss_depth + loss_normal + loss_reg
 
         prefix = 'train'
         loss_dict = {}
src/models/decoder/transformer.py
CHANGED

@@ -53,14 +53,37 @@ class BasicTransformerBlock(nn.Module):
             nn.Dropout(mlp_drop),
         )
 
-    def forward(self, x, cond):
-        # x: [N, L, D]
-        # cond: […
-        …
+    def forward(self, x, cond, i, alpha, content_layers):
+        # x: [N, L, D] or [x1, x2]
+        # cond: [content_feats] or [content_feats, style_feats]
+        if len(cond) == 2:
+            # Style injection mode
+            x1, x2 = x[0], x[1]
+            content, style = cond[0], cond[1]
+            if i <= content_layers:
+                x1 = x1 + self.cross_attn(self.norm1(x1), content, content)[0]
+            else:
+                x1 = x1 + (1-alpha)*self.cross_attn(self.norm1(x1), content, content)[0] + (alpha)*self.cross_attn(self.norm1(x1), style, style)[0]
+            x2 = x2 + self.cross_attn(self.norm1(x2), style, style)[0]
+
+            before_sa1 = self.norm2(x1)
+            before_sa2 = self.norm2(x2)
+            x1 = x1 + self.self_attn(before_sa1, before_sa1, before_sa1)[0]
+            x2 = x2 + self.self_attn(before_sa2, before_sa2, before_sa2)[0]
+
+            x1 = x1 + self.mlp(self.norm3(x1))
+            x2 = x2 + self.mlp(self.norm3(x2))
+
+            return [x1, x2]
+        else:
+            # No style, only content
+            x1 = x[0] if isinstance(x, list) else x
+            content = cond[0]
+            x1 = x1 + self.cross_attn(self.norm1(x1), content, content)[0]
+            before_sa1 = self.norm2(x1)
+            x1 = x1 + self.self_attn(before_sa1, before_sa1, before_sa1)[0]
+            x1 = x1 + self.mlp(self.norm3(x1))
+            return [x1]
 
 
 class TriplaneTransformer(nn.Module):
@@ -98,18 +121,34 @@
         ])
         self.norm = nn.LayerNorm(inner_dim, eps=eps)
         self.deconv = nn.ConvTranspose2d(inner_dim, triplane_dim, kernel_size=2, stride=2, padding=0)
+        self.num_layers = num_layers
 
-    def forward(self, image_feats):
-        # image_feats: […
-
-        N = image_feats.shape[0]
+    def forward(self, image_feats, alpha, style_layers):
+        # image_feats: [content_feats] or [content_feats, style_feats]
+        N = image_feats[0].shape[0]
         H = W = self.triplane_low_res
         L = 3 * H * W
-
+        content_layers = self.num_layers - style_layers
         x = self.pos_embed.repeat(N, 1, 1)  # [N, L, D]
-        …
-        …
-        …
+        i = 1
+        if len(image_feats) == 2:
+            # Style injection mode
+            for layer in self.layers:
+                if i == 1:
+                    x = layer([x, x], image_feats, i, alpha, content_layers)
+                else:
+                    x = layer(x, image_feats, i, alpha, content_layers)
+                i += 1
+            x = self.norm(x[0])
+        else:
+            # No style, only content
+            for layer in self.layers:
+                if i == 1:
+                    x = layer([x], image_feats, i, alpha, content_layers)
+                else:
+                    x = layer(x, image_feats, i, alpha, content_layers)
+                i += 1
+            x = self.norm(x[0])
 
         # separate each plane and apply deconv
         x = x.view(N, 3, H, W, -1)
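The key idea in the block above: for the first `content_layers = num_layers - style_layers` layers the triplane tokens attend only to content features, and in the remaining layers the content cross-attention output is linearly blended with a style cross-attention output using `alpha`. A stripped-down sketch of that blended update with a plain `nn.MultiheadAttention` stand-in (dimensions are illustrative; the real block also keeps a parallel style branch `x2` and the self-attention/MLP sublayers):

```python
import torch
import torch.nn as nn

D = 64
attn = nn.MultiheadAttention(D, num_heads=4, batch_first=True)  # stand-in for self.cross_attn
norm = nn.LayerNorm(D)                                          # stand-in for self.norm1

x1 = torch.randn(1, 3 * 8 * 8, D)    # triplane tokens (pos_embed-initialized in the real model)
content = torch.randn(1, 60, D)      # multi-view content features
style = torch.randn(1, 10, D)        # style-image features
alpha = 0.7

# Deep layers (i > content_layers): blend content and style cross-attention.
x1 = x1 + (1 - alpha) * attn(norm(x1), content, content)[0] \
        + alpha * attn(norm(x1), style, style)[0]
print(x1.shape)  # torch.Size([1, 192, 64])
```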
src/models/lrm.py
CHANGED

@@ -18,6 +18,7 @@ import torch.nn as nn
 import mcubes
 import nvdiffrast.torch as dr
 from einops import rearrange, repeat
+from PIL import Image
 
 from .encoder.dino_wrapper import DinoWrapper
 from .decoder.transformer import TriplaneTransformer
@@ -65,19 +66,46 @@ class InstantNeRF(nn.Module):
             samples_per_ray=rendering_samples_per_ray,
         )
 
-    def forward_planes(self, images, cameras):
+    def forward_planes(self, images, cameras, style, alpha, style_layers):
         # images: [B, V, C_img, H_img, W_img]
         # cameras: [B, V, 16]
         B = images.shape[0]
+        style_feats = None
+
+        if style is not None:
+            style_img = np.asarray(Image.open(style), dtype=np.float32) / 255.0
+            if style_img.ndim == 2:  # Handle depth image
+                style_img = np.stack([style_img] * 3, axis=-1)
+            style_img = torch.from_numpy(style_img).permute(2, 0, 1).contiguous().float()
+            style_img = torch.nn.functional.interpolate(
+                style_img.unsqueeze(0), size=(320, 320), mode='bilinear', align_corners=False
+            )  # Shape: [1, 3, 320, 320]
+            style_img = style_img.unsqueeze(1)
+            style_img = style_img.to(images.device)  # torch.Size([1, 1, 3, 320, 320])
+            if style_img.shape[2] == 4:  # Check if there are 4 channels
+                style_img = style_img[:, :, :3, :, :]
+            style_feats = self.encoder(style_img, cameras[:, :1, :])  # torch.Size([6, 401, 768]) cameras:torch.Size([1, 6, 16])
+            style_feats = rearrange(style_feats, '(b v) l d -> b (v l) d', b=B)
 
         # encode images
         image_feats = self.encoder(images, cameras)
         image_feats = rearrange(image_feats, '(b v) l d -> b (v l) d', b=B)
 
         # transformer generating planes
-        …
+        if style_feats is not None:
+            planes = self.transformer([image_feats, style_feats], alpha, style_layers)
+        else:
+            planes = self.transformer([image_feats], alpha, style_layers)
 
         return planes
+
+    def forward_synthesizer(self, planes, render_cameras, render_size: int):
+        render_results = self.synthesizer(
+            planes,
+            render_cameras,
+            render_size,
+        )
+        return render_results
 
     def forward(self, images, cameras, render_cameras, render_size: int):
         # images: [B, V, C_img, H_img, W_img]
@@ -125,7 +153,12 @@ class InstantNeRF(nn.Module):
             sample_tex_pose_list.append(tex_pos_one_shape)
         tex_pos = torch.cat(sample_tex_pose_list, dim=0)
 
-        tex_feat = …
+        tex_feat = torch.utils.checkpoint.checkpoint(
+            self.synthesizer.forward_points,
+            planes,
+            tex_pos,
+            use_reentrant=False,
+        )['rgb']
 
         if hard_mask is not None:
             final_tex_feat = torch.zeros(
src/models/lrm_mesh.py
CHANGED

@@ -17,6 +17,7 @@ import torch
 import torch.nn as nn
 import nvdiffrast.torch as dr
 from einops import rearrange, repeat
+from PIL import Image
 
 from .encoder.dino_wrapper import DinoWrapper
 from .decoder.transformer import TriplaneTransformer
@@ -74,12 +75,9 @@ class InstantMesh(nn.Module):
             samples_per_ray=rendering_samples_per_ray,
         )
 
-    def init_flexicubes_geometry(self, device, fovy=50.0…
+    def init_flexicubes_geometry(self, device, fovy=50.0):
         camera = PerspectiveCamera(fovy=fovy, device=device)
-        …
-            renderer = NeuralRender(device, camera_model=camera)
-        else:
-            renderer = None
+        renderer = NeuralRender(device, camera_model=camera)
         self.geometry = FlexiCubesGeometry(
             grid_res=self.grid_res,
             scale=self.grid_scale,
@@ -88,17 +86,36 @@ class InstantMesh(nn.Module):
             device=device,
         )
 
-    def forward_planes(self, images, cameras):
+    def forward_planes(self, images, cameras, style, alpha, style_layers):
         # images: [B, V, C_img, H_img, W_img]
         # cameras: [B, V, 16]
         B = images.shape[0]
+        style_feats = None
+
+        if style is not None:
+            style_img = np.asarray(Image.open(style), dtype=np.float32) / 255.0
+            if style_img.ndim == 2:  # Handle depth image
+                style_img = np.stack([style_img] * 3, axis=-1)
+            style_img = torch.from_numpy(style_img).permute(2, 0, 1).contiguous().float()
+            style_img = torch.nn.functional.interpolate(
+                style_img.unsqueeze(0), size=(320, 320), mode='bilinear', align_corners=False
+            )  # Shape: [1, 3, 320, 320]
+            style_img = style_img.unsqueeze(1)
+            style_img = style_img.to(images.device)  # torch.Size([1, 1, 3, 320, 320])
+            if style_img.shape[2] == 4:  # Check if there are 4 channels
+                style_img = style_img[:, :, :3, :, :]
+            style_feats = self.encoder(style_img, cameras[:, :1, :])  # torch.Size([6, 401, 768]) cameras:torch.Size([1, 6, 16])
+            style_feats = rearrange(style_feats, '(b v) l d -> b (v l) d', b=B)
 
         # encode images
         image_feats = self.encoder(images, cameras)
         image_feats = rearrange(image_feats, '(b v) l d -> b (v l) d', b=B)
 
-        # …
-        …
+        # transformer generating planes
+        if style_feats is not None:
+            planes = self.transformer([image_feats, style_feats], alpha, style_layers)
+        else:
+            planes = self.transformer([image_feats], alpha, style_layers)
 
         return planes
 
src/utils/infer_util.py
CHANGED

@@ -81,4 +81,17 @@ def images_to_video(
         assert frame.min() >= 0 and frame.max() <= 255, \
             f"Frame value out of range: {frame.min()} ~ {frame.max()}"
         frames.append(frame)
-    imageio.mimwrite(output_path, np.stack(frames), fps=fps, quality=10)
+    imageio.mimwrite(output_path, np.stack(frames), fps=fps, quality=10)
+
+
+def save_video(
+    frames: torch.Tensor,
+    output_path: str,
+    fps: int = 30,
+) -> None:
+    # images: (N, C, H, W)
+    frames = [(frame.permute(1, 2, 0).cpu().numpy() * 255).astype(np.uint8) for frame in frames]
+    writer = imageio.get_writer(output_path, fps=fps)
+    for frame in frames:
+        writer.append_data(frame)
+    writer.close()
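A quick usage sketch for the new `save_video` helper (a random tensor stands in for rendered frames; writing .mp4 relies on the `imageio[ffmpeg]` dependency already listed in requirements.txt):

```python
import torch
from src.utils.infer_util import save_video

frames = torch.rand(30, 3, 256, 256)   # (N, C, H, W) float frames in [0, 1]
save_video(frames, "preview.mp4", fps=30)
```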