text-to-3d-2.0

Starting

App Files Files Community

mboss committed on Jan 2

Commit

4d8c3d6

1 Parent(s): 64fccd8

Update inference to latest

Browse files

Files changed (6) hide show

__init__.py +7 -2
gradio_app.py +1 -1
run.py +12 -2
spar3d/models/network.py +5 -2
spar3d/system.py +242 -47
spar3d/utils.py +1 -1

__init__.py CHANGED Viewed

@@ -29,14 +29,19 @@ class SPAR3DLoader:
     @classmethod
     def INPUT_TYPES(cls):
-        return {"required": {}}
-    def load(self):
         device = comfy.model_management.get_torch_device()
         model = SPAR3D.from_pretrained(
             SPAR3D_MODEL_NAME,
             config_name="config.yaml",
             weight_name="model.safetensors",
         )
         model.to(device)
         model.eval()

     @classmethod
     def INPUT_TYPES(cls):
+        return {
+            "required": {
+                "low_vram_mode": ("BOOLEAN", {"default": False}),
+            }
+        }
+    def load(self, low_vram_mode=False):
         device = comfy.model_management.get_torch_device()
         model = SPAR3D.from_pretrained(
             SPAR3D_MODEL_NAME,
             config_name="config.yaml",
             weight_name="model.safetensors",
+            low_vram_mode=low_vram_mode,
         )
         model.to(device)
         model.eval()

gradio_app.py CHANGED Viewed

@@ -148,7 +148,7 @@ def run_model(
     start = time.time()
     with torch.no_grad():
         with (
-            torch.autocast(device_type=device, dtype=torch.float16)
             if "cuda" in device
             else nullcontext()
         ):

     start = time.time()
     with torch.no_grad():
         with (
+            torch.autocast(device_type=device, dtype=torch.bfloat16)
             if "cuda" in device
             else nullcontext()
         ):

run.py CHANGED Viewed

@@ -54,6 +54,15 @@ if __name__ == "__main__":
         type=int,
         help="Texture atlas resolution. Default: 1024",
     )
     remesh_choices = ["none"]
     if TRIANGLE_REMESH_AVAILABLE:
@@ -102,6 +111,7 @@ if __name__ == "__main__":
         args.pretrained_model,
         config_name="config.yaml",
         weight_name="model.safetensors",
     )
     model.to(device)
     model.eval()
@@ -149,7 +159,7 @@ if __name__ == "__main__":
             torch.cuda.reset_peak_memory_stats()
         with torch.no_grad():
             with (
-                torch.autocast(device_type=device, dtype=torch.float16)
                 if "cuda" in device
                 else nullcontext()
             ):
@@ -157,7 +167,7 @@ if __name__ == "__main__":
                     image,
                     bake_resolution=args.texture_resolution,
                     remesh=args.remesh_option,
-                    vertex_count=args.target_vertex_count,
                     return_points=True,
                 )
         if torch.cuda.is_available():

         type=int,
         help="Texture atlas resolution. Default: 1024",
     )
+    parser.add_argument(
+        "--low-vram-mode",
+        action="store_true",
+        help=(
+            "Use low VRAM mode. SPAR3D consumes 10.5GB of VRAM by default. "
+            "This mode will reduce the VRAM consumption to roughly 7GB but in exchange "
+            "the model will be slower. Default: False"
+        ),
+    )
     remesh_choices = ["none"]
     if TRIANGLE_REMESH_AVAILABLE:
         args.pretrained_model,
         config_name="config.yaml",
         weight_name="model.safetensors",
+        low_vram_mode=args.low_vram_mode,
     )
     model.to(device)
     model.eval()
             torch.cuda.reset_peak_memory_stats()
         with torch.no_grad():
             with (
+                torch.autocast(device_type=device, dtype=torch.bfloat16)
                 if "cuda" in device
                 else nullcontext()
             ):
                     image,
                     bake_resolution=args.texture_resolution,
                     remesh=args.remesh_option,
+                    vertex_count=vertex_count,
                     return_points=True,
                 )
         if torch.cuda.is_available():

spar3d/models/network.py CHANGED Viewed

@@ -7,8 +7,8 @@ import torch.nn.functional as F
 from einops import rearrange
 from jaxtyping import Float
 from torch import Tensor
 from torch.autograd import Function
-from torch.cuda.amp import custom_bwd, custom_fwd
 from spar3d.models.utils import BaseModule, normalize
 from spar3d.utils import get_device
@@ -79,7 +79,10 @@ class _TruncExp(Function):  # pylint: disable=abstract-method
     # https://github.com/ashawkey/torch-ngp/blob/93b08a0d4ec1cc6e69d85df7f0acdfb99603b628/activation.py
     @staticmethod
     @conditional_decorator(
-        custom_fwd, "cuda" in get_device(), cast_inputs=torch.float32
     )
     def forward(ctx, x):  # pylint: disable=arguments-differ
         ctx.save_for_backward(x)

 from einops import rearrange
 from jaxtyping import Float
 from torch import Tensor
+from torch.amp import custom_bwd, custom_fwd
 from torch.autograd import Function
 from spar3d.models.utils import BaseModule, normalize
 from spar3d.utils import get_device
     # https://github.com/ashawkey/torch-ngp/blob/93b08a0d4ec1cc6e69d85df7f0acdfb99603b628/activation.py
     @staticmethod
     @conditional_decorator(
+        custom_fwd,
+        "cuda" in get_device(),
+        cast_inputs=torch.float32,
+        device_type="cuda",
     )
     def forward(ctx, x):  # pylint: disable=arguments-differ
         ctx.save_for_backward(x)

spar3d/system.py CHANGED Viewed

@@ -12,7 +12,7 @@ from huggingface_hub import hf_hub_download
 from jaxtyping import Float
 from omegaconf import OmegaConf
 from PIL import Image
-from safetensors.torch import load_model
 from torch import Tensor
 from spar3d.models.diffusion.gaussian_diffusion import (
@@ -115,11 +115,17 @@ class SPAR3D(BaseModule):
         sigma_max: float = 120.0
         s_churn: float = 3.0
     cfg: Config
     @classmethod
     def from_pretrained(
-        cls, pretrained_model_name_or_path: str, config_name: str, weight_name: str
     ):
         base_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
         if os.path.isdir(os.path.join(base_dir, pretrained_model_name_or_path)):
@@ -139,8 +145,18 @@ class SPAR3D(BaseModule):
         cfg = OmegaConf.load(config_path)
         OmegaConf.resolve(cfg)
         model = cls(cfg)
-        load_model(model, weight_path, strict=False)
         return model
     @property
@@ -148,39 +164,52 @@ class SPAR3D(BaseModule):
         return next(self.parameters()).device
     def configure(self):
-        self.image_tokenizer = find_class(self.cfg.image_tokenizer_cls)(
-            self.cfg.image_tokenizer
-        )
-        self.point_embedder = find_class(self.cfg.point_embedder_cls)(
-            self.cfg.point_embedder
-        )
-        self.tokenizer = find_class(self.cfg.tokenizer_cls)(self.cfg.tokenizer)
-        self.camera_embedder = find_class(self.cfg.camera_embedder_cls)(
-            self.cfg.camera_embedder
-        )
-        self.backbone = find_class(self.cfg.backbone_cls)(self.cfg.backbone)
-        self.post_processor = find_class(self.cfg.post_processor_cls)(
-            self.cfg.post_processor
-        )
-        self.decoder = find_class(self.cfg.decoder_cls)(self.cfg.decoder)
-        self.image_estimator = find_class(self.cfg.image_estimator_cls)(
-            self.cfg.image_estimator
-        )
-        self.global_estimator = find_class(self.cfg.global_estimator_cls)(
-            self.cfg.global_estimator
-        )
-        # point diffusion modules
-        self.pdiff_image_tokenizer = find_class(self.cfg.pdiff_image_tokenizer_cls)(
-            self.cfg.pdiff_image_tokenizer
-        )
-        self.pdiff_camera_embedder = find_class(self.cfg.pdiff_camera_embedder_cls)(
-            self.cfg.pdiff_camera_embedder
         )
-        self.pdiff_backbone = find_class(self.cfg.pdiff_backbone_cls)(
-            self.cfg.pdiff_backbone
         )
         self.bbox: Float[Tensor, "2 3"]
         self.register_buffer(
             "bbox",
@@ -206,30 +235,151 @@ class SPAR3D(BaseModule):
         self.baker = TextureBaker()
         self.image_processor = ImageProcessor()
-        channel_scales = [self.cfg.scale_factor_xyz] * 3
-        channel_scales += [self.cfg.scale_factor_rgb] * 3
-        channel_biases = [self.cfg.bias_xyz] * 3
-        channel_biases += [self.cfg.bias_rgb] * 3
-        channel_scales = np.array(channel_scales)
-        channel_biases = np.array(channel_biases)
-        betas = get_named_beta_schedule(
-            self.cfg.diffu_sched, self.cfg.train_time_steps, self.cfg.diffu_sched_exp
         )
-        diffusion_kwargs = dict(
-            betas=betas,
-            model_mean_type=self.cfg.mean_type,
-            model_var_type=self.cfg.var_type,
-            channel_scales=channel_scales,
-            channel_biases=channel_biases,
         )
         self.diffusion_spaced = SpacedDiffusion(
             use_timesteps=space_timesteps(
                 self.cfg.train_time_steps,
                 "ddim" + str(self.cfg.inference_time_steps),
             ),
-            **diffusion_kwargs,
         )
         self.sampler = PointCloudSampler(
             model=self.pdiff_backbone,
@@ -243,6 +393,35 @@ class SPAR3D(BaseModule):
             s_churn=self.cfg.s_churn,
         )
     def triplane_to_meshes(
         self, triplanes: Float[Tensor, "B 3 Cp Hp Wp"]
     ) -> list[Mesh]:
@@ -303,6 +482,11 @@ class SPAR3D(BaseModule):
         return out
     def get_scene_codes(self, batch) -> Float[Tensor, "B 3 C H W"]:
         # if batch[rgb_cond] is only one view, add a view dimension
         if len(batch["rgb_cond"].shape) == 4:
             batch["rgb_cond"] = batch["rgb_cond"].unsqueeze(1)
@@ -340,9 +524,15 @@ class SPAR3D(BaseModule):
         direct_codes = self.tokenizer.detokenize(tokens)
         scene_codes = self.post_processor(direct_codes)
         return scene_codes, direct_codes
     def forward_pdiff_cond(self, batch: Dict[str, Any]) -> Dict[str, Any]:
         if len(batch["rgb_cond"].shape) == 4:
             batch["rgb_cond"] = batch["rgb_cond"].unsqueeze(1)
             batch["mask_cond"] = batch["mask_cond"].unsqueeze(1)
@@ -512,6 +702,11 @@ class SPAR3D(BaseModule):
         output_rotation = rotation2 @ rotation
         global_dict = {}
         if self.image_estimator is not None:
             global_dict.update(
                 self.image_estimator(

 from jaxtyping import Float
 from omegaconf import OmegaConf
 from PIL import Image
+from safetensors.torch import load_file, load_model
 from torch import Tensor
 from spar3d.models.diffusion.gaussian_diffusion import (
         sigma_max: float = 120.0
         s_churn: float = 3.0
+        low_vram_mode: bool = False
     cfg: Config
     @classmethod
     def from_pretrained(
+        cls,
+        pretrained_model_name_or_path: str,
+        config_name: str,
+        weight_name: str,
+        low_vram_mode: bool = False,
     ):
         base_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
         if os.path.isdir(os.path.join(base_dir, pretrained_model_name_or_path)):
         cfg = OmegaConf.load(config_path)
         OmegaConf.resolve(cfg)
+        # Add in low_vram_mode to the config
+        if os.environ.get("SPAR3D_LOW_VRAM", "0") == "1" and torch.cuda.is_available():
+            cfg.low_vram_mode = True
+        else:
+            cfg.low_vram_mode = low_vram_mode if torch.cuda.is_available() else False
         model = cls(cfg)
+        if not model.cfg.low_vram_mode:
+            load_model(model, weight_path, strict=False)
+        else:
+            model._state_dict = load_file(weight_path, device="cpu")
         return model
     @property
         return next(self.parameters()).device
     def configure(self):
+        # Initialize all modules as None
+        self.image_tokenizer = None
+        self.point_embedder = None
+        self.tokenizer = None
+        self.camera_embedder = None
+        self.backbone = None
+        self.post_processor = None
+        self.decoder = None
+        self.image_estimator = None
+        self.global_estimator = None
+        self.pdiff_image_tokenizer = None
+        self.pdiff_camera_embedder = None
+        self.pdiff_backbone = None
+        self.diffusion_spaced = None
+        self.sampler = None
+        # Dummy parameter to save the device placement for dynamic loading
+        self.dummy_param = torch.nn.Parameter(torch.tensor(0.0))
+        channel_scales = [self.cfg.scale_factor_xyz] * 3
+        channel_scales += [self.cfg.scale_factor_rgb] * 3
+        channel_biases = [self.cfg.bias_xyz] * 3
+        channel_biases += [self.cfg.bias_rgb] * 3
+        channel_scales = np.array(channel_scales)
+        channel_biases = np.array(channel_biases)
+        betas = get_named_beta_schedule(
+            self.cfg.diffu_sched, self.cfg.train_time_steps, self.cfg.diffu_sched_exp
         )
+        self.diffusion_kwargs = dict(
+            betas=betas,
+            model_mean_type=self.cfg.mean_type,
+            model_var_type=self.cfg.var_type,
+            channel_scales=channel_scales,
+            channel_biases=channel_biases,
         )
+        self.is_low_vram = self.cfg.low_vram_mode and get_device() == "cuda"
+        # Load every module up front unless low VRAM mode is active
+        if not self.is_low_vram:
+            self._load_all_modules()
+        else:
+            print("Loading in low VRAM mode")
         self.bbox: Float[Tensor, "2 3"]
         self.register_buffer(
             "bbox",
         self.baker = TextureBaker()
         self.image_processor = ImageProcessor()
+    def _load_all_modules(self):
+        """Load all modules into memory"""
+        # Load modules to specified device
+        self.image_tokenizer = find_class(self.cfg.image_tokenizer_cls)(
+            self.cfg.image_tokenizer
+        ).to(self.device)
+        self.point_embedder = find_class(self.cfg.point_embedder_cls)(
+            self.cfg.point_embedder
+        ).to(self.device)
+        self.tokenizer = find_class(self.cfg.tokenizer_cls)(self.cfg.tokenizer).to(
+            self.device
+        )
+        self.camera_embedder = find_class(self.cfg.camera_embedder_cls)(
+            self.cfg.camera_embedder
+        ).to(self.device)
+        self.backbone = find_class(self.cfg.backbone_cls)(self.cfg.backbone).to(
+            self.device
+        )
+        self.post_processor = find_class(self.cfg.post_processor_cls)(
+            self.cfg.post_processor
+        ).to(self.device)
+        self.decoder = find_class(self.cfg.decoder_cls)(self.cfg.decoder).to(
+            self.device
+        )
+        self.image_estimator = find_class(self.cfg.image_estimator_cls)(
+            self.cfg.image_estimator
+        ).to(self.device)
+        self.global_estimator = find_class(self.cfg.global_estimator_cls)(
+            self.cfg.global_estimator
+        ).to(self.device)
+        self.pdiff_image_tokenizer = find_class(self.cfg.pdiff_image_tokenizer_cls)(
+            self.cfg.pdiff_image_tokenizer
+        ).to(self.device)
+        self.pdiff_camera_embedder = find_class(self.cfg.pdiff_camera_embedder_cls)(
+            self.cfg.pdiff_camera_embedder
+        ).to(self.device)
+        self.pdiff_backbone = find_class(self.cfg.pdiff_backbone_cls)(
+            self.cfg.pdiff_backbone
+        ).to(self.device)
+        self.diffusion_spaced = SpacedDiffusion(
+            use_timesteps=space_timesteps(
+                self.cfg.train_time_steps,
+                "ddim" + str(self.cfg.inference_time_steps),
+            ),
+            **self.diffusion_kwargs,
+        )
+        self.sampler = PointCloudSampler(
+            model=self.pdiff_backbone,
+            diffusion=self.diffusion_spaced,
+            num_points=512,
+            point_dim=6,
+            guidance_scale=self.cfg.guidance_scale,
+            clip_denoised=True,
+            sigma_min=1e-3,
+            sigma_max=self.cfg.sigma_max,
+            s_churn=self.cfg.s_churn,
         )
+    def _load_main_modules(self):
+        """Load the main processing modules"""
+        if all(
+            [
+                self.image_tokenizer,
+                self.point_embedder,
+                self.tokenizer,
+                self.camera_embedder,
+                self.backbone,
+                self.post_processor,
+                self.decoder,
+            ]
+        ):
+            return  # Main modules already loaded
+        device = next(self.parameters()).device  # Get the current device
+        self.image_tokenizer = find_class(self.cfg.image_tokenizer_cls)(
+            self.cfg.image_tokenizer
+        ).to(device)
+        self.point_embedder = find_class(self.cfg.point_embedder_cls)(
+            self.cfg.point_embedder
+        ).to(device)
+        self.tokenizer = find_class(self.cfg.tokenizer_cls)(self.cfg.tokenizer).to(
+            device
         )
+        self.camera_embedder = find_class(self.cfg.camera_embedder_cls)(
+            self.cfg.camera_embedder
+        ).to(device)
+        self.backbone = find_class(self.cfg.backbone_cls)(self.cfg.backbone).to(device)
+        self.post_processor = find_class(self.cfg.post_processor_cls)(
+            self.cfg.post_processor
+        ).to(device)
+        self.decoder = find_class(self.cfg.decoder_cls)(self.cfg.decoder).to(device)
+        # Restore weights if we have a checkpoint path
+        if hasattr(self, "_state_dict"):
+            self.load_state_dict(self._state_dict, strict=False)
+    def _load_estimator_modules(self):
+        """Load the estimator modules"""
+        if all([self.image_estimator, self.global_estimator]):
+            return  # Estimator modules already loaded
+        device = next(self.parameters()).device  # Get the current device
+        self.image_estimator = find_class(self.cfg.image_estimator_cls)(
+            self.cfg.image_estimator
+        ).to(device)
+        self.global_estimator = find_class(self.cfg.global_estimator_cls)(
+            self.cfg.global_estimator
+        ).to(device)
+        # Restore weights if we have a checkpoint path
+        if hasattr(self, "_state_dict"):
+            self.load_state_dict(self._state_dict, strict=False)
+    def _load_pdiff_modules(self):
+        """Load only the point diffusion modules"""
+        if all(
+            [
+                self.pdiff_image_tokenizer,
+                self.pdiff_camera_embedder,
+                self.pdiff_backbone,
+            ]
+        ):
+            return  # PDiff modules already loaded
+        device = next(self.parameters()).device  # Get the current device
+        self.pdiff_image_tokenizer = find_class(self.cfg.pdiff_image_tokenizer_cls)(
+            self.cfg.pdiff_image_tokenizer
+        ).to(device)
+        self.pdiff_camera_embedder = find_class(self.cfg.pdiff_camera_embedder_cls)(
+            self.cfg.pdiff_camera_embedder
+        ).to(device)
+        self.pdiff_backbone = find_class(self.cfg.pdiff_backbone_cls)(
+            self.cfg.pdiff_backbone
+        ).to(device)
         self.diffusion_spaced = SpacedDiffusion(
             use_timesteps=space_timesteps(
                 self.cfg.train_time_steps,
                 "ddim" + str(self.cfg.inference_time_steps),
             ),
+            **self.diffusion_kwargs,
         )
         self.sampler = PointCloudSampler(
             model=self.pdiff_backbone,
             s_churn=self.cfg.s_churn,
         )
+        # Restore weights if we have a checkpoint path
+        if hasattr(self, "_state_dict"):
+            self.load_state_dict(self._state_dict, strict=False)
+    def _unload_pdiff_modules(self):
+        """Unload point diffusion modules to free memory"""
+        self.pdiff_image_tokenizer = None
+        self.pdiff_camera_embedder = None
+        self.pdiff_backbone = None
+        self.diffusion_spaced = None
+        self.sampler = None
+        torch.cuda.empty_cache()
+    def _unload_main_modules(self):
+        """Unload main processing modules to free memory"""
+        self.image_tokenizer = None
+        self.point_embedder = None
+        self.tokenizer = None
+        self.camera_embedder = None
+        self.backbone = None
+        self.post_processor = None
+        torch.cuda.empty_cache()
+    def _unload_estimator_modules(self):
+        """Unload estimator modules to free memory"""
+        self.image_estimator = None
+        self.global_estimator = None
+        torch.cuda.empty_cache()
     def triplane_to_meshes(
         self, triplanes: Float[Tensor, "B 3 Cp Hp Wp"]
     ) -> list[Mesh]:
         return out
     def get_scene_codes(self, batch) -> Float[Tensor, "B 3 C H W"]:
+        if self.is_low_vram:
+            self._unload_pdiff_modules()
+            self._unload_estimator_modules()
+            self._load_main_modules()
         # if batch[rgb_cond] is only one view, add a view dimension
         if len(batch["rgb_cond"].shape) == 4:
             batch["rgb_cond"] = batch["rgb_cond"].unsqueeze(1)
         direct_codes = self.tokenizer.detokenize(tokens)
         scene_codes = self.post_processor(direct_codes)
         return scene_codes, direct_codes
     def forward_pdiff_cond(self, batch: Dict[str, Any]) -> Dict[str, Any]:
+        if self.is_low_vram:
+            self._unload_main_modules()
+            self._unload_estimator_modules()
+            self._load_pdiff_modules()
         if len(batch["rgb_cond"].shape) == 4:
             batch["rgb_cond"] = batch["rgb_cond"].unsqueeze(1)
             batch["mask_cond"] = batch["mask_cond"].unsqueeze(1)
         output_rotation = rotation2 @ rotation
         global_dict = {}
+        if self.is_low_vram:
+            self._unload_pdiff_modules()
+            self._unload_main_modules()
+            self._load_estimator_modules()
         if self.image_estimator is not None:
             global_dict.update(
                 self.image_estimator(

spar3d/utils.py CHANGED Viewed

@@ -10,7 +10,7 @@ import spar3d.models.utils as spar3d_utils
 def get_device():
-    if os.environ.get("SF3D_USE_CPU", "0") == "1":
         return "cpu"
     device = "cpu"

 def get_device():
+    if os.environ.get("SPAR3D_USE_CPU", "0") == "1":
         return "cpu"
     device = "cpu"