misc update
- loss.py +0 -11
- main_nerf.py +14 -5
- nerf/gui.py +3 -3
- nerf/network.py +5 -14
- nerf/network_grid.py +2 -7
- nerf/network_tcnn.py +0 -5
- nerf/provider.py +36 -26
- nerf/renderer.py +6 -5
- nerf/utils.py +5 -5
- readme.md +40 -4
loss.py
DELETED
@@ -1,11 +0,0 @@
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-
-def mape_loss(pred, target):
-    # pred, target: [B, 1], torch tensor
-    difference = (pred - target).abs()
-    scale = 1 / (target.abs() + 1e-2)
-    loss = difference * scale
-
-    return loss.mean()
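For reference, the removed `mape_loss` is a relative-L1 (MAPE-style) error; a tiny standalone check of its behaviour, with toy tensors that are not part of the repo:

```python
import torch

# The same absolute error is weighted by 1 / (|target| + 1e-2),
# so it counts more for small targets than for large ones.
pred = torch.tensor([[1.1], [10.1]])
target = torch.tensor([[1.0], [10.0]])
difference = (pred - target).abs()
scale = 1 / (target.abs() + 1e-2)
print((difference * scale).mean())  # ~0.054: (0.1/1.01 + 0.1/10.01) / 2
```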
main_nerf.py
CHANGED
@@ -14,6 +14,7 @@ if __name__ == '__main__':
     parser = argparse.ArgumentParser()
     parser.add_argument('--text', default=None, help="text prompt")
     parser.add_argument('-O', action='store_true', help="equals --fp16 --cuda_ray --dir_text")
+    parser.add_argument('-O2', action='store_true', help="equals --fp16 --dir_text")
     parser.add_argument('--test', action='store_true', help="test mode")
     parser.add_argument('--workspace', type=str, default='workspace')
     parser.add_argument('--guidance', type=str, default='stable-diffusion', help='choose from [stable-diffusion, clip]')
@@ -37,8 +38,8 @@ if __name__ == '__main__':
     parser.add_argument('--fp16', action='store_true', help="use amp mixed precision training")
     parser.add_argument('--backbone', type=str, default='grid', help="nerf backbone, choose from [grid, tcnn, vanilla]")
     # rendering resolution in training
-    parser.add_argument('--w', type=int, default=
-    parser.add_argument('--h', type=int, default=
+    parser.add_argument('--w', type=int, default=128, help="render width for NeRF in training")
+    parser.add_argument('--h', type=int, default=128, help="render height for NeRF in training")

     ### dataset options
     parser.add_argument('--bound', type=float, default=1, help="assume the scene is bounded in box(-bound, bound)")
@@ -47,6 +48,11 @@ if __name__ == '__main__':
     parser.add_argument('--radius_range', type=float, nargs='*', default=[1.0, 1.5], help="training camera radius range")
     parser.add_argument('--fovy_range', type=float, nargs='*', default=[40, 70], help="training camera fovy range")
     parser.add_argument('--dir_text', action='store_true', help="direction-encode the text prompt, by appending front/side/back/overhead view")
+    parser.add_argument('--angle_overhead', type=float, default=30, help="[0, angle_overhead] is the overhead region")
+    parser.add_argument('--angle_front', type=float, default=30, help="[0, angle_front] is the front region, [180, 180+angle_front] the back region, otherwise the side region.")
+
+    parser.add_argument('--lambda_entropy', type=float, default=1e-4, help="loss scale for alpha entropy")
+    parser.add_argument('--lambda_orient', type=float, default=1e-2, help="loss scale for orientation")

     ### GUI options
     parser.add_argument('--gui', action='store_true', help="start a GUI")
@@ -54,8 +60,8 @@ if __name__ == '__main__':
     parser.add_argument('--H', type=int, default=800, help="GUI height")
     parser.add_argument('--radius', type=float, default=3, help="default GUI camera radius from center")
     parser.add_argument('--fovy', type=float, default=60, help="default GUI camera fovy")
-    parser.add_argument('--light_theta', type=float, default=60, help="default GUI light direction")
-    parser.add_argument('--light_phi', type=float, default=0, help="default GUI light direction")
+    parser.add_argument('--light_theta', type=float, default=60, help="default GUI light direction in [0, 180], corresponding to elevation [90, -90]")
+    parser.add_argument('--light_phi', type=float, default=0, help="default GUI light direction in [0, 360), azimuth")
     parser.add_argument('--max_spp', type=int, default=1, help="GUI rendering max sample per pixel")

     opt = parser.parse_args()
@@ -64,7 +70,10 @@ if __name__ == '__main__':
         opt.fp16 = True
         opt.cuda_ray = True
         opt.dir_text = True
-
+    elif opt.O2:
+        opt.fp16 = True
+        opt.dir_text = True
+
     if opt.backbone == 'vanilla':
         from nerf.network import NeRFNetwork
     elif opt.backbone == 'tcnn':
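A minimal sketch of how the new `-O2` shortcut expands compared to `-O`, mirroring the flag logic in the diff above (only the relevant arguments are reproduced):

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('-O', action='store_true')    # fp16 + cuda_ray + dir_text
parser.add_argument('-O2', action='store_true')   # fp16 + dir_text (cuda_ray left off)
parser.add_argument('--fp16', action='store_true')
parser.add_argument('--cuda_ray', action='store_true')
parser.add_argument('--dir_text', action='store_true')
opt = parser.parse_args(['-O2'])

if opt.O:
    opt.fp16 = opt.cuda_ray = opt.dir_text = True
elif opt.O2:
    opt.fp16 = opt.dir_text = True

print(opt.fp16, opt.cuda_ray, opt.dir_text)  # True False True
```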
nerf/gui.py
CHANGED
@@ -34,14 +34,14 @@ class OrbitCamera:
     # intrinsics
     @property
     def intrinsics(self):
-        focal = self.H / (2 * np.tan(np.
+        focal = self.H / (2 * np.tan(np.deg2rad(self.fovy) / 2))
         return np.array([focal, focal, self.W // 2, self.H // 2])

     def orbit(self, dx, dy):
         # rotate along camera up/side axis!
         side = self.rot.as_matrix()[:3, 0] # why this is side --> ? # already normalized.
-        rotvec_x = self.up * np.
-        rotvec_y = side * np.
+        rotvec_x = self.up * np.deg2rad(-0.1 * dx)
+        rotvec_y = side * np.deg2rad(-0.1 * dy)
         self.rot = R.from_rotvec(rotvec_x) * R.from_rotvec(rotvec_y) * self.rot

     def scale(self, delta):
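The intrinsics fix converts the GUI fovy from degrees before taking the tangent; a quick standalone check of the formula with illustrative values:

```python
import numpy as np

H, fovy = 800, 60.0  # image height in pixels, vertical field of view in degrees
focal = H / (2 * np.tan(np.deg2rad(fovy) / 2))
print(focal)  # ~692.8 px: half the image height divided by tan(fovy / 2)
```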
nerf/network.py
CHANGED
@@ -37,26 +37,22 @@ class NeRFNetwork(NeRFRenderer):
                  opt,
                  num_layers=5,
                  hidden_dim=128,
-                 num_layers_bg=
-                 hidden_dim_bg=
+                 num_layers_bg=2,
+                 hidden_dim_bg=64,
                  ):

         super().__init__(opt)

         self.num_layers = num_layers
         self.hidden_dim = hidden_dim
-
         self.encoder, self.in_dim = get_encoder('frequency', input_dim=3)
-
         self.sigma_net = MLP(self.in_dim, 4, hidden_dim, num_layers, bias=True)

         # background network
         if self.bg_radius > 0:
             self.num_layers_bg = num_layers_bg
             self.hidden_dim_bg = hidden_dim_bg
-
-            self.encoder_bg, self.in_dim_bg = get_encoder('tiledgrid', input_dim=2)
-
+            self.encoder_bg, self.in_dim_bg = get_encoder('frequency', input_dim=2)
             self.bg_net = MLP(self.in_dim_bg, 3, hidden_dim_bg, num_layers_bg, bias=True)

         else:
@@ -84,7 +80,7 @@ class NeRFNetwork(NeRFRenderer):
         return sigma, albedo

     # ref: https://github.com/zhaofuq/Instant-NSR/blob/main/nerf/network_sdf.py#L192
-    def
+    def finite_difference_normal(self, x, epsilon=5e-4):
         # x: [N, 3]
         dx_pos, _ = self.common_forward((x + torch.tensor([[epsilon, 0.00, 0.00]], device=x.device)).clamp(-self.bound, self.bound))
         dx_neg, _ = self.common_forward((x + torch.tensor([[-epsilon, 0.00, 0.00]], device=x.device)).clamp(-self.bound, self.bound))
@@ -116,7 +112,7 @@ class NeRFNetwork(NeRFRenderer):
         # query normal

         # sigma, albedo = self.common_forward(x)
-        # normal = self.
+        # normal = self.finite_difference_normal(x)

         with torch.enable_grad():
             x.requires_grad_(True)
@@ -128,11 +124,6 @@ class NeRFNetwork(NeRFRenderer):
         normal = safe_normalize(normal)
         normal[torch.isnan(normal)] = 0

-        # light direction (random if not provided)
-        if l is None:
-            l = torch.randn(3, device=x.device, dtype=torch.float)
-            l = safe_normalize(l)
-
         # lambertian shading
         lambertian = ratio + (1 - ratio) * (normal @ -l).clamp(min=0) # [N,]
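The newly named `finite_difference_normal` estimates normals from central differences of the density. A self-contained sketch of the idea with a toy density in place of `common_forward`; the sign convention here (normal = -grad(sigma)) is an assumption of the sketch:

```python
import torch
import torch.nn.functional as F

def toy_sigma(x):
    # placeholder density: a soft sphere of radius 0.5 around the origin
    return torch.relu(0.5 - x.norm(dim=-1))

def finite_difference_normal(x, sigma_fn, epsilon=5e-4):
    # x: [N, 3]; central difference of the density along each axis
    offsets = epsilon * torch.eye(3)
    grads = [0.5 * (sigma_fn(x + offsets[i]) - sigma_fn(x - offsets[i])) / epsilon
             for i in range(3)]
    normal = -torch.stack(grads, dim=-1)  # density falls off outward, so -grad points outward
    return F.normalize(normal, dim=-1)

x = torch.tensor([[0.3, 0.0, 0.0]])
print(finite_difference_normal(x, toy_sigma))  # ~[1, 0, 0], i.e. outward along +x
```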
nerf/network_grid.py
CHANGED
@@ -87,7 +87,7 @@ class NeRFNetwork(NeRFRenderer):
         return sigma, albedo

     # ref: https://github.com/zhaofuq/Instant-NSR/blob/main/nerf/network_sdf.py#L192
-    def
+    def finite_difference_normal(self, x, epsilon=5e-4):
         # x: [N, 3]
         dx_pos, _ = self.common_forward((x + torch.tensor([[epsilon, 0.00, 0.00]], device=x.device)).clamp(-self.bound, self.bound))
         dx_neg, _ = self.common_forward((x + torch.tensor([[-epsilon, 0.00, 0.00]], device=x.device)).clamp(-self.bound, self.bound))
@@ -119,7 +119,7 @@ class NeRFNetwork(NeRFRenderer):
         # query normal

         sigma, albedo = self.common_forward(x)
-        normal = self.
+        normal = self.finite_difference_normal(x)

         # with torch.enable_grad():
         #     x.requires_grad_(True)
@@ -131,11 +131,6 @@ class NeRFNetwork(NeRFRenderer):
         normal = safe_normalize(normal)
         normal[torch.isnan(normal)] = 0

-        # light direction (random if not provided)
-        if l is None:
-            l = torch.randn(3, device=x.device, dtype=torch.float)
-            l = safe_normalize(l)
-
         # lambertian shading
         lambertian = ratio + (1 - ratio) * (normal @ -l).clamp(min=0) # [N,]
nerf/network_tcnn.py
CHANGED
@@ -133,11 +133,6 @@ class NeRFNetwork(NeRFRenderer):
         if not has_grad:
             normal = normal.detach()

-        # light direction (random if not provided)
-        if l is None:
-            l = torch.randn(3, device=x.device, dtype=torch.float)
-            l = l / (torch.norm(l, dim=-1, keepdim=True) + 1e-9)
-
         # lambertian shading
         lambertian = ratio + (1 - ratio) * (normal @ l).clamp(min=0) # [N,]
nerf/provider.py
CHANGED
@@ -35,37 +35,42 @@ def visualize_poses(poses, size=0.1):

     trimesh.Scene(objects).show()

-def get_view_direction(thetas, phis):
-    # phis [B,];
-    # front = 0 0
-    # side (left) = 1
-    # back = 2 180
-    # side (right) = 3
-    # top = 4
-    # bottom = 5
-    res = torch.zeros(
+def get_view_direction(thetas, phis, overhead, front):
+    # phis [B,]; thetas: [B,]
+    # front = 0         [0, front)
+    # side (left) = 1   [front, 180)
+    # back = 2          [180, 180+front)
+    # side (right) = 3  [180+front, 360)
+    # top = 4           [0, overhead]
+    # bottom = 5        [180-overhead, 180]
+    res = torch.zeros(thetas.shape[0], dtype=torch.long)
     # first determine by phis
-    res[(phis <
-    res[(phis >=
-    res[(phis >= np.pi) & (phis < (
-    res[(phis >= (
+    res[(phis < front)] = 0
+    res[(phis >= front) & (phis < np.pi)] = 1
+    res[(phis >= np.pi) & (phis < (np.pi + front))] = 2
+    res[(phis >= (np.pi + front))] = 3
     # override by thetas
-    res[thetas
-    res[thetas >= (
+    res[thetas <= overhead] = 4
+    res[thetas >= (np.pi - overhead)] = 5
     return res


-def rand_poses(size, device,
+def rand_poses(size, device, radius_range=[1, 1.5], theta_range=[0, 150], phi_range=[0, 360], return_dirs=False, angle_overhead=30, angle_front=60):
     ''' generate random poses from an orbit camera
     Args:
         size: batch size of generated poses.
         device: where to allocate the output.
         radius: camera radius
-        theta_range: [min, max], should be in [0,
-        phi_range: [min, max], should be in [0, 2
+        theta_range: [min, max], should be in [0, pi]
+        phi_range: [min, max], should be in [0, 2 * pi]
     Return:
         poses: [size, 4, 4]
     '''
+
+    theta_range = np.deg2rad(theta_range)
+    phi_range = np.deg2rad(phi_range)
+    angle_overhead = np.deg2rad(angle_overhead)
+    angle_front = np.deg2rad(angle_front)

     radius = torch.rand(size, device=device) * (radius_range[1] - radius_range[0]) + radius_range[0]
     thetas = torch.rand(size, device=device) * (theta_range[1] - theta_range[0]) + theta_range[0]
@@ -94,14 +99,19 @@ def rand_poses(size, device, return_dirs=False, radius_range=[1, 1.5], theta_ran
     poses[:, :3, 3] = centers

     if return_dirs:
-        dirs = get_view_direction(thetas, phis)
+        dirs = get_view_direction(thetas, phis, angle_overhead, angle_front)
     else:
         dirs = None

     return poses, dirs


-def circle_poses(device,
+def circle_poses(device, radius=1.25, theta=60, phi=0, return_dirs=False, angle_overhead=30, angle_front=60):
+
+    theta = np.deg2rad(theta)
+    phi = np.deg2rad(phi)
+    angle_overhead = np.deg2rad(angle_overhead)
+    angle_front = np.deg2rad(angle_front)

     thetas = torch.FloatTensor([theta]).to(device)
     phis = torch.FloatTensor([phi]).to(device)
@@ -123,7 +133,7 @@ def circle_poses(device, return_dirs=False, radius=1.25, theta=np.pi/2, phi=0):
     poses[:, :3, 3] = centers

     if return_dirs:
-        dirs = get_view_direction(thetas, phis)
+        dirs = get_view_direction(thetas, phis, angle_overhead, angle_front)
     else:
         dirs = None

@@ -160,20 +170,20 @@ class NeRFDataset:

         if self.training:
             # random pose on the fly
-            poses, dirs = rand_poses(B, self.device, return_dirs=self.opt.dir_text,
+            poses, dirs = rand_poses(B, self.device, radius_range=self.radius_range, return_dirs=self.opt.dir_text, angle_overhead=self.opt.angle_overhead, angle_front=self.opt.angle_front)

             # random focal
             fov = random.random() * (self.fovy_range[1] - self.fovy_range[0]) + self.fovy_range[0]
-            focal = self.H / (2 * np.tan(np.
+            focal = self.H / (2 * np.tan(np.deg2rad(fov) / 2))
             intrinsics = np.array([focal, focal, self.cx, self.cy])
         else:
             # circle pose
-            phi = (index[0] / self.size) *
-            poses, dirs = circle_poses(self.device,
+            phi = (index[0] / self.size) * 360
+            poses, dirs = circle_poses(self.device, radius=self.radius_range[1] * 1.2, theta=60, phi=phi, return_dirs=self.opt.dir_text, angle_overhead=self.opt.angle_overhead, angle_front=self.opt.angle_front)

             # fixed focal
             fov = (self.fovy_range[1] + self.fovy_range[0]) / 2
-            focal = self.H / (2 * np.tan(np.
+            focal = self.H / (2 * np.tan(np.deg2rad(fov) / 2))
             intrinsics = np.array([focal, focal, self.cx, self.cy])
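A quick usage check of the new view-direction binning, using the same thresholds as `get_view_direction` above (angles already converted to radians, as done in `rand_poses`):

```python
import numpy as np
import torch

def get_view_direction(thetas, phis, overhead, front):
    # 0 = front, 1 = left side, 2 = back, 3 = right side, 4 = top, 5 = bottom
    res = torch.zeros(thetas.shape[0], dtype=torch.long)
    res[phis < front] = 0
    res[(phis >= front) & (phis < np.pi)] = 1
    res[(phis >= np.pi) & (phis < (np.pi + front))] = 2
    res[phis >= (np.pi + front)] = 3
    res[thetas <= overhead] = 4            # elevation bins override the azimuth bins
    res[thetas >= (np.pi - overhead)] = 5
    return res

thetas = torch.tensor([np.pi / 2, np.pi / 2, 0.1])   # polar angle from the +z axis
phis = torch.tensor([0.1, np.pi, 1.0])               # azimuth
print(get_view_direction(thetas, phis, np.deg2rad(30), np.deg2rad(60)))
# tensor([0, 2, 4]) -> front view, back view, overhead view
```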
nerf/renderer.py
CHANGED
@@ -448,6 +448,12 @@ class NeRFRenderer(nn.Module):
         # pre-calculate near far
         nears, fars = raymarching.near_far_from_aabb(rays_o, rays_d, self.aabb_train if self.training else self.aabb_infer)

+        # random sample light_d if not provided
+        if light_d is None:
+            # gaussian noise around the ray origin, so the light always face the view dir (avoid dark face)
+            light_d = - (rays_o[0] + torch.randn(3, device=device, dtype=torch.float))
+            light_d = safe_normalize(light_d)
+
         results = {}

         if self.training:
@@ -476,11 +482,6 @@ class NeRFRenderer(nn.Module):

         # allocate outputs
         dtype = torch.float32
-
-        # fix light for all samples if not provided
-        if light_d is None:
-            light_d = torch.randn(3, device=device, dtype=torch.float)
-            light_d = safe_normalize(light_d)

         weights_sum = torch.zeros(N, dtype=dtype, device=device)
         depth = torch.zeros(N, dtype=dtype, device=device)
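A standalone illustration of the new `light_d` sampling: the random light direction is gaussian noise centered on the negated first ray origin, so on average it points from the camera side toward the scene. This is a sketch only; `safe_normalize` is replaced by an explicit norm:

```python
import torch

def sample_light_d(rays_o):
    # rays_o: [N, 3] ray origins. Noise around the first origin, negated and
    # normalized -- a random unit direction centered on -rays_o[0].
    light_d = -(rays_o[0] + torch.randn(3))
    return light_d / (light_d.norm() + 1e-20)

rays_o = torch.zeros(8, 3)
rays_o[:, 2] = 2.0                 # camera roughly 2 units along +z
print(sample_light_d(rays_o))      # random unit vector centered on (0, 0, -1)
```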
nerf/utils.py
CHANGED
@@ -365,11 +365,11 @@ class Trainer(object):
         # alphas = alphas ** 2 # skewed entropy, favors 0 over 1
         loss_entropy = (- alphas * torch.log2(alphas) - (1 - alphas) * torch.log2(1 - alphas)).mean()

-        loss = loss_guidance +
+        loss = loss_guidance + self.opt.lambda_entropy * loss_entropy

         if 'loss_orient' in outputs:
             loss_orient = outputs['loss_orient']
-            loss = loss +
+            loss = loss + self.opt.lambda_orient * loss_orient

         return pred_rgb, pred_ws, loss

@@ -398,7 +398,7 @@ class Trainer(object):
         # alphas = alphas ** 2 # skewed entropy, favors 0 over 1
         loss_entropy = (- alphas * torch.log2(alphas) - (1 - alphas) * torch.log2(1 - alphas)).mean()

-        loss =
+        loss = self.opt.lambda_entropy * loss_entropy

         return pred_rgb, pred_depth, loss

@@ -638,7 +638,7 @@ class Trainer(object):
         return outputs

     def train_one_epoch(self, loader):
-        self.log(f"==> Start Training Epoch {self.epoch}, lr={self.optimizer.param_groups[0]['lr']:.6f} ...")
+        self.log(f"==> Start Training {self.workspace} Epoch {self.epoch}, lr={self.optimizer.param_groups[0]['lr']:.6f} ...")

         total_loss = 0
         if self.local_rank == 0 and self.report_metric_at_train:
@@ -722,7 +722,7 @@ class Trainer(object):


     def evaluate_one_epoch(self, loader, name=None):
-        self.log(f"++> Evaluate at epoch {self.epoch} ...")
+        self.log(f"++> Evaluate {self.workspace} at epoch {self.epoch} ...")

         if name is None:
             name = f'{self.name}_ep{self.epoch:04d}'
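For context, a standalone sketch of the alpha-entropy regularizer that the new `--lambda_entropy` flag scales: the binary entropy of per-ray accumulated alphas, which pushes each ray toward fully opaque or fully empty. Clamping is added here to avoid log(0); toy values only:

```python
import torch

def alpha_entropy(weights_sum, eps=1e-6):
    # weights_sum: accumulated alpha per ray, values in [0, 1]
    alphas = weights_sum.clamp(eps, 1 - eps)
    return (-alphas * torch.log2(alphas) - (1 - alphas) * torch.log2(1 - alphas)).mean()

alphas = torch.tensor([0.01, 0.5, 0.99])
loss_entropy = alpha_entropy(alphas)
print(loss_entropy)                      # ~0.39, dominated by the ambiguous 0.5 ray
loss = 1e-4 * loss_entropy               # scaled by lambda_entropy, as combined in train_step
```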
readme.md
CHANGED
@@ -4,14 +4,14 @@ A pytorch implementation of the text-to-3D model **Dreamfusion**, powered by the

 The original paper's project page: [_DreamFusion: Text-to-3D using 2D Diffusion_](https://dreamfusion3d.github.io/).

-Examples generated from text prompt `a
+Examples generated from text prompt `a high quality photo of a pineapple` viewed with the GUI in real time:

 https://user-images.githubusercontent.com/25863658/194241493-f3e68f78-aefe-479e-a4a8-001424a61b37.mp4

 ### [Gallery](https://github.com/ashawkey/stable-dreamfusion/issues/1) | [Update Logs](assets/update_logs.md)

 # Important Notice
-This project is a **work-in-progress**, and contains lots of differences from the paper. Also, many features are still not implemented now. **The current generation quality cannot match the results from the original paper, and still fail badly
+This project is a **work-in-progress**, and contains lots of differences from the paper. Also, many features are still not implemented now. **The current generation quality cannot match the results from the original paper, and many prompts still fail badly!**


 ## Notable differences from the paper
@@ -83,7 +83,7 @@ python main_nerf.py --text "a hamburger" --workspace trial_clip -O --guidance cl
 python main_nerf.py --text "a hamburger" --workspace trial_clip -O --test --gui --guidance clip
 ```

-# Code organization
+# Code organization & Advanced tips

 This is a simple description of the most important implementation details.
 If you are interested in improving this repo, this might be a starting point.
@@ -101,14 +101,50 @@ w = (1 - self.scheduler.alphas_cumprod[t]).to(self.device)
 grad = w * (noise_pred - noise)
 latents.backward(gradient=grad, retain_graph=True)
 ```
-* Other regularizations are in `./nerf/utils.py > Trainer > train_step`.
+* Other regularizations are in `./nerf/utils.py > Trainer > train_step`.
+    * The generation seems quite sensitive to regularizations on weights_sum (alphas for each ray). The original opacity loss tends to make NeRF disappear (zero density everywhere), so we use an entropy loss to replace it for now (it encourages alpha to be either 0 or 1).
 * NeRF Rendering core function: `./nerf/renderer.py > NeRFRenderer > run_cuda`.
+* Shading & normal evaluation: `./nerf/network*.py > NeRFNetwork > forward`. The current implementation harms training and is disabled.
+    * Use `--albedo_iters 1000` to enable random shading mode after 1000 steps, sampled from albedo, lambertian, and textureless.
+    * Light direction: the current implementation uses a plane light source instead of a point light source.
+* View-dependent prompting: `./nerf/provider.py > get_view_direction`.
+    * Use `--angle_overhead, --angle_front` to set the borders. How to better divide front/back/side regions?
+* Network backbone (`./nerf/network*.py`) can be chosen by the `--backbone` option, but `tcnn` and `vanilla` are not well tested.
+    * The occupancy-grid-based training acceleration (instant-ngp like) may harm the generation progress, since once a grid cell is marked as empty, rays won't pass it later.
+* Spatial density bias (gaussian density blob): `./nerf/network*.py > NeRFNetwork > gaussian`.

 # Acknowledgement

 * The amazing original work: [_DreamFusion: Text-to-3D using 2D Diffusion_](https://dreamfusion3d.github.io/).
+    ```
+    @article{poole2022dreamfusion,
+        author = {Poole, Ben and Jain, Ajay and Barron, Jonathan T. and Mildenhall, Ben},
+        title = {DreamFusion: Text-to-3D using 2D Diffusion},
+        journal = {arXiv},
+        year = {2022},
+    }
+    ```

 * Huge thanks to the [Stable Diffusion](https://github.com/CompVis/stable-diffusion) and the [diffusers](https://github.com/huggingface/diffusers) library.

+    ```
+    @misc{rombach2021highresolution,
+        title={High-Resolution Image Synthesis with Latent Diffusion Models},
+        author={Robin Rombach and Andreas Blattmann and Dominik Lorenz and Patrick Esser and Björn Ommer},
+        year={2021},
+        eprint={2112.10752},
+        archivePrefix={arXiv},
+        primaryClass={cs.CV}
+    }
+
+    @misc{von-platen-etal-2022-diffusers,
+        author = {Patrick von Platen and Suraj Patil and Anton Lozhkov and Pedro Cuenca and Nathan Lambert and Kashif Rasul and Mishig Davaadorj and Thomas Wolf},
+        title = {Diffusers: State-of-the-art diffusion models},
+        year = {2022},
+        publisher = {GitHub},
+        journal = {GitHub repository},
+        howpublished = {\url{https://github.com/huggingface/diffusers}}
+    }
+    ```

 * The GUI is developed with [DearPyGui](https://github.com/hoffstadt/DearPyGui).