lpiccinelli
/

unidepth-v2-vitl14

@@ -1,9 +1,9 @@
 ---
 library_name: UniDepth
 tags:
 - monocular-metric-depth-estimation
 - pytorch_model_hub_mixin
-- model_hub_mixin
 ---
 This model has been pushed to the Hub using the [PyTorchModelHubMixin](https://huggingface.co/docs/huggingface_hub/package_reference/mixins#huggingface_hub.PyTorchModelHubMixin) integration:

 ---
 library_name: UniDepth
 tags:
+- model_hub_mixin
 - monocular-metric-depth-estimation
 - pytorch_model_hub_mixin
 ---
 This model has been pushed to the Hub using the [PyTorchModelHubMixin](https://huggingface.co/docs/huggingface_hub/package_reference/mixins#huggingface_hub.PyTorchModelHubMixin) integration:

config.json CHANGED Viewed

@@ -1,24 +1,49 @@
 {
   "data": {
     "image_shape": [
-      420,
-      560
     ],
-    "shape_constraints": {
-      "patch_size": 14,
-      "pixels_bounds": [
-        1400,
-        2400
-      ],
-      "pixels_bounds_ori": [
-        1400,
-        2400
-      ],
-      "ratio_bounds": [
-        0.66,
-        2.0
-      ]
-    }
   },
   "eps": 1e-06,
   "generic": {
@@ -27,22 +52,52 @@
   },
   "model": {
     "expansion": 4,
     "name": "UniDepthV2",
     "num_heads": 8,
     "pixel_decoder": {
       "depths": [
-        6,
-        0,
-        0
       ],
       "dropout": 0.0,
-      "hidden_dim": 512
     },
     "pixel_encoder": {
       "depths": [
-        21,
-        22,
-        23,
         24
       ],
       "embed_dim": 1024,
@@ -72,18 +127,86 @@
         1024,
         1024
       ],
       "name": "dinov2_vitl14",
       "output_idx": [
-        21,
-        22,
-        23,
         24
       ],
       "patch_size": 14,
       "pretrained": null,
       "stacking_fn": "last",
-      "use_norm": true
     }
   },
-  "training": {}
 }

 {
   "data": {
+    "augmentations": {
+      "blur_p": 0.2,
+      "flip_p": 0.5,
+      "gamma_p": 0.8,
+      "grayscale_p": 0.2,
+      "jitter_p": 0.8,
+      "random_blur": 2.0,
+      "random_gamma": 0.2,
+      "random_jitter": 0.4,
+      "random_scale": 2.0,
+      "shape_constraints": {
+        "height_min": 15,
+        "pixels_max": 600000,
+        "pixels_min": 200000,
+        "ratio_bounds": [
+          0.5,
+          2.5
+        ],
+        "sample": true,
+        "shape_mult": 14,
+        "width_min": 15
+      },
+      "test_context": 1.0
+    },
+    "crop": "garg",
+    "data_root": "datasets",
     "image_shape": [
+      480,
+      640
     ],
+    "normalization": "imagenet",
+    "num_copies": 2,
+    "num_frames": 1,
+    "sampling": {
+      "ETH3D": 1.0,
+      "Waymo": 1.0
+    },
+    "train_datasets": [
+      "ETH3D",
+      "Waymo"
+    ],
+    "val_datasets": [
+      "IBims"
+    ]
   },
   "eps": 1e-06,
   "generic": {
   },
   "model": {
     "expansion": 4,
+    "layer_scale": 1.0,
     "name": "UniDepthV2",
     "num_heads": 8,
     "pixel_decoder": {
       "depths": [
+        2,
+        2,
+        2
       ],
       "dropout": 0.0,
+      "hidden_dim": 512,
+      "kernel_size": 3,
+      "name": "Decoder",
+      "out_dim": 64
     },
     "pixel_encoder": {
+      "cls_token_embed_dims": [
+        1024,
+        1024,
+        1024,
+        1024,
+        1024,
+        1024,
+        1024,
+        1024,
+        1024,
+        1024,
+        1024,
+        1024,
+        1024,
+        1024,
+        1024,
+        1024,
+        1024,
+        1024,
+        1024,
+        1024,
+        1024,
+        1024,
+        1024,
+        1024
+      ],
       "depths": [
+        6,
+        12,
+        18,
         24
       ],
       "embed_dim": 1024,
         1024,
         1024
       ],
+      "freeze_norm": true,
+      "frozen_stages": 0,
+      "lr": 2e-06,
       "name": "dinov2_vitl14",
+      "num_register_tokens": 0,
       "output_idx": [
+        6,
+        12,
+        18,
         24
       ],
       "patch_size": 14,
       "pretrained": null,
       "stacking_fn": "last",
+      "use_norm": true,
+      "wd": 0.1
     }
   },
+  "training": {
+    "batch_size": 8,
+    "clipping": 1.0,
+    "cycle_beta": false,
+    "drop_path": 0.0,
+    "ema": true,
+    "f16": true,
+    "ld": 1.0,
+    "losses": {
+      "camera": {
+        "alpha": 1.0,
+        "fn": "l2",
+        "gamma": 1.0,
+        "input_fn": "linear",
+        "name": "Regression",
+        "output_fn": "sqrt",
+        "weight": 0.25
+      },
+      "confidence": {
+        "alpha": 1.0,
+        "fn": "l1",
+        "gamma": 1.0,
+        "input_fn": "linear",
+        "name": "Regression",
+        "output_fn": "sqrt",
+        "weight": 0.1
+      },
+      "depth": {
+        "dims": [
+          -2,
+          -1
+        ],
+        "input_fn": "log",
+        "integrated": 0.15,
+        "name": "SILog",
+        "output_fn": "sqrt",
+        "weight": 1.0
+      },
+      "invariance": {
+        "name": "SelfDistill",
+        "output_fn": "sqrt",
+        "weight": 0.1
+      },
+      "ssi": {
+        "input_fn": "log1i",
+        "min_samples": 6,
+        "name": "EdgeGuidedLocalSSI",
+        "output_fn": "sqrt",
+        "use_global": true,
+        "weight": 1.0
+      }
+    },
+    "lr": 0.0001,
+    "lr_final": 1e-06,
+    "lr_warmup": 1.0,
+    "n_iters": 300000,
+    "nsteps_accumulation_gradient": 2,
+    "pretrained": "/srv/beegfs02/scratch/3d_tracking/data/pami2025_unidepth/checkpoints/unidepth-v2-vitl14.pt",
+    "use_checkpoint": false,
+    "validation_interval": 2,
+    "warmup_iters": 75000,
+    "wd": 0.1,
+    "wd_final": 0.1
+  }
 }

model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:13952af59d28d21d4de5873f8b8bf6d679c5dd031dbf7f9e5818cbeca579f6af
-size 1452916608

 version https://git-lfs.github.com/spec/v1
+oid sha256:ba73d3de735302ccc64a50f1e557122050c4b1893e6060b28dba05d6af3e67c6
+size 1415383604