Spaces:

atalaydenknalbant
/

DINOv3

Running on Zero

App Files Files Community

atalaydenknalbant committed on Aug 15

Commit

5bec47d

verified ·

1 Parent(s): 951e2b3

Update app.py

Browse files

Files changed (1) hide show

app.py +119 -44

app.py CHANGED Viewed

@@ -1,19 +1,22 @@
-import spaces
 import os
 import io
 import time
 import json
-from typing import List, Tuple
 import tempfile
 from uuid import uuid4
 import gradio as gr
 from PIL import Image
 import numpy as np
 import torch
 from transformers import AutoImageProcessor, AutoModel
-# Models
 MODELS = {
     "ViT-B/16 LVD-1689M": "facebook/dinov3-vitb16-pretrain-lvd1689m",
     "ViT-L/16 LVD-1689M": "facebook/dinov3-vitl16-pretrain-lvd1689m",
@@ -22,20 +25,27 @@ MODELS = {
 }
 DEFAULT_MODEL = "ViT-B/16 LVD-1689M"
-# Tokens
-HF_TOKEN = os.getenv("HF_TOKEN", None)  # set in Space Secrets
-def _gpu_duration_single(image: Image.Image, *_, **__) -> int:
-    # simple 120s booking for single image
     return 120
-def _gpu_duration_gallery(images: List[Image.Image], *_, **__) -> int:
-    # 45s per image plus buffer
-    n = max(1, len(images) if images else 1)
-    return min(600, 45 * n + 60)
 def _load(model_id: str):
-    # token works on current Transformers
     processor = AutoImageProcessor.from_pretrained(
         model_id,
         use_fast=True,
@@ -46,27 +56,38 @@ def _load(model_id: str):
         torch_dtype=torch.float16,
         low_cpu_mem_usage=True,
         token=HF_TOKEN if HF_TOKEN else None,
-    ).to("cuda").eval()
     return processor, model
 def _extract_core(image: Image.Image, model_id: str, pooling: str, want_overlay: bool):
     t0 = time.time()
     processor, model = _load(model_id)
-    # safer move to cuda for BatchFeature
-    inputs = processor(images=image, return_tensors="pt")
-    inputs = {k: v.to("cuda") for k, v in inputs.items()}
     with torch.amp.autocast("cuda", dtype=torch.float16), torch.inference_mode():
-        out = model(**inputs)
     if pooling == "CLS":
         if getattr(out, "pooler_output", None) is not None:
             emb = out.pooler_output[0]
         else:
             emb = out.last_hidden_state[0, 0]
     else:
-        # mean of patch tokens
         if out.last_hidden_state.ndim == 3:
             num_regs = getattr(model.config, "num_register_tokens", 0)
             patch_tokens = out.last_hidden_state[0, 1 + num_regs :]
@@ -76,12 +97,22 @@ def _extract_core(image: Image.Image, model_id: str, pooling: str, want_overlay:
             emb = feat.mean(dim=(1, 2))
     emb = emb.float().cpu().numpy()
     overlay = None
     if want_overlay and out.last_hidden_state.ndim == 3:
         num_regs = getattr(model.config, "num_register_tokens", 0)
-        patch_tokens = out.last_hidden_state[0, 1 + num_regs :]
-        H = W = inputs.pixel_values.shape[-1] // getattr(model.config, "patch_size", 16)
-        mags = patch_tokens.norm(dim=1).reshape(H, W)
         mags = (mags - mags.min()) / max(1e-8, (mags.max() - mags.min()))
         m = (mags.cpu().numpy() * 255).astype(np.uint8)
         heat = Image.fromarray(m).resize(image.size, Image.BILINEAR).convert("RGB")
@@ -95,47 +126,77 @@ def _extract_core(image: Image.Image, model_id: str, pooling: str, want_overlay:
     }
     return emb, overlay, meta
 @spaces.GPU(duration=_gpu_duration_single)
 def extract_embedding(image: Image.Image, model_name: str, pooling: str, want_overlay: bool):
     if image is None:
         return None, "[]", {"error": "No image"}, None
     if not torch.cuda.is_available():
-        raise RuntimeError("CUDA not available. Ensure ZeroGPU is active.")
     model_id = MODELS[model_name]
     emb, overlay, meta = _extract_core(image, model_id, pooling, want_overlay)
-    # preview string
     head = ", ".join(f"{x:.4f}" for x in emb[:16])
     preview = f"[{head}{', ...' if emb.size > 16 else ''}]"
-    # write to a temp file and return the path for gr.File
-    tmp_path = os.path.join(tempfile.gettempdir(), f"embedding_{uuid4().hex}.npy")
-    np.save(tmp_path, emb.astype(np.float32), allow_pickle=False)
-    # gr.Image, gr.Textbox, gr.JSON, gr.File
-    return overlay if overlay else image, preview, meta, tmp_path
 @spaces.GPU(duration=_gpu_duration_gallery)
-def batch_similarity(images: List[Image.Image], model_name: str, pooling: str):
-    if not images or len(images) < 2:
         return "Upload at least 2 images", None
     if not torch.cuda.is_available():
-        raise RuntimeError("CUDA not available. Ensure ZeroGPU is selected.")
     model_id = MODELS[model_name]
     embs = []
-    for img in images:
         e, _, _ = _extract_core(img, model_id, pooling, want_overlay=False)
         embs.append(e)
     X = np.vstack(embs).astype(np.float32)
     Xn = X / np.clip(np.linalg.norm(X, axis=1, keepdims=True), 1e-8, None)
     S = Xn @ Xn.T
-    path = "cosine_similarity_matrix.csv"
-    np.savetxt(path, S, delimiter=",", fmt="%.6f")
-    return f"Computed {len(embs)} embeddings", path
 with gr.Blocks() as app:
-    gr.Markdown("# DINOv3 on ZeroGPU")
     with gr.Tab("Single"):
         with gr.Row():
             with gr.Column():
@@ -152,18 +213,32 @@ with gr.Blocks() as app:
         run_btn.click(extract_embedding, [img, model_dd, pooling, overlay], [out_img, preview, meta, download])
     with gr.Tab("Gallery"):
-        gal = gr.Gallery(label="Images", columns=4, height=360, allow_preview=True)
         model_dd2 = gr.Dropdown(choices=list(MODELS.keys()), value=DEFAULT_MODEL, label="Backbone")
         pooling2 = gr.Radio(["CLS", "Mean of patch tokens"], value="CLS", label="Pooling")
         go = gr.Button("Compute cosine on GPU")
         status = gr.Textbox(label="Status", interactive=False)
         csv = gr.File(label="cosine_similarity_matrix.csv")
-        go.click(batch_similarity, [gal, model_dd2, pooling2], [status, csv])
-    gr.Markdown(
-        "ViT 7B is gated. Accept the terms on the model page and set `HF_TOKEN` as a Space secret. "
-        "This app always runs on GPU via ZeroGPU decorated calls."
-    )
 if __name__ == "__main__":
     app.queue().launch()

 import os
 import io
 import time
 import json
 import tempfile
 from uuid import uuid4
+from typing import List, Tuple
 import gradio as gr
 from PIL import Image
 import numpy as np
 import torch
 from transformers import AutoImageProcessor, AutoModel
+import spaces
+# ---------------------------
+# Models and config
+# ---------------------------
 MODELS = {
     "ViT-B/16 LVD-1689M": "facebook/dinov3-vitb16-pretrain-lvd1689m",
     "ViT-L/16 LVD-1689M": "facebook/dinov3-vitl16-pretrain-lvd1689m",
 }
 DEFAULT_MODEL = "ViT-B/16 LVD-1689M"
+HF_TOKEN = os.getenv("HF_TOKEN", None)  # set in Space Secrets after requesting gated access
+# ---------------------------
+# ZeroGPU booking helpers
+# ---------------------------
+def _gpu_duration_single(image: Image.Image, *_args, **_kwargs) -> int:
     # Duration helper passed as `duration=` to the @spaces.GPU decorator on
     # extract_embedding. Every argument is ignored; the result is constant.
     # Presumably invoked with the same arguments as the decorated call --
     # confirm against the `spaces` package documentation.
+    # 120 seconds is plenty for single image feature extraction
     return 120
+def _gpu_duration_gallery(files: List[str], *_args, **_kwargs) -> int:
     # Duration helper passed as `duration=` to the @spaces.GPU decorator on
     # batch_similarity. The result grows with the number of entries in
     # `files`; None or an empty list is counted as one item.
+    # 35s per image + 30s buffer capped at 10 minutes
+    n = max(1, len(files) if files else 1)
     # Examples: 2 files -> 100; 17 or more files -> 600 (the cap).
+    return min(600, 35 * n + 30)
+# ---------------------------
+# Model loading and core logic
+# ---------------------------
 def _load(model_id: str):
+    # Use token for gated checkpoints
     processor = AutoImageProcessor.from_pretrained(
         model_id,
         use_fast=True,
         torch_dtype=torch.float16,
         low_cpu_mem_usage=True,
         token=HF_TOKEN if HF_TOKEN else None,
+    )
+    model.to("cuda").eval()
     return processor, model
 def _extract_core(image: Image.Image, model_id: str, pooling: str, want_overlay: bool):
+    """
+    Returns (embedding np.ndarray, optional overlay PIL.Image, meta dict)
+    """
     t0 = time.time()
     processor, model = _load(model_id)
+    # Keep BatchFeature when possible, but handle dict too
+    bf = processor(images=image, return_tensors="pt")
+    if hasattr(bf, "to"):
+        bf = bf.to("cuda")
+        pixel_values = bf["pixel_values"]
+    else:
+        bf = {k: v.to("cuda") for k, v in bf.items()}
+        pixel_values = bf["pixel_values"]
     with torch.amp.autocast("cuda", dtype=torch.float16), torch.inference_mode():
+        out = model(**bf)
+    # Embedding pooling
     if pooling == "CLS":
         if getattr(out, "pooler_output", None) is not None:
             emb = out.pooler_output[0]
         else:
             emb = out.last_hidden_state[0, 0]
     else:
+        # mean of patch tokens or mean over H,W for conv features
         if out.last_hidden_state.ndim == 3:
             num_regs = getattr(model.config, "num_register_tokens", 0)
             patch_tokens = out.last_hidden_state[0, 1 + num_regs :]
             emb = feat.mean(dim=(1, 2))
     emb = emb.float().cpu().numpy()
+    # Optional simple heat overlay for ViT
     overlay = None
     if want_overlay and out.last_hidden_state.ndim == 3:
         num_regs = getattr(model.config, "num_register_tokens", 0)
+        patch_tokens = out.last_hidden_state[0, 1 + num_regs :]   # [N_patches, D]
+        num_patches = patch_tokens.shape[0]
+        # Prefer square grid from token count, else fall back to pixel/patch size
+        h = int(num_patches ** 0.5)
+        w = h
+        if h * w != num_patches:
+            patch = getattr(model.config, "patch_size", 16)
+            h = int(pixel_values.shape[-2] // patch)
+            w = int(pixel_values.shape[-1] // patch)
+        mags = patch_tokens.norm(dim=1).reshape(h, w)
         mags = (mags - mags.min()) / max(1e-8, (mags.max() - mags.min()))
         m = (mags.cpu().numpy() * 255).astype(np.uint8)
         heat = Image.fromarray(m).resize(image.size, Image.BILINEAR).convert("RGB")
     }
     return emb, overlay, meta
+# ---------------------------
+# Single image API (ZeroGPU)
+# ---------------------------
 @spaces.GPU(duration=_gpu_duration_single)
 def extract_embedding(image: Image.Image, model_name: str, pooling: str, want_overlay: bool):
     # Single-image entry point, wired to run_btn.click in the UI.
     #
     # Returns a 4-tuple lining up with the click outputs
     # [out_img, preview, meta, download]:
     #   (image to display, text preview of the first 16 embedding values,
     #    meta dict from _extract_core, path of the saved .npy file).
     # With no input image, a placeholder tuple is returned instead.
     #
     # Raises RuntimeError when CUDA is not available. A model_name that is
     # not a key of MODELS raises KeyError from the lookup below.
     if image is None:
         return None, "[]", {"error": "No image"}, None
     if not torch.cuda.is_available():
+        raise RuntimeError("CUDA not available. Ensure Space hardware is ZeroGPU.")
     model_id = MODELS[model_name]
     emb, overlay, meta = _extract_core(image, model_id, pooling, want_overlay)
+    # Preview + file save for gr.File
     head = ", ".join(f"{x:.4f}" for x in emb[:16])
     preview = f"[{head}{', ...' if emb.size > 16 else ''}]"
     # The uuid4 hex in the file name gives every call its own file in the
     # system temp directory.
     # NOTE(review): nothing in the visible code removes these files later.
+    out_path = os.path.join(tempfile.gettempdir(), f"embedding_{uuid4().hex}.npy")
+    np.save(out_path, emb.astype(np.float32), allow_pickle=False)
+    # Return: gr.Image, gr.Textbox, gr.JSON, gr.File
     # The input image is shown when no overlay was produced.
+    return overlay if overlay else image, preview, meta, out_path
+# ---------------------------
+# Multi image similarity (ZeroGPU)
+# ---------------------------
+def _open_images_from_paths(paths: List[str]) -> List[Image.Image]:
     # Open each path with PIL and convert it to RGB. Paths that fail to open
     # or convert are dropped, so the returned list can be shorter than
     # `paths` and its indices need not match the input indices.
+    imgs: List[Image.Image] = []
+    for p in paths:
+        try:
+            im = Image.open(p).convert("RGB")
+            imgs.append(im)
+        except Exception:
             # NOTE(review): the failure is swallowed with no log or status
             # message, so callers cannot tell which inputs were skipped.
+            pass
+    return imgs
 @spaces.GPU(duration=_gpu_duration_gallery)
+def batch_similarity(files: List[str], model_name: str, pooling: str):
     # Multi-image entry point, wired to go.click in the UI.
     #
     # Embeds every readable image, builds the pairwise cosine-similarity
     # matrix and writes it as CSV into the system temp directory.
     # Returns (status message, CSV path or None), matching the click
     # outputs [status, csv].
     #
     # Raises RuntimeError when CUDA is not available. A model_name that is
     # not a key of MODELS raises KeyError from the lookup below.
+    # files is a list of filepaths from gr.Files
+    paths = files or []
+    if len(paths) < 2:
         return "Upload at least 2 images", None
     if not torch.cuda.is_available():
+        raise RuntimeError("CUDA not available. Ensure Space hardware is ZeroGPU.")
     model_id = MODELS[model_name]
     embs = []
     # NOTE(review): _extract_core calls _load(model_id) on every invocation,
     # so as written the processor/model load is repeated for each image.
+    for img in _open_images_from_paths(paths):
         e, _, _ = _extract_core(img, model_id, pooling, want_overlay=False)
         embs.append(e)
     # NOTE(review): unreadable files are dropped by _open_images_from_paths,
     # so rows/columns of S follow the successfully opened images and may not
     # line up with the positions of the uploaded files.
+    if len(embs) < 2:
+        return "Failed to read or embed images", None
     X = np.vstack(embs).astype(np.float32)
     # Scale each row to unit L2 norm (norm clipped at 1e-8 to avoid dividing
     # by zero); the product of the scaled rows is then the cosine similarity.
     Xn = X / np.clip(np.linalg.norm(X, axis=1, keepdims=True), 1e-8, None)
     S = Xn @ Xn.T
     # Unique file name per call; values are written with 6 decimal places.
+    csv_path = os.path.join(tempfile.gettempdir(), f"cosine_{uuid4().hex}.csv")
+    np.savetxt(csv_path, S, delimiter=",", fmt="%.6f")
+    return f"Computed {len(embs)} embeddings", csv_path
+# ---------------------------
+# UI
+# ---------------------------
 with gr.Blocks() as app:
+    gr.Markdown("# DINOv3 on ZeroGPU — Embeddings and Cosine Similarity")
     with gr.Tab("Single"):
         with gr.Row():
             with gr.Column():
         run_btn.click(extract_embedding, [img, model_dd, pooling, overlay], [out_img, preview, meta, download])
     with gr.Tab("Gallery"):
+        gr.Markdown("Upload multiple images. We compute a cosine similarity matrix on GPU and return a CSV.")
+        # Input as Files so you can multi-upload, plus a Gallery preview
+        files_in = gr.Files(label="Upload images", file_types=["image"], file_count="multiple", type="filepath")
+        gallery_preview = gr.Gallery(label="Preview", columns=4, height=300)
         model_dd2 = gr.Dropdown(choices=list(MODELS.keys()), value=DEFAULT_MODEL, label="Backbone")
         pooling2 = gr.Radio(["CLS", "Mean of patch tokens"], value="CLS", label="Pooling")
         go = gr.Button("Compute cosine on GPU")
         status = gr.Textbox(label="Status", interactive=False)
         csv = gr.File(label="cosine_similarity_matrix.csv")
+        # Simple preview hook
+        def _preview(paths: List[str]):
             # Change handler for files_in: returns the uploaded files as RGB
             # PIL images for gallery_preview. An empty or missing selection
             # yields an empty list; files that fail to open are skipped.
             # NOTE(review): apart from the empty-input guard this repeats
             # the loop in _open_images_from_paths, including the silent
             # `except Exception: pass`.
+            if not paths:
+                return []
+            imgs = []
+            for p in paths:
+                try:
+                    imgs.append(Image.open(p).convert("RGB"))
+                except Exception:
+                    pass
+            return imgs
+        files_in.change(_preview, inputs=files_in, outputs=gallery_preview)
+        go.click(batch_similarity, [files_in, model_dd2, pooling2], [status, csv])
 if __name__ == "__main__":
     # Script entry point: call queue() on the Blocks app, then launch it.
     app.queue().launch()