"""
Search-TTA demo
"""
# ────────────────────────── imports ───────────────────────────────────
import cv2
import gradio as gr
import torch
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
import io
import torchaudio
import spaces # integration with ZeroGPU on hf
from torchvision import transforms
import open_clip
from clip_vision_per_patch_model import CLIPVisionPerPatchModel
from transformers import ClapAudioModelWithProjection
from transformers import ClapProcessor
# ────────────────────────── global config & models ────────────────────
# Run on CUDA when torch reports it as available, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# BioCLIP (ground-image & text encoder)
# The two extra values returned next to the model are discarded; image
# preprocessing is done with the transforms defined in this file instead.
bio_model, _, _ = open_clip.create_model_and_transforms("hf-hub:imageomics/bioclip")
bio_model = bio_model.to(device).eval()
bio_tokenizer = open_clip.get_tokenizer("hf-hub:imageomics/bioclip")
# Satellite patch encoder (CLIP-L-336 per-patch)
sat_model: CLIPVisionPerPatchModel = (
CLIPVisionPerPatchModel.from_pretrained("derektan95/search-tta-sat")
.to(device)
.eval()
)
# Sound CLAP model
sound_model: ClapAudioModelWithProjection = (
ClapAudioModelWithProjection.from_pretrained("derektan95/search-tta-sound")
.to(device)
.eval()
)
# Processor that turns a raw audio track into the inputs expected by sound_model.
sound_processor: ClapProcessor = ClapProcessor.from_pretrained("derektan95/search-tta-sound")
# Sample rate (Hz) audio is resampled to before it is given to sound_processor.
SAMPLE_RATE = 48000
# Scale applied to query/patch similarities: exp(log(1 / 0.07)) = 1 / 0.07 (about 14.29).
# The value is derived from an nn.Parameter, so it is a tensor rather than a float;
# _similarity_heatmap detaches its result before converting to NumPy.
logit_scale = torch.nn.Parameter(torch.ones([]) * np.log(1 / 0.07))
logit_scale = logit_scale.exp()
# Gaussian blur kernel size used by _array_to_pil; (0, 0) turns smoothing off.
blur_kernel = (5,5)
# ────────────────────────── transforms (exact spec) ───────────────────
# Preprocessing for ground-level photos (used by _encode_ground):
# resize to 256x256, center-crop to 224x224, convert to a tensor, then
# normalize each channel with the mean/std listed below.
img_transform = transforms.Compose([
    transforms.Resize((256, 256)),
    transforms.CenterCrop((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Preprocessing for satellite images (used by _encode_sat):
# resize to 336x336 (no crop), convert to a tensor, then apply the same
# per-channel normalization as the ground-image pipeline.
imo_transform = transforms.Compose([
    transforms.Resize((336, 336)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])
def get_audio_clap(path_to_audio, format="mp3", padding="repeatpad", truncation="fusion"):
    """Load an audio source and convert it into inputs for the CLAP sound model.

    The audio is decoded with ``torchaudio.load``, averaged over its first
    axis (the channel axis in torchaudio's default channel-first layout, so
    the result is a single mono track), resampled to ``SAMPLE_RATE`` and
    finally passed through ``sound_processor``.

    Args:
        path_to_audio: Audio source forwarded unchanged to ``torchaudio.load``.
        format: Format hint forwarded to ``torchaudio.load``.
        padding: Padding strategy forwarded to ``sound_processor``.
        truncation: Truncation strategy forwarded to ``sound_processor``.

    Returns:
        The object produced by ``sound_processor`` for the resampled track,
        requested as PyTorch tensors (``return_tensors="pt"``).
    """
    waveform, source_rate = torchaudio.load(path_to_audio, format=format)
    mono = waveform.mean(axis=0)
    resampled = torchaudio.functional.resample(
        mono, orig_freq=source_rate, new_freq=SAMPLE_RATE
    )
    return sound_processor(
        audios=resampled,
        sampling_rate=SAMPLE_RATE,
        max_length_s=10,
        return_tensors="pt",
        padding=padding,
        truncation=truncation,
    )
# ────────────────────────── helpers ───────────────────────────────────
@torch.no_grad()
def _encode_ground(img_pil: Image.Image) -> torch.Tensor:
    """Embed a ground-level photo with the BioCLIP model.

    Args:
        img_pil: Ground-level image as a PIL image. Any mode is accepted;
            non-RGB images are converted to RGB before preprocessing.

    Returns:
        The first element of ``bio_model``'s output for a batch containing
        only this image (the image embeddings).
    """
    # img_transform normalizes with 3-channel statistics, so RGBA, grayscale
    # or palette images would fail inside Normalize. RGB inputs are untouched.
    if img_pil.mode != "RGB":
        img_pil = img_pil.convert("RGB")
    # unsqueeze(0) adds the batch dimension the model expects.
    batch = img_transform(img_pil).unsqueeze(0).to(device)
    # The model returns several values; only the first one is needed here.
    img_embeds, *_ = bio_model(batch)
    return img_embeds
@torch.no_grad()
def _encode_text(text: str) -> torch.Tensor:
    """Embed a text query (e.g. a taxonomy string) with the BioCLIP model.

    Args:
        text: Text to tokenize with ``bio_tokenizer`` and encode.

    Returns:
        The second element of the 3-tuple returned by ``bio_model`` when it
        is called with only ``text`` (the text embeddings).
    """
    token_ids = bio_tokenizer(text).to(device)
    outputs = bio_model(text=token_ids)
    # Exactly three values are expected; the middle one holds the text embeddings.
    _, txt_embeds, _ = outputs
    return txt_embeds
@torch.no_grad()
def _encode_sat(img_pil: Image.Image) -> torch.Tensor:
    """Encode a satellite image into per-patch embeddings.

    Args:
        img_pil: Satellite image as a PIL image. Any mode is accepted;
            non-RGB images are converted to RGB before preprocessing.

    Returns:
        Whatever ``sat_model`` returns for a batch containing only this
        image. ``_similarity_heatmap`` consumes it as a 2-D matrix with one
        row per token.
    """
    # imo_transform normalizes with 3-channel statistics, so RGBA, grayscale
    # or palette images would fail inside Normalize. RGB inputs are untouched.
    if img_pil.mode != "RGB":
        img_pil = img_pil.convert("RGB")
    # unsqueeze(0) adds the batch dimension the model expects.
    batch = imo_transform(img_pil).unsqueeze(0).to(device)
    return sat_model(batch)
@torch.no_grad()
def _encode_sound(sound) -> torch.Tensor:
    """Embed an audio clip with the CLAP sound model.

    Args:
        sound: Audio source handed to ``get_audio_clap`` (and from there to
            ``torchaudio.load``).

    Returns:
        The model's ``audio_embeds``, L2-normalized along the last dimension.
    """
    clap_inputs = get_audio_clap(sound)
    # Move every processor output onto the same device as the model.
    on_device = {name: value.to(device) for name, value in clap_inputs.items()}
    raw_embeds = sound_model(**on_device).audio_embeds
    return torch.nn.functional.normalize(raw_embeds, dim=-1)
def _similarity_heatmap(query: torch.Tensor, patches: torch.Tensor) -> np.ndarray:
    """Score every satellite patch against a query embedding.

    The query/patch dot products are scaled by ``logit_scale`` and squashed
    with a sigmoid. The first row (CLS token) is dropped and the remaining
    scores are laid out on a square grid, so the number of remaining patches
    is expected to be a perfect square.

    Args:
        query: Query embedding(s); assumed to be a single row, shape (1, dim)
            - TODO confirm for other callers.
        patches: 2-D matrix of token embeddings, CLS token first.

    Returns:
        A ``(side, side)`` NumPy array of similarity scores.
    """
    scaled_logits = (query @ patches.t()) * logit_scale
    # One row per token after the transpose.
    token_scores = torch.sigmoid(scaled_logits.t())
    patch_scores = token_scores[1:].squeeze()  # drop CLS token
    side = int(np.sqrt(len(patch_scores)))
    grid = patch_scores.reshape(side, side)
    return grid.detach().cpu().numpy()
def _array_to_pil(arr: np.ndarray) -> Image.Image:
    """
    Render arr with viridis, automatically stretching its own min→max to 0→1
    so that the most-similar patches appear yellow.

    The array is first smoothed with a Gaussian blur of size ``blur_kernel``
    (skipped when the kernel is ``(0, 0)``). A flat heat-map, where max and
    min differ by less than 1e-6, is rendered as all zeros.

    Args:
        arr: 2-D heat-map to render.

    Returns:
        A PIL image decoded from the PNG rendering of the heat-map.
    """
    # Gaussian smoothing
    if blur_kernel != (0, 0):
        arr = cv2.GaussianBlur(arr, blur_kernel, 0)

    # --- contrast-stretch to local 0-1 range --------------------------
    arr_min, arr_max = float(arr.min()), float(arr.max())
    span = arr_max - arr_min
    if span < 1e-6:  # avoid /0 when the heat-map is flat
        arr_scaled = np.zeros_like(arr)
    else:
        arr_scaled = (arr - arr_min) / span
    # ------------------------------------------------------------------

    buf = io.BytesIO()
    fig, ax = plt.subplots(figsize=(2.6, 2.6), dpi=96)
    try:
        ax.imshow(arr_scaled, cmap="viridis", vmin=0.0, vmax=1.0)
        ax.axis("off")
        # Operate on this figure explicitly instead of pyplot's implicit
        # "current figure", which is shared global state.
        fig.tight_layout(pad=0)
        fig.savefig(buf, format="png", bbox_inches="tight", pad_inches=0)
    finally:
        # Always release the figure, even if rendering fails, so figures do
        # not accumulate in pyplot's registry across requests.
        plt.close(fig)
    buf.seek(0)
    return Image.open(buf)
# ────────────────────────── main inference ────────────────────────────
# integration with ZeroGPU on hf
@spaces.GPU
def process(
    sat_img: Image.Image | None,
    taxonomy: str,
    ground_img: Image.Image | None,
    sound: torch.Tensor | None,
) -> tuple[Image.Image | None, Image.Image | None, Image.Image | None]:
    """Build similarity heat-maps over a satellite image for each given query.

    The satellite image is encoded into per-patch embeddings once; every
    query modality that was supplied (ground photo, taxonomy text, sound) is
    then encoded and compared against those patches.

    Args:
        sat_img: Satellite image to search over. When ``None`` nothing is
            computed.
        taxonomy: Text query; ignored when empty, whitespace-only or ``None``.
        ground_img: Optional ground-level photo query.
        sound: Optional audio query. It is forwarded to ``_encode_sound``,
            which loads it via ``torchaudio.load`` - NOTE(review): this looks
            like a file path rather than a tensor; confirm against the UI and
            fix the annotation if so.

    Returns:
        ``(heat_ground, heat_text, heat_sound)``: one rendered heat-map per
        modality, with ``None`` for every modality that was not supplied.
        Always a 3-tuple, including when ``sat_img`` is missing.
    """
    heat_ground, heat_text, heat_sound = None, None, None
    if sat_img is None:
        # Return one value per heat-map slot, matching the normal path, so
        # the caller always receives the same number of outputs.
        return heat_ground, heat_text, heat_sound

    patches = _encode_sat(sat_img)

    if ground_img is not None:
        q_img = _encode_ground(ground_img)
        heat_ground = _array_to_pil(_similarity_heatmap(q_img, patches))

    # A missing text value is treated the same as an empty one.
    taxonomy = (taxonomy or "").strip()
    if taxonomy:
        q_txt = _encode_text(taxonomy)
        heat_text = _array_to_pil(_similarity_heatmap(q_txt, patches))

    if sound is not None:
        q_sound = _encode_sound(sound)
        heat_sound = _array_to_pil(_similarity_heatmap(q_sound, patches))

    return heat_ground, heat_text, heat_sound
# ────────────────────────── Gradio UI ─────────────────────────────────
with gr.Blocks(title="Search-TTA", theme=gr.themes.Base()) as demo:
gr.Markdown(
"""
# Search-TTA: A Multimodal Test-Time Adaptation Framework for Visual Search in the Wild Demo
Project Website
[Work in Progress]
"""
)
# with gr.Row():
# gr.Markdown(
# """
#