derektan committed on
Commit 4f09ecf · Parent: 66c5745

Init new app to handle planning. Fresh import of commit 27fe831777c12b25e504dd14e5b661742bdecce6 from VLM-Search.

.gitignore CHANGED
@@ -1 +1,146 @@
1
- **/__pycache__/
1
+ # Additional Stuff
2
+ !train/
3
+ train/*
4
+ !train/saved
5
+
6
+ !model/
7
+ model/*
8
+ !model/saved
9
+
10
+ !inference/
11
+ inference/*
12
+ !inference/saved
13
+
14
+ !gifs/
15
+ gifs/*
16
+ !gifs/saved
17
+
18
+ # Byte-compiled / optimized / DLL files
19
+ __pycache__/
20
+ *.py[cod]
21
+ *$py.class
22
+
23
+ # C extensions
24
+ *.so
25
+
26
+ # Distribution / packaging
27
+ .Python
28
+ build/
29
+ develop-eggs/
30
+ dist/
31
+ downloads/
32
+ eggs/
33
+ .eggs/
34
+ lib/
35
+ lib64/
36
+ parts/
37
+ sdist/
38
+ var/
39
+ wheels/
40
+ pip-wheel-metadata/
41
+ share/python-wheels/
42
+ *.egg-info/
43
+ .installed.cfg
44
+ *.egg
45
+ MANIFEST
46
+
47
+ # PyInstaller
48
+ # Usually these files are written by a python script from a template
49
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
50
+ *.manifest
51
+ *.spec
52
+
53
+ # Installer logs
54
+ pip-log.txt
55
+ pip-delete-this-directory.txt
56
+
57
+ # Unit test / coverage reports
58
+ htmlcov/
59
+ .tox/
60
+ .nox/
61
+ .coverage
62
+ .coverage.*
63
+ .cache
64
+ nosetests.xml
65
+ coverage.xml
66
+ *.cover
67
+ *.py,cover
68
+ .hypothesis/
69
+ .pytest_cache/
70
+
71
+ # Translations
72
+ *.mo
73
+ *.pot
74
+
75
+ # Django stuff:
76
+ *.log
77
+ local_settings.py
78
+ db.sqlite3
79
+ db.sqlite3-journal
80
+
81
+ # Flask stuff:
82
+ instance/
83
+ .webassets-cache
84
+
85
+ # Scrapy stuff:
86
+ .scrapy
87
+
88
+ # Sphinx documentation
89
+ docs/_build/
90
+
91
+ # PyBuilder
92
+ target/
93
+
94
+ # Jupyter Notebook
95
+ .ipynb_checkpoints
96
+
97
+ # IPython
98
+ profile_default/
99
+ ipython_config.py
100
+
101
+ # pyenv
102
+ .python-version
103
+
104
+ # pipenv
105
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
106
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
107
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
108
+ # install all needed dependencies.
109
+ #Pipfile.lock
110
+
111
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
112
+ __pypackages__/
113
+
114
+ # Celery stuff
115
+ celerybeat-schedule
116
+ celerybeat.pid
117
+
118
+ # SageMath parsed files
119
+ *.sage.py
120
+
121
+ # Environments
122
+ .env
123
+ .venv
124
+ env/
125
+ venv/
126
+ ENV/
127
+ env.bak/
128
+ venv.bak/
129
+
130
+ # Spyder project settings
131
+ .spyderproject
132
+ .spyproject
133
+
134
+ # Rope project settings
135
+ .ropeproject
136
+
137
+ # mkdocs documentation
138
+ /site
139
+
140
+ # mypy
141
+ .mypy_cache/
142
+ .dmypy.json
143
+ dmypy.json
144
+
145
+ # Pyre type checker
146
+ .pyre/
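The added `.gitignore` uses the standard whitelist pattern: re-include a directory (`!train/`), ignore everything inside it (`train/*`), then re-include a single subfolder (`!train/saved`), repeated for `model/`, `inference/` and `gifs/`. A minimal sketch of how to verify the effect with `git check-ignore` (the example paths are hypothetical; a printed rule means the path is ignored):

```python
import subprocess

# Under the rules above, files directly under train/ should be ignored,
# while anything inside train/saved/ remains trackable.
for path in ["train/run_001/ckpt.pt", "train/saved/best.ckpt", "gifs/debug.gif"]:
    result = subprocess.run(
        ["git", "check-ignore", "-v", path],
        capture_output=True, text=True,
    )
    rule = result.stdout.strip()
    print(f"{path}: {rule or 'not ignored'}")
```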
Maps/flair_real_maps/envs_val_trained_clss/MSK_000310_great_horned_owl.png ADDED

Git LFS Details

  • SHA256: 7af11bcef1972b7e047f53b597fef2a332d82c7feceb21aac6e14a57469c436b
  • Pointer size: 129 Bytes
  • Size of remote file: 2.34 kB
Maps/flair_real_maps/masks_val_trained_clss/MSK_000310_great_horned_owl.png ADDED

Git LFS Details

  • SHA256: 4e2c4a5be42c630a1ea5f65300e9d9a1eeed746be89befd8e1adde17b80d18e5
  • Pointer size: 129 Bytes
  • Size of remote file: 6.13 kB
Maps/flair_real_maps_lisa_pred/finetuned_LISA_v3_original_losses/flair_lisa_soft_masks_trained_clss_v3/MSK_000310_great_horned_owl.png ADDED

Git LFS Details

  • SHA256: 318773e2c18275d84b5145d7e69836baa0bedd833f44b49f98e6619357677cff
  • Pointer size: 130 Bytes
  • Size of remote file: 75.9 kB
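The three PNGs above are tracked with Git LFS: the repository itself stores only a small text pointer (the "Pointer size" of roughly 129-130 bytes), while the actual image (the "Size of remote file") lives on the LFS server. A minimal sketch of what such a pointer contains and how to read it; the oid is the SHA256 listed above for the first mask, and the byte size is a stand-in since the page only reports ~2.34 kB:

```python
# Git LFS pointer files follow the spec at https://git-lfs.github.com/spec/v1.
# The size value below is hypothetical; the commit page shows only "2.34 kB".
pointer_text = """version https://git-lfs.github.com/spec/v1
oid sha256:7af11bcef1972b7e047f53b597fef2a332d82c7feceb21aac6e14a57469c436b
size 2340
"""

def parse_lfs_pointer(text: str) -> dict:
    # Each line is "<key> <value>"; keep the oid and the remote file size in bytes.
    fields = dict(line.split(" ", 1) for line in text.strip().splitlines())
    return {"oid": fields["oid"], "size_bytes": int(fields["size"])}

print(parse_lfs_pointer(pointer_text))
```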
Taxabind/Taxabind/SatBind/clip_seg_tta.py ADDED
@@ -0,0 +1,933 @@
1
+ from itertools import cycle
2
+ import sys
3
+ import os
4
+
5
+ import cv2
6
+
7
+ # Ensure the correct directory is at the front of sys.path
8
+ sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
9
+ # Remove cached 'model' module to force reloading from the correct path
10
+ if "model" in sys.modules:
11
+ del sys.modules["model"]
12
+
13
+ import json
14
+ import glob
15
+ from datetime import datetime
16
+ import time
17
+
18
+ import torch
19
+ import numpy as np
20
+ from PIL import Image
21
+ from tqdm import tqdm
22
+ from matplotlib import pyplot as plt
23
+ from torch.utils.data import DataLoader
24
+ from torchvision.transforms import v2
25
+
26
+ import open_clip
27
+ from dataset import SatNatDataset
28
+ from model import SatBind
29
+ from transformers import CLIPVisionModelWithProjection
30
+ from kmeans_clustering import CombinedSilhouetteInertiaClusterer
31
+
32
+ # Just to import SoundBind (1 dir back)
33
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
34
+ from SoundBind.model import AudioBind
35
+
36
+ # import matplotlib
37
+ # matplotlib.use("Agg") # <-- key line to avoid tkinter dependency
38
+
39
+ class ClipSegTTA:
40
+ def __init__(
41
+ self,
42
+ img_dir: str,
43
+ imo_dir: str,
44
+ json_path: str,
45
+ sat_to_img_ids_json_path: str,
46
+ patch_size: int,
47
+ sat_checkpoint_path: str,
48
+ sample_index: int = 0,
49
+ blur_kernel = (5,5), # (0,0) for no gaussian blur
50
+ batch_size: int = 1,
51
+ num_workers: int = 1,
52
+ device: str = "cuda",
53
+ sat_to_img_ids_json_is_train_dict: bool = True,
54
+ tax_to_filter_val: str = "",
55
+ load_model: bool = True,
56
+ initial_modality: str = "image",
57
+ sound_data_path: str = None,
58
+ sound_checkpoint_path: str = None,
59
+ ):
60
+ """
61
+ Initialize the ClipSegTTA class with the required parameters.
62
+
63
+ :param img_dir: Path to the ground images directory.
64
+ :param imo_dir: Path to the satellite images directory.
65
+ :param json_path: Path to the iNat JSON file (full dataset).
66
+ :param sample_index: Index of the validation sample loaded first (can be changed later via reset()).
67
+ :param sat_to_img_ids_json_path: Path to the mapping from satellite image IDs to ground image IDs.
68
+ :param patch_size: Size of each satellite patch (e.g. 14).
69
+ :param sat_checkpoint_path: Path to the SatBind checkpoint (CKPT file).
70
+ :param batch_size: Batch size for loading data (default=1).
71
+ :param num_workers: Number of workers for data loading (default=1).
72
+ :param device: Device to use, e.g., 'cuda' or 'cpu' (default='cuda').
73
+ """
74
+
75
+ self.img_dir = img_dir
76
+ self.imo_dir = imo_dir
77
+ self.json_path = json_path
78
+ # self.sat_filtered_json_path = sat_filtered_json_path
79
+ self.sat_to_img_ids_json_path = sat_to_img_ids_json_path
80
+ self.patch_size = patch_size
81
+ self.sat_checkpoint_path = sat_checkpoint_path
82
+ self.sample_index = sample_index # Can be overridden by the reset() function
83
+ self.blur_kernel = blur_kernel
84
+ self.batch_size = batch_size
85
+ self.num_workers = num_workers
86
+ self.device = device
87
+ self.sat_to_img_ids_json_is_train_dict = sat_to_img_ids_json_is_train_dict
88
+ self.tax_to_filter_val = tax_to_filter_val
89
+ self.load_model = load_model
90
+ self.initial_modality = initial_modality
91
+ self.sound_data_path = sound_data_path
92
+ self.sound_checkpoint_path = sound_checkpoint_path
93
+
94
+ # Prepare the dataset
95
+ start_time = time.time()
96
+ self.load_data()
97
+ print(f"Dataset loaded in {(time.time()-start_time):.2f}s.")
98
+
99
+ # Load the global model (original/frozen checkpoint)
100
+ if self.load_model:
101
+ start_time = time.time()
102
+ self.load_global_model()
103
+ self.tokenizer = open_clip.get_tokenizer("hf-hub:imageomics/bioclip")
104
+ print(f"Global model loaded in {(time.time()-start_time):.2f}s.")
105
+
106
+ # Create the local model that will be adapted for TTA
107
+ if self.load_model:
108
+ self.model_local = SatBind(train_dataset=None, val_dataset=None)
109
+ self.model_local.to(self.device)
110
+ self.model_local.eval()
111
+
112
+ # Load sound model if provided
113
+ if self.sound_checkpoint_path:
114
+ self.sound_model = AudioBind.load_from_checkpoint(sound_checkpoint_path, train_dataset=None, val_dataset=None)
115
+ self.sound_model.to(self.device)
116
+ self.sound_model.eval()
117
+
118
+ # Params
119
+ self.reset(sample_idx=self.sample_index)
120
+
121
+ # for idx, related_img_path in enumerate(self.related_imgs_paths):
122
+ # self.visualize_heatmap(
123
+ # step=0,
124
+ # img_path_viz=related_img_path,
125
+ # imo_path_viz=self.imo_path,
126
+ # patch_idx_viz=[self.patch_idx],
127
+ # patch_is_pos=[True],
128
+ # species_name=self.species_name
129
+ # )
130
+
131
+ self.clip_inference_time = 0.0
132
+ self.tta_time = 0.0
133
+
134
+
135
+ def load_data(self):
136
+ """Load or initialize the dataset."""
137
+ self.dataset = SatNatDataset(
138
+ img_dir=self.img_dir,
139
+ imo_dir=self.imo_dir,
140
+ json_path=self.json_path,
141
+ # sat_filtered_json_path=self.sat_filtered_json_path,
142
+ sat_to_img_ids_json_path=self.sat_to_img_ids_json_path,
143
+ sound_data_path=self.sound_data_path,
144
+ patch_size=self.patch_size,
145
+ mode="val",
146
+ get_img_path=True,
147
+ sat_to_img_ids_json_is_train_dict=self.sat_to_img_ids_json_is_train_dict,
148
+ tax_to_filter_val=self.tax_to_filter_val
149
+ )
150
+
151
+ def reset(self, sample_idx):
152
+ """Reset the parameters & local model for the current sample."""
153
+ if self.load_model:
154
+ self.reset_local_model() # Reset to global weights as init
155
+
156
+ self.img_paths, self.imo_path, self.imgs, self.imo, self.sounds, self.sound_ids, self.species_name, self.target_positions, self.gt_mask_name = self.dataset.get_search_ds_data(sample_idx)
157
+ self.imgs = self.imgs.to(self.device)
158
+ # self.img_path = self.img_paths[0] # Select 1st img as query
159
+ # self.img = self.imgs[0] # Select 1st img as query
160
+ # self.img = self.img.to(self.device)
161
+ # self.imo = self.imo.to(self.device)
162
+ self.tgts_gt_score = None
163
+ if self.load_model:
164
+ self.heatmap, self.heatmap_unnormalized, self.heatmap_unnormalized_initial, self.patch_embeds = None, None, None, None
165
+ img = self.imgs[0].unsqueeze(0).to(self.device)
166
+ imo = self.imo.unsqueeze(0).to(self.device)
167
+ sound = self.sounds[0].to(self.device) if self.sounds != [] else None
168
+ txt = [self.species_name]
169
+ self.generate_heatmap(img, imo, txt, sound=sound, modality=self.initial_modality)
170
+
171
+ # Find avg heatmap score for target positions (index target into heatmap)
172
+ scores = []
173
+ imo_orig = Image.open(self.imo_path)
174
+ for pos in self.target_positions:
175
+ row_trans = int(pos[0] * self.heatmap.shape[0] / imo_orig.size[0])
176
+ col_trans = int(pos[1] * self.heatmap.shape[1] / imo_orig.size[1])
177
+ # print("row_trans, col_trans: ", row_trans, col_trans)
178
+ scores.append(self.heatmap[row_trans, col_trans])
179
+ self.tgts_gt_score = np.mean(scores)
180
+ # print("scores: ", scores)
181
+ # print("self.tgts_gt_score: ", self.tgts_gt_score)
182
+
183
+
184
+ # print("Sample reset for TTA: ", self.species_name)
185
+
186
+ def load_global_model(self):
187
+ """Load the global SatBind model from checkpoint, move to device, and eval."""
188
+ self.model_global = SatBind.load_from_checkpoint(
189
+ self.sat_checkpoint_path, train_dataset=None, val_dataset=None
190
+ )
191
+ self.model_global = self.model_global.to(self.device)
192
+ self.model_global.eval()
193
+
194
+ def reset_local_model(self):
195
+ """
196
+ Reset the local model to match the global model's parameters
197
+ and freeze/unfreeze layers for TTA.
198
+ """
199
+ start_time = time.time()
200
+ with torch.no_grad():
201
+ for param_global, param_local in zip(
202
+ self.model_global.parameters(), self.model_local.parameters()
203
+ ):
204
+ param_local.data.copy_(param_global.data)
205
+
206
+ # Freeze everything except the satellite encoder & custom projection
207
+ for name, param in self.model_local.named_parameters():
208
+ if "imo_encoder" in name or "visual_projection_custom" in name:
209
+ param.requires_grad = True
210
+ else:
211
+ param.requires_grad = False
212
+
213
+ self.model_local.eval()
214
+ # print(f"Local model reset in {(time.time()-start_time):.2f}s.")
215
+
216
+
217
+ def execute_tta(
218
+ self,
219
+ patch_indices: list,
220
+ patch_is_pos: list,
221
+ pos_sample_weight: list,
222
+ neg_sample_weight: list,
223
+ tta_steps: int = 10,
224
+ lr: float = 2e-6,
225
+ modality: str = "image", # image, text, or combined
226
+ query_variety: bool = False, # Whether to use different query images
227
+ target_found_idxs: list = [],
228
+ reset_weights: bool = True,
229
+ num_viz_steps: int = 1,
230
+ viz_heatmap: bool = False,
231
+ ):
232
+ """
233
+ Run test-time adaptation using the local model. The local model is first
234
+ reset to the global weights. After TTA, the global model remains
235
+ unchanged; only the local model is updated.
236
+
237
+ :param patch_indices: Indices of observed satellite patches (the [CLS] offset is added internally).
238
+ :param tta_steps: Number of test-time adaptation steps.
239
+ :param num_viz_steps: Visualize heatmap after every 'num_viz_steps' steps.
240
+ :param viz_heatmap: If True, perform visualization. If False, skip plotting.
241
+ """
242
+
243
+ ### Option 1: SAMPLE FROM DATASET
244
+ # 1) Reset the local model to global weights
245
+ if reset_weights:
246
+ self.reset_local_model()
247
+
248
+ # 2) Prepare the sample(s) for TTA
249
+ # print("target_found_idxs: ", target_found_idxs)
250
+ if query_variety:
251
+ indices = torch.tensor(target_found_idxs).to(dtype=torch.long).to(self.device)
252
+ img = self.imgs[indices].to(self.device)
253
+ # print("~variety")
254
+ elif (self.initial_modality == "text" or self.initial_modality == "sound") and modality == "combined" and len(target_found_idxs) == 0:
255
+ indices = torch.tensor(target_found_idxs).to(dtype=torch.long).to(self.device)
256
+ img = self.imgs[indices].to(self.device)
257
+ # print("~empty")
258
+ else:
259
+ img = self.imgs[0].unsqueeze(0).to(self.device)
260
+ # print("~single")
261
+ imo = self.imo.unsqueeze(0).to(self.device) # vectorize in shared_step to make imo uniform (faster)
262
+ txt = [self.species_name]
263
+ sound = self.sounds[0].to(self.device) if self.sounds != [] else None
264
+ patch_indices = [idx+1 for idx in patch_indices] # BUGFIX: Consider the [CLS] token offset
265
+ patch_idx = torch.tensor(patch_indices).to(self.device)
266
+
267
+ # print("img.shape: ", img.shape)
268
+ # print("imo.shape: ", imo.shape)
269
+ # print("patch_idx.shape: ", patch_idx.shape)
270
+ # print("species_txt: ", txt)
271
+ # ---------------------------------------------------------------------
272
+
273
+
274
+ # 5) Set up optimizer (only for sat branch) & AMP scaler
275
+ optimizer = torch.optim.Adam(
276
+ [p for p in self.model_local.parameters() if p.requires_grad], lr=lr
277
+ )
278
+ # scaler = torch.cuda.amp.GradScaler(init_scale=1000)
279
+
280
+ # print(
281
+ # f"Starting TTA (patch_idx={patch_idx[0].item()}) for {tta_steps} steps. "
282
+ # f"Visualization: {'ON' if viz_heatmap else 'OFF'} every {num_viz_steps} step(s)."
283
+ # )
284
+
285
+ start_time = time.time()
286
+
287
+ # 6) TTA loop
288
+ for step in range(tta_steps):
289
+ # # with torch.cuda.amp.autocast():
290
+ batch_size = imo.shape[0]
291
+
292
+ # Query embeds
293
+ query_embeds = self.generate_query_embeds(img, imo, txt, sound=sound, modality=modality)
294
+
295
+ # Sat Embeds
296
+ imo_embeds = self.model_local.imo_encoder(imo).last_hidden_state # (batch, Patches, hidden_dim)
297
+ imo_embeds = imo_embeds[torch.arange(batch_size), patch_idx] # (batch, hidden_dim)
298
+ imo_embeds = self.model_local.visual_projection_custom(imo_embeds) # (batch_size, proj_dim)
299
+ imo_embeds = torch.nn.functional.normalize(imo_embeds, dim=-1)
300
+
301
+ # Compute Similarity Loss
302
+ logit_scale = self.model_local.logit_scale.exp()
303
+ similarity = imo_embeds @ query_embeds.t() * logit_scale
304
+ # target = torch.tensor(patch_is_pos, dtype=torch.float32, device=similarity.device)
305
+ # per_sample_weight = torch.tensor(per_sample_weight, dtype=torch.float32, device=similarity.device)
306
+ # criterion = torch.nn.BCEWithLogitsLoss(weight=per_sample_weight) # Includes sigmoid activation internally
307
+ # loss = criterion(similarity.squeeze(), target)
308
+ # # NOTE: Equivalent loss scaling
309
+ # # criterion = torch.nn.BCEWithLogitsLoss(reduction="none")
310
+ # # loss = criterion(similarity.squeeze(), target)
311
+ # # loss = loss * per_sample_weight
312
+ # # loss = loss.mean()
313
+
314
+ # loss = 0
315
+ # patch_probs = similarity.squeeze()
316
+ # for probs, counts in zip(patch_probs, patch_is_pos):
317
+ # print("patch_probs.shape: ", patch_probs.shape)
318
+ # delta_loss = (probs - counts * torch.log(probs + 1e-6))
319
+ # print("delta_loss.shape: ", delta_loss.shape)
320
+ # loss += delta_loss
321
+
322
+ # Negative log-likelihood loss for a spatial Poisson point process
323
+ patch_probs = similarity.squeeze().sigmoid()
324
+ counts = torch.tensor(patch_is_pos, dtype=torch.float32, device=similarity.device)
325
+ pos_weights = torch.tensor(pos_sample_weight, dtype=torch.float32, device=similarity.device)
326
+ neg_weights = torch.tensor(neg_sample_weight, dtype=torch.float32, device=similarity.device)
327
+ # loss = (neg_weight * patch_probs - pos_weight * counts * torch.log(patch_probs + 1e-6))
328
+ loss = (neg_weights * patch_probs - pos_weights * counts * torch.log(patch_probs + 1e-6))
329
+ # print("patch_probs: ", patch_probs)
330
+ # print("counts: ", counts)
331
+ # print("pos_weights: ", pos_weights)
332
+ # print("neg_weights: ", neg_weights)
333
+ # print("loss: ", loss)
334
+ loss = loss.sum()
335
+
336
+ # # Print loss info
337
+ # print(f"Step {step:03d}: CLIP Loss = {loss.item():.4f}")
338
+
339
+ # # Backprop and update
340
+ # NOTE: No need for scaler since not using mixed precision (i.e. autocast)
341
+ # optimizer.zero_grad()
342
+ # scaler.scale(loss).backward()
343
+ # scaler.step(optimizer)
344
+ # scaler.update()
345
+ optimizer.zero_grad()
346
+ loss.backward()
347
+ optimizer.step()
348
+
349
+ self.tta_time = time.time() - start_time
350
+ # print("self.tta_time: ", self.tta_time)
351
+
352
+ # Visualization every 'num_viz_steps' steps (if enabled)
353
+ if (step + 1) % num_viz_steps == 0 and viz_heatmap:
354
+ # Visualize only the first sample in the batch
355
+ # self.generate_heatmap(img[0], imo[0])
356
+ self.generate_heatmap(img, imo, txt, sound=sound, modality=modality)
357
+ self.visualize_heatmap(
358
+ step=step,
359
+ img_path_viz=self.img_paths[0], # Viz 1st image
360
+ imo_path_viz=self.imo_path,
361
+ patch_idx_viz=patch_idx,
362
+ patch_is_pos=patch_is_pos,
363
+ species_name=self.species_name
364
+ )
365
+
366
+ # Save final heatmap after TTA steps
367
+ self.generate_heatmap(img, imo, txt, sound=sound, modality=modality)
368
+
369
+
370
+ def generate_query_embeds(self, img, imo, txt, sound=None, modality="image"):
371
+
372
+ # Query Embeds
373
+ if modality == "image":
374
+ # print("~Image modality")
375
+ query_embeds, *_ = self.model_local.bio_model(img) # (batch_size, proj_dim)
376
+ if query_embeds.shape[0] > 1:
377
+ query_embeds = query_embeds.mean(dim=0, keepdim=True) # (1, proj_dim)
378
+ elif modality == "text" or (modality == "combined" and img.shape[0] == 0):
379
+ # print("~Text modality")
380
+ txt_tokenized = self.tokenizer(txt).to(imo.device)
381
+ _, query_embeds, _ = self.model_local.bio_model(text=txt_tokenized)
382
+ elif modality == "sound" or (modality == "combined" and img.shape[0] == 0):
383
+ # print("~Sound modality")
384
+ if sound is None:
385
+ print("!!!! Sound modality requires sound input !!!")
386
+ exit(1)
387
+ unnormalized_audio_embeds = self.sound_model.audio_encoder(sound)
388
+ query_embeds = torch.nn.functional.normalize(unnormalized_audio_embeds, dim=-1)
389
+ elif modality == "combined":
390
+ # print("~Combined modality")
391
+ img_embeds, *_ = self.model_local.bio_model(img)
392
+ if img_embeds.shape[0] > 1:
393
+ img_embeds = img_embeds.mean(dim=0, keepdim=True) # (1, proj_dim)
394
+ txt_tokenized = self.tokenizer(txt).to(imo.device)
395
+ _, txt_embeds, _ = self.model_local.bio_model(text=txt_tokenized)
396
+ query_embeds = (img_embeds + txt_embeds) / 2
397
+ else:
398
+ raise ValueError("Invalid modality")
399
+
400
+ return query_embeds
401
+
402
+
403
+ def generate_heatmap(self, img, imo, txt, sound=None, modality="image"):
404
+
405
+ start_time = time.time()
406
+
407
+ # Satellite encoder outputs
408
+ imo_embeds = self.model_local.imo_encoder(imo).last_hidden_state
409
+ # Apply custom projection
410
+ imo_embeds = self.model_local.visual_projection_custom(imo_embeds)
411
+ imo_embeds = torch.nn.functional.normalize(imo_embeds, dim=-1)
412
+ # Remove batch dimension -> (num_tokens, proj_dim)
413
+ imo_embeds = imo_embeds.squeeze(0)
414
+ self.patch_embeds = imo_embeds.clone()[1:].cpu().detach().numpy()
415
+
416
+ # # Ground image embedding (bio CLIP model)
417
+ # img_embeds, *_ = self.model_local.bio_model(img)
418
+ query_embeds = self.generate_query_embeds(img, imo, txt, sound=sound, modality=modality)
419
+
420
+ # Same logit scale as in SatBind
421
+ logit_scale = self.model_local.logit_scale.exp()
422
+ sim = query_embeds @ imo_embeds.t() * logit_scale
423
+ # Sigmoid to get similarity scores
424
+ scores = sim.t().sigmoid() # (num_tokens, 1)
425
+
426
+ # Exclude [CLS] token at index 0
427
+ score_no_cls = scores[1:].squeeze() # shape: (num_tokens-1,)
428
+ num_tokens = score_no_cls.shape[0]
429
+ side_dim = int(num_tokens**0.5)
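+ # e.g. for the 336px CLIP ViT-L/14 satellite encoder: 577 tokens = 1 [CLS] + 576 patches, so side_dim = 24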
430
+ sim_scores = score_no_cls.reshape(side_dim, side_dim).clone()
431
+ sim_scores = sim_scores.cpu().detach().numpy()
432
+
433
+ self.clip_inference_time = time.time() - start_time
434
+ # print("self.clip_inference_time: ", self.clip_inference_time)
435
+
436
+ # Gaussian smoothing
437
+ if self.blur_kernel != (0,0):
438
+ sim_scores = cv2.GaussianBlur(sim_scores, self.blur_kernel, 0)
439
+
440
+ # Normalize to expectation
441
+ self.heatmap_unnormalized = sim_scores
442
+ scale = len(self.target_positions) / (self.heatmap_unnormalized.sum())
443
+ self.heatmap_unnormalized *= scale
444
+ if self.heatmap_unnormalized_initial is None:
445
+ self.heatmap_unnormalized_initial = self.heatmap_unnormalized.copy()
446
+ # print("self.heatmap_unnormalized.sum(): ", self.heatmap_unnormalized.sum())
447
+
448
+ # Standard normalization to (0,1)
449
+ # self.heatmap = score_no_cls.reshape(side_dim, side_dim).clone()
450
+ # self.heatmap = self.heatmap.cpu().detach().numpy()
451
+ self.heatmap = sim_scores.copy()
452
+ self.heatmap = (self.heatmap - self.heatmap.min()) / (self.heatmap.max() - self.heatmap.min()) # normalize heatmap
453
+
454
+
455
+ def visualize_heatmap(
456
+ self,
457
+ step: int,
458
+ img_path_viz: str,
459
+ imo_path_viz: str,
460
+ patch_idx_viz: torch.Tensor,
461
+ patch_is_pos: list,
462
+ species_name: str
463
+ ):
464
+ """
465
+ Visualization function that plots the ground image, satellite image with
466
+ highlighted patch, and the learned heatmap.
467
+
468
+ :param step: Current TTA step (for labeling the plots).
469
+ :param img_path_viz: File path to the ground image.
470
+ :param imo_path_viz: File path to the satellite image.
471
+ :param patch_idx_viz: The patch index (tensor) being highlighted.
472
+ :param patch_is_pos: Per-patch flags; True marks a positive (target) patch.
+ :param species_name: Species name used as the query label in the figure title.
476
+ """
477
+
478
+ # Switch off gradients for visualization
479
+ with torch.no_grad():
480
+ side_dim = self.heatmap.shape[0]
481
+
482
+ # -----------------------------------------------------------------
483
+ # Highlight the patch in the satellite image
484
+ sat_img_orig = Image.open(imo_path_viz)
485
+ sat_highlight = np.array(
486
+ self.dataset.debug_imo_viz_transform(sat_img_orig.copy())
487
+ )
488
+
489
+ for idx, patch_idx in enumerate(patch_idx_viz):
490
+
491
+ # Because patch_idx includes the [CLS] offset, subtract 1
492
+ patch_idx_actual = patch_idx - 1
493
+
494
+ # Get dimensions (H x W)
495
+ H, W = sat_highlight.shape[0], sat_highlight.shape[1]
496
+
497
+ # Number of patches in each dimension
498
+ patches_per_col = W // self.patch_size
499
+ patches_per_row = H // self.patch_size
500
+
501
+ # Determine row/col in the patch grid
502
+ patch_row = patch_idx_actual // patches_per_col
503
+ patch_col = patch_idx_actual % patches_per_col # modulo by the number of patch columns (equal to patches_per_row for square inputs)
504
+
505
+ # Pixel boundaries
506
+ x_start = patch_col * self.patch_size
507
+ x_end = (patch_col + 1) * self.patch_size
508
+ y_start = patch_row * self.patch_size
509
+ y_end = (patch_row + 1) * self.patch_size
510
+
511
+ # Fill patch area with a grey color
512
+ # if sat_highlight.dtype == np.uint8:
513
+ # grey_value = 128
514
+ # else:
515
+ # grey_value = 0.5
516
+ # sat_highlight[y_start:y_end, x_start:x_end, :] = grey_value
517
+ # Blue fill for positive patches
518
+ if patch_is_pos[idx]:
519
+ sat_highlight[y_start:y_end, x_start:x_end, 0] = 0
520
+ sat_highlight[y_start:y_end, x_start:x_end, 1] = 0
521
+ sat_highlight[y_start:y_end, x_start:x_end, 2] = 255
522
+ # Red fill for negative patches
523
+ else:
524
+ sat_highlight[y_start:y_end, x_start:x_end, 0] = 255
525
+ sat_highlight[y_start:y_end, x_start:x_end, 1] = 0
526
+ sat_highlight[y_start:y_end, x_start:x_end, 2] = 0
527
+
528
+
529
+ # -----------------------------------------------------------------
530
+ # Plot results
531
+ fig, axes = plt.subplots(1, 3, figsize=(12, 6))
532
+ fig.suptitle(f"Query: {species_name}")
533
+
534
+ # Ground image
535
+ img_orig = Image.open(img_path_viz)
536
+ axes[0].imshow(img_orig)
537
+ axes[0].set_title("Ground Image")
538
+ axes[0].axis("off")
539
+
540
+ # Satellite image
541
+ axes[1].imshow(sat_highlight)
542
+ axes[1].set_title("Sat Image")
543
+ axes[1].axis("off")
544
+
545
+ # Heatmap
546
+ heatmap_np = self.heatmap_unnormalized
547
+ im = axes[2].imshow(heatmap_np, cmap="viridis")
548
+ axes[2].set_title(
549
+ f"Heatmap at TTA Step {step:03d} ({side_dim}x{side_dim})"
550
+ )
551
+ axes[2].axis("off")
552
+ fig.colorbar(im, ax=axes[2], fraction=0.046, pad=0.04)
553
+
554
+ plt.tight_layout()
555
+ plt.show()
556
+
557
+
558
+ if __name__ == "__main__":
559
+
560
+ # # (CHANGE ME!) PARAMS: VAL
561
+ # clip_seg_tta = ClipSegTTA(
562
+ # img_dir="/mnt/hdd/inat2021_ds/inat21",
563
+ # imo_dir="/mnt/hdd/inat2021_ds/sat_test_jpg_512px",
564
+ # json_path="/mnt/hdd/inat2021_ds/inat21_val.json",
565
+ # sat_filtered_json_path="/mnt/hdd/inat2021_ds/2_OTHERS/inat21_filtered_pixel_clip_val.json",
566
+ # sat_to_img_ids_json_path="/mnt/hdd/inat2021_ds/2_OTHERS/filtered_mapping_sat_to_img_ids_val.json",
567
+ # patch_size=14,
568
+ # sat_checkpoint_path="/home/user/Taxabind/TaxaBind/SatBind/checkpoints/BACKUP/512px/pixel_clip_512px_190225_CLIP-L-336/satbind-epoch=02-val_loss=2.40.ckpt", # /home/user/Taxabind/TaxaBind/SatBind/checkpoints/BACKUP/512px/pixel_clip_512px_160225_NO_DATASET_SHUFFLE/satbind-epoch=02-val_loss=2.26-BACKUP.ckpt
569
+ # sample_index = 45,
570
+ # batch_size=1,
571
+ # num_workers=1,
572
+ # device="cuda",
573
+ # )
574
+
575
+ # Run TTA on sample index 45, for 10 steps,
576
+ # visualizing every 2 steps, with heatmap enabled:
577
+
578
+ # # Step 1
579
+ # print("Executing step 1...")
580
+ # patch_indices = [422, 204, 45]
581
+ # patch_is_pos = [True, False, False]
582
+ # clip_seg_tta.execute_tta(patch_indices, patch_is_pos, tta_steps=10, neg_sample_weight_scale=1.0, num_viz_steps=2, viz_heatmap=True)
583
+
584
+ # # Step 2
585
+ # print("Executing step 2...")
586
+ # # patch_indices = [185, 89, 30] # for 256px
587
+ # patch_indices = [422, 204, 45, 423] # for 512px
588
+ # patch_is_pos = [True, False, False, True]
589
+ # clip_seg_tta.execute_tta(patch_indices, patch_is_pos, tta_steps=10, neg_sample_weight_scale=1.0, num_viz_steps=2, viz_heatmap=True)
590
+
591
+
592
+ # # (CHANGE ME!) PARAMS: TRAIN w/ TARGETS
593
+ # clip_seg_tta = ClipSegTTA(
594
+ # img_dir="/mnt/hdd/inat2021_ds/inat21",
595
+ # imo_dir="/mnt/hdd/inat2021_ds/sat_train_jpg_512px",
596
+ # json_path="/mnt/hdd/inat2021_ds/inat21_train.json",
597
+ # sat_filtered_json_path="/mnt/hdd/inat2021_ds/2_OTHERS/inat21_filtered_pixel_clip_train.json",
598
+ # # sat_to_img_ids_json_path="/mnt/hdd/inat2021_ds/2_OTHERS/filtered_mapping_sat_to_img_ids_train.json",
599
+ # sat_to_img_ids_json_path="/mnt/hdd/inat2021_ds/target_search_ds/OLD/taxon_sat_target_search_100x_per_10-20counts.json",
600
+ # patch_size=14,
601
+ # sat_checkpoint_path="/home/user/Taxabind/TaxaBind/SatBind/checkpoints/BACKUP/512px/pixel_clip_512px_190225_CLIP-L-336/satbind-epoch=02-val_loss=2.40.ckpt", # /home/user/Taxabind/TaxaBind/SatBind/checkpoints/BACKUP/512px/pixel_clip_512px_160225_NO_DATASET_SHUFFLE/satbind-epoch=02-val_loss=2.26-BACKUP.ckpt
602
+ # sample_index = 99,
603
+ # batch_size=1,
604
+ # num_workers=1,
605
+ # device="cuda",
606
+ # )
607
+
608
+ # # (CHANGE ME!) PARAMS: TRAIN w/ TARGETS
609
+ # clip_seg_tta = ClipSegTTA(
610
+ # img_dir="/mnt/hdd/inat2021_ds/inat21",
611
+ # imo_dir="/mnt/hdd/inat2021_ds/sat_train_jpg_512px",
612
+ # json_path="/mnt/hdd/inat2021_ds/inat21_train.json",
613
+ # # sat_filtered_json_path="/mnt/hdd/inat2021_ds/2_OTHERS/inat21_filtered_pixel_clip_train.json",
614
+ # sat_to_img_ids_json_path="/mnt/hdd/inat2021_ds/target_search_ds/NEW_3-20_Targets/FINAL_COUNTS_COMBINED_top4000_pos_top55000_neg/search_val_in.json",
615
+ # sound_data_path='/mnt/hdd/inat2021_ds/2_OTHERS/sound_train',
616
+ # patch_size=14,
617
+ # sat_checkpoint_path="/home/user/Taxabind/TaxaBind/SatBind/checkpoints/BACKUP/512px_search_ds_filtered/pixel_clip_512px_search_ds_180325_CLIP-L-336/satbind-epoch=02-val_loss=2.46-BACKUP.ckpt", # /home/user/Taxabind/TaxaBind/SatBind/checkpoints/BACKUP/512px/pixel_clip_512px_190225_CLIP-L-336/satbind-epoch=02-val_loss=2.40.ckpt
618
+ # sound_checkpoint_path = "/home/user/Taxabind/TaxaBind/SoundBind/checkpoints/BUGFIX_CLIP_TRAIN_CORRECT_NOT_VAL_OUT_TAX_FILTER_TGT_ONLY_v2_130625/soundbind-epoch=19-val_loss=3.96_BACKUP.ckpt",
619
+ # sample_index = 8, # 5,6,8
620
+ # batch_size=1,
621
+ # num_workers=1,
622
+ # device="cuda",
623
+ # sat_to_img_ids_json_is_train_dict=False,
624
+ # )
625
+
626
+ # (CHANGE ME!) PARAMS: TRAIN w/ TARGETS
627
+ clip_seg_tta = ClipSegTTA(
628
+ img_dir="/mnt/hdd/inat2021_ds/inat21",
629
+ imo_dir="/mnt/hdd/inat2021_ds/sat_train_jpg_512px",
630
+ json_path="/mnt/hdd/inat2021_ds/inat21_train.json",
631
+ # sat_filtered_json_path="/mnt/hdd/inat2021_ds/2_OTHERS/inat21_filtered_pixel_clip_train.json",
632
+ sat_to_img_ids_json_path="/mnt/hdd/inat2021_ds/2_OTHERS/sound_train/sound_image_pairs_filtered_v3_with_in_domain_taxs_150625/val_in_with_sound_ids_v3.json",
633
+ sound_data_path='/mnt/hdd/inat2021_ds/2_OTHERS/sound_train',
634
+ patch_size=14,
635
+ sat_checkpoint_path="/home/user/Taxabind/TaxaBind/SatBind/checkpoints/BACKUP/512px_search_ds_filtered/pixel_clip_512px_search_ds_180325_CLIP-L-336/satbind-epoch=02-val_loss=2.46-BACKUP.ckpt", # /home/user/Taxabind/TaxaBind/SatBind/checkpoints/BACKUP/512px/pixel_clip_512px_190225_CLIP-L-336/satbind-epoch=02-val_loss=2.40.ckpt
636
+ sound_checkpoint_path = "/home/user/Taxabind/TaxaBind/SoundBind/checkpoints/BUGFIX_CLIP_TRAIN_CORRECT_NOT_VAL_OUT_TAX_FILTER_TGT_ONLY_v2_130625/soundbind-epoch=19-val_loss=3.96_BACKUP.ckpt",
637
+ sample_index = 8, # 5,6,8
638
+ batch_size=1,
639
+ num_workers=1,
640
+ device="cuda",
641
+ sat_to_img_ids_json_is_train_dict=False,
642
+ initial_modality="sound"
643
+ )
644
+
645
+
646
+ ###########################################
647
+ # Scaling with list
648
+ ###########################################
649
+
650
+ # # print("Executing step 4b...")
651
+ # patch_indices = [422, 45]
652
+ # patch_is_pos = [True, False]
653
+ # # per_sample_weight = [1.0, 1.0]
654
+ # pos_sample_weight = 1.0
655
+ # neg_sample_weight = 1.0
656
+ # clip_seg_tta.execute_tta(
657
+ # patch_indices,
658
+ # patch_is_pos,
659
+ # pos_sample_weight,
660
+ # neg_sample_weight,
661
+ # tta_steps=10,
662
+ # num_viz_steps=2,
663
+ # viz_heatmap=True)
664
+
665
+ # print("Executing step 4b...")
666
+ patch_indices = [422, 32]
667
+ patch_is_pos = [True, False]
668
+ # per_sample_weight = [1.0, 1.0]
669
+ pos_sample_weight = 1.0
670
+ neg_sample_weight = 1.0
671
+ clip_seg_tta.execute_tta(
672
+ patch_indices,
673
+ patch_is_pos,
674
+ pos_sample_weight,
675
+ neg_sample_weight,
676
+ tta_steps=30,
677
+ num_viz_steps=2,
678
+ viz_heatmap=True,
679
+ modality="sound")
680
+
681
+ # # print("Executing step 4b...")
682
+ # patch_indices = [45, 422]
683
+ # patch_is_pos = [True, False]
684
+ # per_sample_weight = [1.0, 1.0]
685
+ # clip_seg_tta.execute_tta(
686
+ # patch_indices,
687
+ # patch_is_pos,
688
+ # tta_steps=20,
689
+ # per_sample_weight=per_sample_weight,
690
+ # num_viz_steps=2,
691
+ # viz_heatmap=True)
692
+
693
+ # # print("Executing step 4b...")
694
+ # patch_indices = [45, 422]
695
+ # patch_is_pos = [2, False]
696
+ # per_sample_weight = [0.1, 1.0]
697
+ # clip_seg_tta.execute_tta(
698
+ # patch_indices,
699
+ # patch_is_pos,
700
+ # tta_steps=20,
701
+ # per_sample_weight=per_sample_weight,
702
+ # num_viz_steps=2,
703
+ # viz_heatmap=True)
704
+
705
+ # # print("Executing step 4b...")
706
+ # patch_indices = [45, 422]
707
+ # patch_is_pos = [True, False]
708
+ # per_sample_weight = [1.0, 0.1]
709
+ # clip_seg_tta.execute_tta(
710
+ # patch_indices,
711
+ # patch_is_pos,
712
+ # tta_steps=10,
713
+ # per_sample_weight=per_sample_weight,
714
+ # num_viz_steps=2,
715
+ # viz_heatmap=True)
716
+
717
+ # # print("Executing step 4b...")
718
+ # patch_indices = [422, 424, 398, 400, 45, 47, 41, 43]
719
+ # patch_is_pos = [True, True, True, True, False, False, False, False]
720
+ # per_sample_weight = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
721
+ # clip_seg_tta.execute_tta(
722
+ # patch_indices,
723
+ # patch_is_pos,
724
+ # tta_steps=10,
725
+ # per_sample_weight=per_sample_weight,
726
+ # num_viz_steps=2,
727
+ # viz_heatmap=True)
728
+
729
+ # # print("Executing step 4b...")
730
+ # patch_indices = [45, 47, 41, 43, 422, 424, 398, 400]
731
+ # patch_is_pos = [True, True, True, True, False, False, False, False]
732
+ # per_sample_weight = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
733
+ # clip_seg_tta.execute_tta(
734
+ # patch_indices,
735
+ # patch_is_pos,
736
+ # tta_steps=10,
737
+ # per_sample_weight=per_sample_weight,
738
+ # num_viz_steps=2,
739
+ # viz_heatmap=True)
740
+
741
+ # # print("Executing step 4b...")
742
+ # patch_indices = [45, 47, 41, 43, 422, 424, 398, 400]
743
+ # patch_is_pos = [True, True, True, True, False, False, False, False]
744
+ # per_sample_weight = [1.0, 1.0, 1.0, 1.0, 0.1, 0.1, 0.1, 0.1]
745
+ # clip_seg_tta.execute_tta(
746
+ # patch_indices,
747
+ # patch_is_pos,
748
+ # tta_steps=10,
749
+ # per_sample_weight=per_sample_weight,
750
+ # num_viz_steps=2,
751
+ # viz_heatmap=True)
752
+
753
+ # # print("Executing step 4b...")
754
+ # patch_indices = [45, 47, 41, 43, 422, 424, 398, 400]
755
+ # patch_is_pos = [True, True, True, True, False, False, False, False]
756
+ # per_sample_weight = [0.1, 0.1, 0.1, 0.1, 1.0, 1.0, 1.0, 1.0]
757
+ # clip_seg_tta.execute_tta(
758
+ # patch_indices,
759
+ # patch_is_pos,
760
+ # tta_steps=10,
761
+ # per_sample_weight=per_sample_weight,
762
+ # num_viz_steps=2,
763
+ # viz_heatmap=True)
764
+
765
+ # # print("Executing step 4b...")
766
+ # patch_indices = [45, 47, 41, 43, 42, 44, 46, 40]
767
+ # patch_is_pos = [True, True, True, True, True, True, True, True]
768
+ # per_sample_weight = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
769
+ # clip_seg_tta.execute_tta(
770
+ # patch_indices,
771
+ # patch_is_pos,
772
+ # tta_steps=10,
773
+ # per_sample_weight=per_sample_weight,
774
+ # num_viz_steps=2,
775
+ # viz_heatmap=True)
776
+
777
+ # # print("Executing step 4b...")
778
+ # patch_indices = [45, 47, 41, 43, 422, 424, 398, 400]
779
+ # patch_is_pos = [True, True, True, True, False, False, False, False]
780
+ # per_sample_weight = [1.0, 1.0, 1.0, 1.0, 0.1, 0.1, 0.1, 0.1]
781
+ # clip_seg_tta.execute_tta(
782
+ # patch_indices,
783
+ # patch_is_pos,
784
+ # tta_steps=10,
785
+ # per_sample_weight=per_sample_weight,
786
+ # num_viz_steps=2,
787
+ # viz_heatmap=True)
788
+
789
+ # # print("Executing step 4b...")
790
+ # patch_indices = [554, 45, 422]
791
+ # patch_is_pos = [True, True, False]
792
+ # per_sample_weight = [1.0, 1.0, 1.0]
793
+ # clip_seg_tta.execute_tta(
794
+ # patch_indices,
795
+ # patch_is_pos,
796
+ # tta_steps=10,
797
+ # per_sample_weight=per_sample_weight,
798
+ # num_viz_steps=2,
799
+ # viz_heatmap=True)
800
+
801
+ # # print("Executing step 4b...")
802
+ # patch_indices = [554, 45, 422]
803
+ # patch_is_pos = [True, True, False]
804
+ # per_sample_weight = [1.0, 1.0, 0.1]
805
+ # clip_seg_tta.execute_tta(
806
+ # patch_indices,
807
+ # patch_is_pos,
808
+ # tta_steps=10,
809
+ # per_sample_weight=per_sample_weight,
810
+ # num_viz_steps=2,
811
+ # viz_heatmap=True)
812
+
813
+ # # print("Executing step 4b...")
814
+ # patch_indices = [554, 45, 422]
815
+ # patch_is_pos = [False, True, False]
816
+ # per_sample_weight = [1.0, 1.0, 1.0]
817
+ # clip_seg_tta.execute_tta(
818
+ # patch_indices,
819
+ # patch_is_pos,
820
+ # tta_steps=10,
821
+ # per_sample_weight=per_sample_weight,
822
+ # num_viz_steps=2,
823
+ # viz_heatmap=True)
824
+
825
+ # # print("Executing step 4b...")
826
+ # patch_indices = [554, 45, 422]
827
+ # patch_is_pos = [False, True, False]
828
+ # per_sample_weight = [0.0, 1.0, 0.8]
829
+ # clip_seg_tta.execute_tta(
830
+ # patch_indices,
831
+ # patch_is_pos,
832
+ # tta_steps=10,
833
+ # per_sample_weight=per_sample_weight,
834
+ # num_viz_steps=2,
835
+ # viz_heatmap=True)
836
+
837
+
838
+ ###########################################
839
+ # Text modality
840
+ # NOTE: Less aggressive vs image modality
841
+ ###########################################
842
+
843
+ # # print("Executing step 4b...")
844
+ # patch_indices = [45, 422]
845
+ # patch_is_pos = [True, False]
846
+ # per_sample_weight = [1.0, 1.0]
847
+ # clip_seg_tta.execute_tta(
848
+ # patch_indices,
849
+ # patch_is_pos,
850
+ # tta_steps=10,
851
+ # per_sample_weight=per_sample_weight,
852
+ # modality="text",
853
+ # num_viz_steps=2,
854
+ # viz_heatmap=True)
855
+
856
+ # # print("Executing step 4b...")
857
+ # patch_indices = [45, 422]
858
+ # patch_is_pos = [True, False]
859
+ # per_sample_weight = [0.1, 1.0]
860
+ # clip_seg_tta.execute_tta(
861
+ # patch_indices,
862
+ # patch_is_pos,
863
+ # tta_steps=10,
864
+ # per_sample_weight=per_sample_weight,
865
+ # modality="text",
866
+ # num_viz_steps=2,
867
+ # viz_heatmap=True)
868
+
869
+ # # print("Executing step 4b...")
870
+ # patch_indices = [45, 422]
871
+ # patch_is_pos = [True, False]
872
+ # per_sample_weight = [1.0, 0.1]
873
+ # clip_seg_tta.execute_tta(
874
+ # patch_indices,
875
+ # patch_is_pos,
876
+ # tta_steps=10,
877
+ # per_sample_weight=per_sample_weight,
878
+ # modality="text",
879
+ # num_viz_steps=2,
880
+ # viz_heatmap=True)
881
+
882
+
883
+ ###########################################
884
+ # Combined modality
885
+ # NOTE: Best of both worlds
886
+ ###########################################
887
+
888
+ # # print("Executing step 4b...")
889
+ # patch_indices = [45, 422]
890
+ # patch_is_pos = [True, False]
891
+ # per_sample_weight = [1.0, 1.0]
892
+ # clip_seg_tta.execute_tta(
893
+ # patch_indices,
894
+ # patch_is_pos,
895
+ # tta_steps=10,
896
+ # per_sample_weight=per_sample_weight,
897
+ # modality="combined",
898
+ # num_viz_steps=2,
899
+ # viz_heatmap=True)
900
+
901
+ # # print("Executing step 4b...")
902
+ # patch_indices = [45, 422]
903
+ # patch_is_pos = [True, False]
904
+ # per_sample_weight = [0.1, 1.0]
905
+ # clip_seg_tta.execute_tta(
906
+ # patch_indices,
907
+ # patch_is_pos,
908
+ # tta_steps=10,
909
+ # per_sample_weight=per_sample_weight,
910
+ # modality="combined",
911
+ # num_viz_steps=2,
912
+ # viz_heatmap=True)
913
+
914
+ # # print("Executing step 4b...")
915
+ # patch_indices = [45, 422]
916
+ # patch_is_pos = [True, False]
917
+ # per_sample_weight = [1.0, 0.1]
918
+ # clip_seg_tta.execute_tta(
919
+ # patch_indices,
920
+ # patch_is_pos,
921
+ # tta_steps=10,
922
+ # per_sample_weight=per_sample_weight,
923
+ # modality="combined",
924
+ # num_viz_steps=2,
925
+ # viz_heatmap=True)
926
+
927
+ ###########################################
928
+ # Real examples
929
+ ###########################################
930
+
931
+
932
+
933
+
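The TTA objective in execute_tta above is the negative log-likelihood of a spatial Poisson point process over satellite patches: the sigmoid similarity acts as the per-patch intensity, and patch_is_pos provides the observed counts, so each patch contributes intensity minus count times log-intensity (the constant log k! term is dropped), weighted by neg_sample_weight and pos_sample_weight. A minimal standalone sketch of that loss, with hypothetical toy values:

```python
import torch

def weighted_poisson_nll(patch_probs, counts, pos_w=1.0, neg_w=1.0, eps=1e-6):
    """Per-patch Poisson NLL (constant log(k!) dropped), summed over observed patches."""
    return (neg_w * patch_probs - pos_w * counts * torch.log(patch_probs + eps)).sum()

# Toy example: one observed positive patch and one observed empty patch.
probs = torch.tensor([0.9, 0.1])   # per-patch intensities (sigmoid similarities)
counts = torch.tensor([1.0, 0.0])  # targets found in each patch
print(weighted_poisson_nll(probs, counts))          # low loss: predictions agree with the counts
print(weighted_poisson_nll(probs.flip(0), counts))  # higher loss: probability mass on the wrong patch
```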
Taxabind/Taxabind/SatBind/config.py ADDED
@@ -0,0 +1,61 @@
1
+ from easydict import EasyDict as edict
2
+
3
+ config = edict()
4
+
5
+ # Pixel level CLIP training
6
+ config.img_dir = '/mnt/hdd/inat2021_ds/inat21'
7
+ config.imo_dir = '/mnt/hdd/inat2021_ds/sat_train_jpg_512px' # sat_train_jpg_256px, sat_train_jpg_512px
8
+ config.imo_dir_val = '/mnt/hdd/inat2021_ds/sat_test_jpg_512px' # sat_test_jpg_256px, sat_test_jpg_512px
9
+ config.train_json_path = '/mnt/hdd/inat2021_ds/inat21_train.json'
10
+ config.val_json_path = '/mnt/hdd/inat2021_ds/inat21_val.json' # no filter needed
11
+ # config.sat_to_img_ids_train_json_path = '/mnt/hdd/inat2021_ds/filtered_mapping_sat_to_img_ids_train.json'
12
+ config.sat_to_img_ids_train_json_path = '/mnt/hdd/inat2021_ds/target_search_ds/NEW_3-20_Targets/FINAL_COUNTS_COMBINED_top4000_pos_top55000_neg/SEARCH_DS_filtered_mapping_sat_to_img_ids_train_3-20counts.json'
13
+ config.sat_to_img_ids_val_json_path = '/mnt/hdd/inat2021_ds/filtered_mapping_sat_to_img_ids_val.json'
14
+ # config.filtered_train_json_path = '/mnt/hdd/inat2021_ds/inat21_filtered_pixel_clip_v2_train.json'
15
+ # config.filtered_val_json_path = '/mnt/hdd/inat2021_ds/inat21_filtered_pixel_clip_val.json' # no filter needed
16
+
17
+ # # Pixel level CLIP training (Toy example)
18
+ # config.img_dir = '../scripts/preprocess/expt/fake_img_dataset'
19
+ # config.imo_dir = '../scripts/preprocess/expt/sat_train_dataset'
20
+ # config.imo_dir_val = '../scripts/preprocess/expt/sat_val_dataset'
21
+ # config.train_json_path = '../scripts/preprocess/expt/inat2021_train_micro.json'
22
+ # config.val_json_path = '../scripts/preprocess/expt/inat2021_val_micro.json'
23
+ # config.sat_to_img_ids_train_json_path = '../scripts/preprocess/expt/sat_filtered/filtered_mapping_sat_to_img_ids_train_micro.json'
24
+ # config.sat_to_img_ids_val_json_path = '../scripts/preprocess/expt/sat_filtered/filtered_mapping_sat_to_img_ids_val_micro.json'
25
+ # # config.filtered_train_json_path = '../scripts/preprocess/expt/inat2021_train_micro.json'
26
+ # # config.filtered_val_json_path = '../scripts/preprocess/expt/inat2021_val_micro.json' # no filter needed
27
+
28
+ # # Image level CLIP training
29
+ # config.img_dir = '/mnt/hdd/inat2021_ds/inat21'
30
+ # config.imo_dir = '/mnt/hdd/inat2021_ds/sat_train_jpg'
31
+ # config.imo_dir_val = '/mnt/hdd/inat2021_ds/sat_test_jpg'
32
+ # config.train_json_path = '/mnt/hdd/inat2021_ds/inat21_filtered_img_clip_train.json'
33
+ # config.val_json_path = '/mnt/hdd/inat2021_ds/inat21_val.json' # no filter needed
34
+
35
+ # # Image level CLIP training (Toy example)
36
+ # config.img_dir = '../scripts/preprocess/expt/fake_img_dataset'
37
+ # config.imo_dir = '../scripts/preprocess/expt/sat_train_dataset'
38
+ # config.imo_dir_val = '../scripts/preprocess/expt/sat_val_dataset'
39
+ # config.train_json_path = '../scripts/preprocess/expt/inat2021_train_micro.json'
40
+ # config.val_json_path = '../scripts/preprocess/expt/inat2021_val_micro.json'
41
+
42
+ # batch_size * accumulate_grad_batches * devices must stay constant (e.g. 256 * 8 * 2 = 4096)
43
+ config.batch_size = 32 # 256
44
+ config.lr = 1e-4 # 1e-4
45
+ config.accumulate_grad_batches = 64
46
+ config.max_epochs = 20
47
+ config.num_workers = 16
48
+ config.devices = 2 # 2
49
+ config.val_check_interval = 0.5
50
+ config.sat_encoder = 'openai/clip-vit-large-patch14-336' # openai/clip-vit-base-patch16, openai/clip-vit-large-patch14-336
51
+ config.patch_size = 14
52
+
53
+ config.save_dir = 'checkpoints'
54
+ config.filename = 'satbind-{epoch:02d}-{val_loss:.2f}'
55
+
56
+ config.locked_tuning = True
57
+
58
+ config.resume_from_checkpoint = False
59
+ config.resume_checkpoint_name = 'satbind-resume'
60
+
61
+ print("config: \n", config)
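As the comment in config.py notes, the effective global batch size is the product of the per-device batch size, the gradient-accumulation steps, and the device count, and should be held constant when any single factor changes. A quick sketch of that bookkeeping with the values above:

```python
# Effective global batch size for the settings in config.py:
# 32 (per-device) * 64 (accumulate_grad_batches) * 2 (devices) = 4096,
# matching the 256 * 8 * 2 = 4096 reference configuration in the comment.
batch_size, accumulate_grad_batches, devices = 32, 64, 2
effective_batch = batch_size * accumulate_grad_batches * devices
assert effective_batch == 256 * 8 * 2 == 4096
print(effective_batch)
```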
Taxabind/Taxabind/SatBind/dataset.py ADDED
@@ -0,0 +1,396 @@
1
+ import math
2
+ from pathlib import Path
3
+
4
+ from matplotlib import pyplot as plt
5
+ from torch.utils.data import Dataset
6
+ from torchvision import transforms
7
+ import json
8
+ import os
9
+ from PIL import Image
10
+ from datetime import datetime
11
+ from torchvision.transforms import v2
12
+ import torch
13
+ import numpy as np
14
+ import glob
15
+
16
+ # Just to import SoundBind (1 dir back)
17
+ import sys
18
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
19
+ from SoundBind.sound_encoder import get_audio_clap
20
+
21
+ class SatNatDataset(Dataset):
22
+ def __init__(self, img_dir, imo_dir, json_path, sat_to_img_ids_json_path, patch_size, mode='train', get_img_path=False, sat_to_img_ids_json_is_train_dict=True, tax_to_filter_val="", sound_data_path=None):
23
+ self.img_dir = img_dir
24
+ self.imo_dir = imo_dir
25
+ self.patch_size = patch_size
26
+ self.get_img_path = get_img_path
27
+ self.mode = mode
28
+ self.sat_to_img_ids_json_is_train_dict = sat_to_img_ids_json_is_train_dict
29
+ self.tax_to_filter_val = tax_to_filter_val
30
+ self.sound_data_path = sound_data_path
31
+
32
+ # ADDED
33
+ self.current_epoch = 0
34
+
35
+ self.json = json.load(open(json_path, 'r'))
36
+ # self.sat_filt_json = json.load(open(sat_filtered_json_path, 'r'))
37
+ self.sat_to_img_ids_json_path = json.load(open(sat_to_img_ids_json_path, 'r'))
38
+ self.images = self.json['images']
39
+ self.annot = self.json['annotations']
40
+ for i in range(len(self.images)):
41
+ assert self.images[i]['id'] == self.annot[i]['id']
42
+ self.images[i]['label'] = self.annot[i]['category_id']
43
+ self.filtered_json = [d for d in self.images if d['latitude'] is not None and d['longitude'] is not None]
44
+ self.species_text = list(set([" ".join(d['file_name'].split("/")[1].split("_")[1:]) for d in self.filtered_json]))
45
+
46
+ # self.sat_filtered_json = [d for d in self.sat_filt_json['images'] if d['latitude'] is not None and d['longitude'] is not None]
47
+ # self.sat_paths = {d['id']: str(d['id'])+'_'+str(d['latitude'])+'_'+str(d['longitude'])+'.jpg' for d in self.sat_filtered_json}
48
+ self.inat_json_dict = {
49
+ "images": {img["id"]: img for img in self.images},
50
+ "annotations": {ann["id"]: ann for ann in self.annot},
51
+ }
52
+
53
+ # Expand dict
54
+ self.sat_to_img_ids_tuples = []
55
+ if self.sat_to_img_ids_json_is_train_dict:
56
+ # for i in range(len(self.sat_filtered_json)):
57
+ # id = self.sat_filtered_json[i]['id']
58
+ # sat_id = Path(self.sat_paths[id]).stem # remove file extension
59
+ # img_ids = self.sat_to_img_ids_json_path[sat_id]["img_ids"]
60
+ for sat_key, sat_sample in self.sat_to_img_ids_json_path.items():
61
+ id = sat_sample["id"] # int(sat_key.split("_")[0]) # sat_sample['id']
62
+ sat_path = sat_sample["sat_path"]
63
+ img_ids = sat_sample["img_ids"]
64
+ # img_locs = sat_sample["target_positions"] # NOTE: Not accurate after resize
65
+ for img_id in img_ids:
66
+ self.sat_to_img_ids_tuples.append((id, sat_path, img_id))
67
+ print("len(self.sat_to_img_ids_json_path): ", len(self.sat_to_img_ids_json_path))
68
+ print("len(self.sat_to_img_ids_tuples): ", len(self.sat_to_img_ids_tuples))
69
+ else:
70
+ self.filtered_val_ds_by_tax = [d for d in self.sat_to_img_ids_json_path if self.tax_to_filter_val in d['taxonomy']]
71
+
72
+ if mode == 'train':
73
+ self.img_transform = transforms.Compose([
74
+ transforms.Resize((256, 256)),
75
+ transforms.RandomCrop((224, 224)),
76
+ transforms.RandomHorizontalFlip(0.5),
77
+ transforms.GaussianBlur(5, (0.01, 1.0)),
78
+ transforms.ToTensor(),
79
+ transforms.Normalize(mean=[0.485, 0.456, 0.406],
80
+ std=[0.229, 0.224, 0.225])
81
+ ])
82
+ self.imo_transform = transforms.Compose([
83
+ # transforms.Resize((256,256)),
84
+ # transforms.RandomCrop((224, 224)),
85
+ # transforms.RandomHorizontalFlip(0.5),
86
+ # transforms.Resize((224,224)),
87
+ transforms.Resize((336,336)),
88
+ transforms.GaussianBlur(5, (0.01, 1.0)),
89
+ transforms.ToTensor(),
90
+ transforms.Normalize(mean=[0.485, 0.456, 0.406],
91
+ std=[0.229, 0.224, 0.225])
92
+ ])
93
+ else:
94
+ self.img_transform = transforms.Compose([
95
+ transforms.Resize((256, 256)),
96
+ transforms.CenterCrop((224, 224)),
97
+ transforms.ToTensor(),
98
+ transforms.Normalize(mean=[0.485, 0.456, 0.406],
99
+ std=[0.229, 0.224, 0.225])
100
+ ])
101
+ self.imo_transform = transforms.Compose([
102
+ # transforms.Resize((256,256)),
103
+ # transforms.CenterCrop((224, 224)),
104
+ # transforms.Resize((224,224)),
105
+ transforms.Resize((336,336)),
106
+ transforms.ToTensor(),
107
+ transforms.Normalize(mean=[0.485, 0.456, 0.406],
108
+ std=[0.229, 0.224, 0.225])
109
+ ])
110
+ self.debug_img_viz_transform = transforms.Compose([
111
+ transforms.Resize((256, 256)),
112
+ transforms.CenterCrop((224, 224))
113
+ ])
114
+ self.debug_imo_viz_transform = transforms.Compose([
115
+ # transforms.Resize((224,224))
116
+ transforms.Resize((336,336))
117
+ ])
118
+
119
+ def __len__(self):
120
+ # return len(self.filtered_json)
121
+ # return len(self.sat_filtered_json)
122
+ return len(self.sat_to_img_ids_tuples)
123
+
124
+ def __getitem__(self, idx):
125
+
126
+ # # DEBUG: Visualize original image and IMO pair
127
+ # self.debug_viz_img_imo_orig_pair(idx)
128
+ if not self.sat_to_img_ids_json_is_train_dict:
129
+ print("JSON is not a dict. Please reformat it for training!")
130
+ exit()
131
+
132
+ ## Pixel-level CLIP
133
+ # Get sat img
134
+ # id = self.sat_filtered_json[idx]['id']
135
+ id, sat_path, img_id = self.sat_to_img_ids_tuples[idx]
136
+ imo_path = os.path.join(self.imo_dir, sat_path)
137
+ imo = self.imo_transform(Image.open(imo_path))
138
+ sat_id = Path(sat_path).stem # remove file extension
139
+
140
+ img_path = os.path.join(self.img_dir, self.inat_json_dict["images"][img_id]["file_name"])
141
+ img = self.img_transform(Image.open(img_path))
142
+
143
+ # # Map lat-lon to pixel in sat img
144
+ sat_min_lon = self.sat_to_img_ids_json_path[sat_id]["sat_bounds"]["min_lon"]
145
+ sat_min_lat = self.sat_to_img_ids_json_path[sat_id]["sat_bounds"]["min_lat"]
146
+ sat_max_lon = self.sat_to_img_ids_json_path[sat_id]["sat_bounds"]["max_lon"]
147
+ sat_max_lat = self.sat_to_img_ids_json_path[sat_id]["sat_bounds"]["max_lat"]
148
+
149
+ img_lon = self.inat_json_dict["images"][img_id]["longitude"]
150
+ img_lat = self.inat_json_dict["images"][img_id]["latitude"]
151
+ row, col = self.latlon_to_pixel(img_lat, img_lon, sat_min_lat, sat_max_lat, sat_min_lon, sat_max_lon, imo.shape[2], imo.shape[1])
152
+
153
+ patch_idx = self.pixel_to_patch_idx(row, col, self.patch_size, imo.shape[2], imo.shape[1])
154
+ patch_idx += 1 # account for [CLS] token at the start of ViT input sequence
155
+
156
+ species_text = " ".join(self.inat_json_dict["images"][img_id]['file_name'].split("/")[1].split("_")[1:])
157
+
158
+ # # # DEBUG: Visualize patch
159
+ # img_debug = np.array(self.debug_img_viz_transform(Image.open(img_path)))
160
+ # imo_debug = np.array(self.debug_imo_viz_transform(Image.open(imo_path)))
161
+ # self.debug_viz_patch(img_debug, imo_debug, patch_idx, self.patch_size, species_text)
162
+
163
+ if self.get_img_path:
164
+ return img_path, imo_path, img, imo, self.inat_json_dict["annotations"][img_id]['category_id'], patch_idx, species_text, self.species_text.index(species_text)
165
+ else:
166
+ return img, imo, self.inat_json_dict["annotations"][img_id]['category_id'], patch_idx, species_text, self.species_text.index(species_text)
167
+
168
+
169
+ # NOTE: Image-level CLIP
170
+ # img_path = os.path.join(self.img_dir, self.filtered_json[idx]['file_name'])
171
+ # imo_path = os.path.join(self.imo_dir, self.sat_paths[self.filtered_json[idx]['id']])
172
+ # img = self.img_transform(Image.open(img_path))
173
+ # imo = self.imo_transform(Image.open(imo_path))
174
+ # species_text = " ".join(self.filtered_json[idx]['file_name'].split("/")[1].split("_")[1:])
175
+ # return img, imo, self.filtered_json[idx]['label'], species_text, self.species_text.index(species_text)
176
+
177
+ def latlon_to_pixel(self, lat, lon, lat_min, lat_max, lon_min, lon_max, img_width, img_height):
178
+ lat_res = (lat_max - lat_min) / img_height
179
+ lon_res = (lon_max - lon_min) / img_width
180
+
181
+ col = int(math.floor((lon - lon_min) / lon_res))
182
+ row = int(math.floor((lat_max - lat) / lat_res))
183
+
184
+ return row, col
185
+
186
+ def pixel_to_patch_idx(self, row, col, patch_size, img_width, img_height):
187
+ patch_size_width = patch_size
188
+ patch_size_height = patch_size
189
+ patch_row = row // patch_size_height
190
+ patch_col = col // patch_size_width
191
+ patch_idx = patch_row * (img_width // patch_size) + patch_col
192
+
193
+ return patch_idx
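+ # Example (hypothetical numbers): with patch_size=14 and a 336x336 satellite image there are
+ # 24x24 patches; pixel (row=100, col=50) maps to patch_row=7, patch_col=3, so
+ # patch_idx = 7 * 24 + 3 = 171 (the +1 [CLS] offset is applied by the caller in __getitem__).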
194
+
195
+ def set_epoch(self, epoch):
196
+ self.current_epoch = epoch
197
+
198
+ ###########################################################
199
+
200
+ def debug_viz_img_imo_orig_pair(self, idx):
201
+
202
+ img_path = os.path.join(self.img_dir, self.filtered_json[idx]['file_name'])
203
+ imo_path = os.path.join(self.imo_dir, self.sat_paths[self.filtered_json[idx]['id']])
204
+ # img = self.img_transform(Image.open(img_path))
205
+ # imo = self.imo_transform(Image.open(imo_path))
206
+ # species_text = " ".join(self.filtered_json[idx]['file_name'].split("/")[1].split("_")[1:])
207
+
208
+ # Create a side-by-side plot
209
+ fig, axes = plt.subplots(1, 2, figsize=(10, 5))
210
+
211
+ img_np = Image.open(img_path)
212
+ imo_np = Image.open(imo_path)
213
+
214
+ axes[0].imshow(img_np)
215
+ axes[0].set_title('Image')
216
+ axes[0].axis('off')
217
+
218
+ axes[1].imshow(imo_np)
219
+ axes[1].set_title('IMO')
220
+ axes[1].axis('off')
221
+
222
+ plt.tight_layout()
223
+ plt.show()
224
+
225
+
226
+ def debug_viz_patch(self, img_np, imo_np, patch_idx, patch_size, species_text):
227
+
228
+ # ----- Compute the patch boundaries -----
229
+ # Since patch_idx was incremented by 1 (for the [CLS] token), subtract 1 to get the zero-based index.
230
+ patch_idx_actual = patch_idx - 1
231
+
232
+ # Get the image dimensions from the satellite image (imo).
233
+ # Note: in __getitem__ the transformed tensor has imo.shape[2] = width and imo.shape[1] = height; here imo_np is (H, W, C).
234
+ H, W = imo_np.shape[0], imo_np.shape[1]
235
+
236
+ # Compute the number of patches along each image dimension.
237
+ patches_per_col = W // patch_size
238
+ patches_per_row = H // patch_size
239
+
240
+ # Determine the patch's row and column in the grid.
241
+ patch_row = patch_idx_actual // patches_per_col
242
+ patch_col = patch_idx_actual % patches_per_col  # use the same W // patch_size divisor as pixel_to_patch_idx
243
+
244
+ # Calculate pixel boundaries for the patch.
245
+ x_start = patch_col * patch_size
246
+ x_end = (patch_col + 1) * patch_size
247
+ y_start = patch_row * patch_size
248
+ y_end = (patch_row + 1) * patch_size
249
+
250
+ # ----- Extract the patch from the satellite image -----
251
+ imo_patch = imo_np[y_start:y_end, x_start:x_end, :]
252
+
253
+ # ----- Create a highlighted version of the satellite image -----
254
+ # We will grey out the patch region.
255
+ imo_highlight = imo_np.copy()
256
+
257
+ # Define a grey value based on the image's data type.
258
+ if imo_highlight.dtype == np.uint8:
259
+ grey_value = 128 # for 0-255 images
260
+ else:
261
+ grey_value = 0.5 # for images normalized to [0, 1]
262
+
263
+ # Replace the patch region with the grey color.
264
+ imo_highlight[y_start:y_end, x_start:x_end, :] = grey_value
265
+
266
+ # ----- Plotting the three images side by side -----
267
+ fig, axes = plt.subplots(1, 3, figsize=(15, 5))
268
+ fig.suptitle(f"Species: {species_text}")
269
+
270
+ # Left: Satellite image with the patch greyed out.
271
+ axes[0].imshow(imo_highlight)
272
+ axes[0].set_title("IMO with Patch Greyed Out")
273
+ axes[0].axis("off")
274
+
275
+ # Middle: Cropped patch from the satellite image.
276
+ axes[1].imshow(imo_patch)
277
+ axes[1].set_title("IMO Patch")
278
+ axes[1].axis("off")
279
+
280
+ # Right: Ground image.
281
+ axes[2].imshow(img_np)
282
+ axes[2].set_title("Ground Image")
283
+ axes[2].axis("off")
284
+
285
+ plt.tight_layout()
286
+ plt.show()
287
+
288
+ ###########################################################
289
+
290
+ def get_search_ds_data(self, idx):
291
+
292
+ if self.sat_to_img_ids_json_is_train_dict:
293
+ print("Json is dict. Please reformat for target search!")
294
+ exit()
295
+
296
+
297
+ # sat_sample = self.sat_to_img_ids_json_path[idx] if self.sat_to_img_ids_json_is_train_dict else self.filtered_val_ds_by_tax[idx]
298
+ bounded_idx = idx % len(self.filtered_val_ds_by_tax)
299
+ if idx >= len(self.filtered_val_ds_by_tax):
300
+ print("??? bounded_idx: ", bounded_idx)
301
+ sat_sample = self.filtered_val_ds_by_tax[bounded_idx]
302
+ target_positions = sat_sample["target_positions"]
303
+ imo_path = os.path.join(self.imo_dir, sat_sample["sat_path"])
304
+ imo = self.imo_transform(Image.open(imo_path))
305
+
306
+ img_paths = []
307
+ imgs = []
308
+ species_texts = []
309
+ for img_id in sat_sample["img_ids"]:
310
+ img_path = os.path.join(self.img_dir, self.inat_json_dict["images"][img_id]["file_name"])
311
+ img = self.img_transform(Image.open(img_path))
312
+ img_paths.append(img_path)
313
+ imgs.append(img)
314
+
315
+ species_text = " ".join(self.inat_json_dict["images"][img_id]['file_name'].split("/")[1].split("_")[1:])
316
+ species_texts.append(species_text)
317
+ imgs = torch.stack(imgs) # Stack all images into a single tensor
318
+
319
+ if len(set(species_texts)) > 1:
320
+ print("Species mismatch in search dataset!")
321
+ exit()
322
+ else:
323
+ species_name = species_texts[0]
324
+
325
+ gt_mask_name = str(sat_sample["id"]) + "_" + sat_sample["taxonomy"] + ".png" # Saved with 2x '_' by accident
326
+ gt_mask_name = gt_mask_name.replace(" ", "_")
327
+
328
+ # Consider sound if valid
329
+ sounds = []
330
+ sound_ids = []
331
+ # if self.sound_data_path is not None and "sound_ids" in sat_sample:
332
+ # for sound_id in sat_sample["sound_ids"]:
333
+ # sound_path = os.path.join(self.sound_data_path,"sounds_mp3",str(sound_id)+"."+'mp3')
334
+ # sound = get_audio_clap(sound_path) # , sound_format)
335
+ # for k in sound.keys():
336
+ # sound[k] = sound[k].squeeze(0)
337
+ # sounds.append(sound)
338
+ # sound_ids.append(sound_id)
339
+ # sounds = torch.stack(sounds)
340
+
341
+ # NOTE: Cannot stack cos sound format is different (Use first index)
342
+ if self.sound_data_path is not None and "sound_ids" in sat_sample:
343
+ sound_id = sat_sample["sound_ids"][0]
344
+ sound_path = os.path.join(self.sound_data_path,"sounds_mp3",str(sound_id)+"."+'mp3')
345
+ sound = get_audio_clap(sound_path) # , sound_format)
346
+ # NOTE: no need to squeeze if not sampled by Pytorch Lightning
347
+ # for k in sound.keys():
348
+ # sound[k] = sound[k].squeeze(0)
349
+ sounds = [sound]
350
+ sound_ids = [sound_id]
351
+
352
+ ###################################
353
+
354
+ ########################################################################################################
355
+ # # TEMP OVERRIDE:
356
+ # imo_path = "/home/user/search-tta-demo/examples/Animalia_Chordata_Aves_Charadriiformes_Laridae_Larus_marinus/80645_39.76079_-74.10316.jpg"
357
+ # imo = self.imo_transform(Image.open(imo_path).convert("RGB"))
358
+ # img_paths = ["/home/user/search-tta-demo/examples/Animalia_Chordata_Aves_Charadriiformes_Laridae_Larus_marinus/cc1ebaf9-899d-49f2-81c8-d452249a8087.jpg"]
359
+ # imgs = [self.img_transform(Image.open(img_paths[0]))]
360
+ # imgs = torch.stack(imgs)
361
+ # species_name = "Animalia Chordata Aves Charadriiformes Laridae Larus marinus"
362
+ # gt_mask_name = "Animalia_Chordata_Aves_Charadriiformes_Laridae_Larus_marinus"
363
+
364
+ # sounds = []
365
+ # sound_ids = []
366
+ # sound_id = 89758229
367
+ # sound_path = "/home/user/search-tta-demo/examples/Animalia_Chordata_Aves_Charadriiformes_Laridae_Larus_marinus/89758229.mp3"
368
+ # sound = get_audio_clap(sound_path) # , sound_format)
369
+ # # NOTE: no need to squeeze if not sampled by Pytorch Lightning
370
+ # # for k in sound.keys():
371
+ # # sound[k] = sound[k].squeeze(0)
372
+ # sounds = [sound]
373
+ # sound_ids = [sound_id]
374
+
375
+ # # 2020 and beyond
376
+ # target_positions = [
377
+ # # (84, 57), # 2015
378
+ # # (59, 323), # Shouldn't be in water
379
+ # (50, 347), # ADDED: Shouldn't be in water
380
+ # # (54, 332), # In water
381
+ # # (19, 507), # 2014
382
+ # (68, 444), # shifted down
383
+ # (136, 505),
384
+ # (142, 465),
385
+ # (345, 343),
386
+ # (130, 241),
387
+ # (367, 281), # Should be more right actually
388
+ # # (463, 351),
389
+ # # (513, 381), # Out of bound
390
+ # # (405, 162), # 2019
391
+ # (315, 80) # Shifted down-left
392
+ # ]
393
+
394
+ ########################################################################################################
395
+
396
+ return img_paths, imo_path, imgs, imo, sounds, sound_ids, species_name, target_positions, gt_mask_name
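+ # Hedged usage sketch (index and shapes are illustrative only):
+ #   img_paths, imo_path, imgs, imo, sounds, sound_ids, species_name, \
+ #       target_positions, gt_mask_name = dataset.get_search_ds_data(0)
+ #   # imgs: (K, C, H, W) tensor of the K ground images tied to one satellite tile
+ #   # sounds: empty list, or a single CLAP-processed clip when sound data exists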
Taxabind/Taxabind/SatBind/kmeans_clustering.py ADDED
@@ -0,0 +1,620 @@
1
+ import math
2
+ import os
3
+ import random
4
+ import numpy as np
5
+ import matplotlib.pyplot as plt
6
+ import matplotlib.patches as mpatches
7
+ import matplotlib.colors as colors
8
+ from sklearn.cluster import KMeans
9
+ from sklearn.metrics import silhouette_score
10
+ from kneed import KneeLocator
11
+ import scipy.ndimage as ndimage
12
+ from collections import Counter
13
+
14
+ # import matplotlib
15
+ # matplotlib.use("Agg") # <-- key line to avoid tkinter dependency
16
+
17
+ class CombinedSilhouetteInertiaClusterer:
18
+ """
19
+ A class that:
20
+ 1) Runs combined silhouette & WGSS (inertia) analysis to find the best k.
21
+ 2) Performs a 2D reshape of patch labels.
22
+ 3) Optionally smooths the cluster map.
23
+ 4) Optionally plots the cluster map next to the heatmap (see `plot_cluster_map`).
+ 5) Returns the cluster labels reshaped to the 2D patch grid (via `fit_predict`).
25
+ """
26
+ def __init__(
27
+ self,
28
+ k_min=1,
29
+ k_max=8,
30
+ k_avg_max=4,
31
+ silhouette_threshold=0.15,
32
+ relative_threshold=0.15,
33
+ random_state=0,
34
+ min_patch_size=5,
35
+ n_smooth_iter=2,
36
+ ignore_label=-1,
37
+ plot=False,
38
+ gifs_dir = "./"
39
+ ):
40
+ """
41
+ Parameters
42
+ ----------
43
+ k_min : int
44
+ Minimum number of clusters for KMeans.
45
+ k_max : int
46
+ Maximum number of clusters for KMeans.
47
+ k_avg_max : int
48
+ Upper bound on k after combining elbow & silhouette if they disagree.
49
+ silhouette_threshold : float
50
+ Minimum silhouette score at k=2 to justify splitting.
51
+ relative_threshold : float
52
+ Minimum % improvement in inertia from k=1→k=2 to justify splitting.
53
+ random_state : int
54
+ RNG seed for KMeans.
55
+ min_patch_size : int
56
+ Patches smaller than this threshold are smoothed.
57
+ n_smooth_iter : int
58
+ Number of smoothing iterations.
59
+ ignore_label : int
60
+ Label to ignore in smoothing step.
61
+ """
62
+ self.k_min = k_min
63
+ self.k_max = k_max
64
+ self.k_avg_max = k_avg_max
65
+ self.silhouette_threshold = silhouette_threshold
66
+ self.relative_threshold = relative_threshold
67
+ self.random_state = random_state
68
+
69
+ self.min_patch_size = min_patch_size
70
+ self.n_smooth_iter = n_smooth_iter
71
+ self.ignore_label = ignore_label
72
+ self.plot = plot
73
+ self.gifs_dir = gifs_dir
74
+
75
+ self.final_k = None
76
+ self.final_labels_1d = None # 1D array of cluster labels (unsmoothed)
77
+ self.smoothed_labels_2d = None # 2D array of labels (smoothed)
78
+ self.kmeans_frame_files = []
79
+
80
+ ##############################
81
+ # Helper functions
82
+ ##############################
83
+ def find_largest_connected_component(self, cluster_map, label):
84
+ """
85
+ Identifies the largest connected component (patch) of a given label in the cluster map.
86
+ Uses **4-connectivity** (N, S, E, W) to define connected regions.
87
+
88
+ Parameters:
89
+ -----------
90
+ cluster_map : ndarray (H, W)
91
+ The 2D label map.
92
+ label : int
93
+ The label whose largest connected patch we want to find.
94
+
95
+ Returns:
96
+ --------
97
+ largest_component_mask : ndarray (H, W)
98
+ A binary mask where `True` indicates pixels belonging to the largest connected component.
99
+ """
100
+ mask = (cluster_map == label)
101
+
102
+ # Label all connected components using **4-connectivity**
103
+ labeled_components, num_components = ndimage.label(
104
+ mask,
105
+ structure=np.array([[0,1,0], [1,1,1], [0,1,0]])
106
+ )
107
+
108
+ if num_components == 0:
109
+ return np.zeros_like(cluster_map, dtype=bool) # No components found
110
+
111
+ # Compute sizes of each connected component
112
+ component_sizes = np.bincount(labeled_components.ravel())[1:] # Ignore background (0)
113
+ largest_component_idx = np.argmax(component_sizes) + 1 # +1 to skip background
114
+
115
+ # Create a mask for the largest component
116
+ largest_component_mask = (labeled_components == largest_component_idx)
117
+ return largest_component_mask
118
+
119
+
120
+ def smooth_small_patches(self, labels_2d, min_patch_size=5, n_iter=1, ignore_label=-1):
121
+ """
122
+ Smooths **only small disconnected patches** (relative to the largest patch) within each cluster.
123
+ Uses **majority voting** in a 4-connected neighborhood (N, S, E, W).
124
+ Ignores neighboring pixels that have the `ignore_label` when computing the majority vote.
125
+ Implements a **tie-breaker**: If there is a tie between the original class and another class,
126
+ the original class is retained.
127
+
128
+ Parameters:
129
+ -----------
130
+ labels_2d : ndarray of shape (H, W)
131
+ Integer labels (e.g., cluster IDs), including an ignore label (-1).
132
+ min_patch_size : int
133
+ Patches smaller than this threshold are considered **isolated** and will be smoothed.
134
+ n_iter : int
135
+ Number of smoothing iterations.
136
+ ignore_label : int
137
+ Label value to be ignored when counting neighbors.
138
+
139
+ Returns:
140
+ --------
141
+ smoothed_labels : ndarray of shape (H, W)
142
+ The label map after selective smoothing.
143
+ """
144
+ H, W = labels_2d.shape
145
+ smoothed = labels_2d.copy()
146
+
147
+ for _ in range(n_iter):
148
+ new_label = smoothed.copy() # Clone where updates will be applied
149
+
150
+ unique_labels = np.unique(smoothed)
151
+ np.random.shuffle(unique_labels) # Process clusters in random order to avoid bias
152
+
153
+ for label in unique_labels:
154
+ if label == ignore_label:
155
+ continue
156
+
157
+ # Find the largest connected component for this label
158
+ largest_component_mask = self.find_largest_connected_component(smoothed, label)
159
+
160
+ # Identify small disconnected patches
161
+ small_patches_mask = (smoothed == label) & (~largest_component_mask)
162
+
163
+ # Label these small patches separately
164
+ labeled_small_patches, num_patches = ndimage.label(
165
+ small_patches_mask,
166
+ structure=np.array([[0,1,0], [1,1,1], [0,1,0]])
167
+ )
168
+
169
+ # Process each small patch individually
170
+ for patch_id in range(1, num_patches + 1):
171
+ patch_mask = (labeled_small_patches == patch_id)
172
+ patch_size = np.sum(patch_mask)
173
+
174
+ if patch_size < min_patch_size:
175
+ # Only smooth small isolated patches
176
+ for i, j in zip(*np.where(patch_mask)):
177
+ # Collect 4-connected neighborhood (N, S, E, W) **excluding ignore_label**
178
+ neighbors = []
179
+ if i > 0 and smoothed[i-1, j] != ignore_label: # North
180
+ neighbors.append(smoothed[i-1, j])
181
+ if i < H - 1 and smoothed[i+1, j] != ignore_label: # South
182
+ neighbors.append(smoothed[i+1, j])
183
+ if j > 0 and smoothed[i, j-1] != ignore_label: # West
184
+ neighbors.append(smoothed[i, j-1])
185
+ if j < W - 1 and smoothed[i, j+1] != ignore_label: # East
186
+ neighbors.append(smoothed[i, j+1])
187
+
188
+ if neighbors: # Avoid empty neighbor list
189
+ neighbor_counts = Counter(neighbors)
190
+ most_common_label, count = neighbor_counts.most_common(1)[0]
191
+
192
+ # Tie-breaker condition
193
+ tied_labels = [lbl for lbl, cnt in neighbor_counts.items() if cnt == count]
194
+ if len(tied_labels) > 1 and smoothed[i, j] in tied_labels:
195
+ # If there's a tie and the original class is involved, keep the original class
196
+ new_label[i, j] = smoothed[i, j]
197
+ else:
198
+ # Otherwise, assign the most common label
199
+ new_label[i, j] = most_common_label
200
+
201
+ smoothed = new_label.copy() # Apply modifications
202
+
203
+ return smoothed
204
+
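+ # Minimal sketch of the smoothing behaviour (hypothetical labels): a stray
+ # pixel of label 1 surrounded by label 0 is re-assigned to 0 by the 4-connected
+ # majority vote, while the large block of 1s (the largest component) is kept:
+ #   labels = np.array([[0, 0, 0, 1, 1],
+ #                      [0, 1, 0, 1, 1],
+ #                      [0, 0, 0, 1, 1]])
+ #   smoothed = self.smooth_small_patches(labels, min_patch_size=2, n_iter=1)
+ #   # smoothed[1, 1] == 0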
205
+
206
+ def combined_silhouette_inertia_clustering(
207
+ self,
208
+ X,
209
+ k_min=1,
210
+ k_max=8,
211
+ k_avg_max=4,
212
+ silhouette_threshold=0.2,
213
+ relative_threshold=0.05,
214
+ random_state=0
215
+ ):
216
+ """
217
+ Runs KMeans for k in [k_min..k_max] exactly once each,
218
+ collects silhouette scores & inertias, and returns:
219
+ - best_k
220
+ - labels for best_k
221
+ - silhouette_scores (list or None for k=1)
222
+ - inertias (list)
223
+
224
+ The final k is chosen by both silhouette and elbow (WGSS):
225
+ 1) If #points < 2 -> return k=1 and all-zero labels.
226
+ 2) Compare k=1 vs. k=2 improvement in inertia and silhouette(2).
227
+ 3) If both pass threshold, run k=2..k_max, pick best_k_sil (highest silhouette)
228
+ and best_k_elbow (via KneeLocator). If they differ, take their mean (rounded).
229
+ """
230
+ n_samples = len(X)
231
+ if n_samples < 2:
232
+ return 1, np.zeros(n_samples, dtype=int), [None], [None]
233
+
234
+ # --- Fit once for k=1 ---
235
+ km1 = KMeans(n_clusters=1, random_state=random_state).fit(X)
236
+ inertia_k1 = km1.inertia_ / n_samples
237
+ silhouette_k1 = None # undefined for k=1
238
+ # print(f"k=1, inertia={inertia_k1:.4f}, silhouette=NA")
239
+
240
+ if k_max < 2:
241
+ # If k_max=1, no reason to check further
242
+ return 1, km1.labels_, [silhouette_k1], [inertia_k1]
243
+
244
+ # --- Fit once for k=2 ---
245
+ km2 = KMeans(n_clusters=2, random_state=random_state).fit(X)
246
+ inertia_k2 = km2.inertia_ / n_samples
247
+ sil_k2 = silhouette_score(X, km2.labels_)
248
+ # print(f"k=2, inertia={inertia_k2:.4f}, silhouette={sil_k2:.4f}")
249
+
250
+ relative_improvement = (inertia_k1 - inertia_k2) / inertia_k1
251
+ # print(f"[k=1 → k=2] inertia improvement: {relative_improvement:.4%}, sil(k=2)={sil_k2:.4f}")
252
+
253
+ # If improvement is too small or silhouette is too low => remain at k=1
254
+ if (relative_improvement < relative_threshold) or (sil_k2 < silhouette_threshold):
255
+ print("Choosing k=1 (failed thresholds).")
256
+ return 1, km1.labels_, [silhouette_k1, sil_k2], [inertia_k1, inertia_k2]
257
+
258
+ # --- Otherwise fit k=2..k_max and gather inertias & silhouettes ---
259
+ all_k = range(2, k_max + 1)
260
+ kmeans_models = {}
261
+ inertias = []
262
+ silhouettes = []
263
+
264
+ # We already have k=2
265
+ kmeans_models[2] = km2
266
+ inertias.append(inertia_k2)
267
+ silhouettes.append(sil_k2)
268
+
269
+ for k in range(3, k_max + 1):
270
+ km = KMeans(n_clusters=k, random_state=random_state).fit(X)
271
+ kmeans_models[k] = km
272
+
273
+ norm_inertia = km.inertia_ / n_samples
274
+ inertias.append(norm_inertia)
275
+
276
+ # If k>n_samples, silhouette_score is meaningless, but in normal usage k<<n_samples
277
+ sil_val = silhouette_score(X, km.labels_) if k <= n_samples else -1
278
+ silhouettes.append(sil_val)
279
+
280
+ # print(f"k={k}, inertia={norm_inertia:.4f}, silhouette={sil_val:.4f}")
281
+
282
+ # (a) Silhouette-based best_k_sil
283
+ best_idx_sil = np.argmax(silhouettes)
284
+ best_k_sil = best_idx_sil + 2 # offset since silhouettes[0] is k=2
285
+
286
+ # (b) Inertia-based best_k_elbow
287
+ k_candidates = np.arange(2, k_max + 1)
288
+ if len(k_candidates) == 1:
289
+ best_k_elbow = 2
290
+ else:
291
+ kn = KneeLocator(k_candidates, inertias, curve="convex", direction="decreasing")
292
+ best_k_elbow = kn.elbow
293
+ if best_k_elbow is None:
294
+ print("No elbow found => default to k=1.")
295
+ best_k_elbow = 1 # fallback
296
+
297
+ print(f"Silhouette-based best_k={best_k_sil}, elbow-based best_k={best_k_elbow}")
298
+
299
+ # Combine if there's disagreement
300
+ if best_k_sil == best_k_elbow:
301
+ final_k = max(1, min(best_k_sil, k_avg_max)) # best_k_sil
302
+ else:
303
+ avg_k = 0.5 * (best_k_sil + best_k_elbow)
304
+ final_k = int(math.ceil(avg_k))
305
+ final_k = max(1, min(final_k, k_avg_max))
306
+ # print(f"Disagreement => taking final_k=ceil((sil={best_k_sil} + elbow={best_k_elbow})/2) = {final_k}, capped by k_avg_max={k_avg_max}")
307
+ assert (final_k <= k_avg_max), f"Final k={final_k} is greater than k_avg_max={k_avg_max}"
308
+
309
+ # Get final labels from the chosen KMeans model
310
+ if final_k == 1:
311
+ final_labels = km1.labels_
312
+ else:
313
+ final_labels = kmeans_models[final_k].labels_
314
+
315
+ return final_k, final_labels, [silhouette_k1] + silhouettes, [inertia_k1] + inertias
316
+
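+ # Worked illustration of the selection rule above (numbers are hypothetical):
+ # if the silhouette criterion prefers best_k_sil = 3 while the elbow on the
+ # normalized inertias gives best_k_elbow = 6, the combined choice is
+ # ceil((3 + 6) / 2) = 5, which is then capped by k_avg_max (e.g. 4) => final_k = 4.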
317
+
318
+ def compute_region_statistics(self, label_map, heatmap, visited_indices, episode_num=0, step_num=0):
319
+ """
320
+ Computes region statistics for the current smoothed label map.
321
+
322
+ Parameters
323
+ ----------
324
+ heatmap : ndarray, shape (H, W)
325
+ A 2D array of probabilities (or intensities) for each patch.
326
+ Must match the shape of `self.smoothed_labels_2d`.
327
+ visited_indices : list or array of int
328
+ A list of flat indices (0-based) indicating which patches
329
+ the robot has visited. The flat index is equivalent to
330
+ r * W + c for patch at row r, column c.
331
+
332
+ Returns
333
+ -------
334
+ region_dict : dict
335
+ A dictionary keyed by cluster label ID, with values:
336
+ {
337
+ 'num_patches': int,
338
+ 'patches_visited': int,
339
+ 'expectation': float
340
+ }
341
+ """
342
+ # Flatten the cluster map and the heatmap to handle indexing uniformly
343
+ # label_map_2d = label_map # self.smoothed_labels_2d
344
+ # H, W = label_map_2d.shape
345
+ label_map_2d = self.smoothed_labels_2d
346
+ label_map_1d = self.smoothed_labels_2d.ravel()
347
+ heatmap_1d = heatmap.ravel()
348
+
349
+ # Identify unique labels (excluding ignore_label if present)
350
+ unique_labels = np.unique(label_map_1d)
351
+ region_dict = {}
352
+ for lbl in unique_labels:
353
+ if lbl == self.ignore_label: # skip ignore_label
354
+ continue
355
+ region_dict[lbl] = {
356
+ 'num_patches': 0,
357
+ 'patches_visited': 0,
358
+ 'expectation': 0.0
359
+ }
360
+
361
+ # Accumulate totals for all patches
362
+ total_patches = len(label_map_1d)
363
+ for i in range(total_patches):
364
+ lbl = label_map_1d[i]
365
+ if lbl == self.ignore_label:
366
+ continue
367
+ region_dict[lbl]['num_patches'] += 1
368
+ region_dict[lbl]['expectation'] += float(heatmap_1d[i])
369
+
370
+ # # Exponential distribution (waiting time) = num_patches / expected_num_tgts
371
+ for lbl in region_dict:
372
+ region_dict[lbl]['expectation'] = region_dict[lbl]['num_patches'] / region_dict[lbl]['expectation']
373
+
374
+ # Count only unique visited patches by converting to a set.
375
+ unique_visited = set(visited_indices)
376
+ for vi in unique_visited:
377
+ if vi < 0 or vi >= total_patches:
378
+ continue # skip out-of-bounds indices
379
+ lbl = label_map_1d[vi]
380
+ # lbl = self.get_label_id(vi)
381
+ if lbl == self.ignore_label:
382
+ continue
383
+ region_dict[lbl]['patches_visited'] += 1
384
+
385
+ if self.plot:
386
+ self.plot_cluster_map(label_map_2d, heatmap, visited_indices, region_dict, episode_num, step_num)
387
+
388
+ return region_dict
389
+
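+ # Numeric sketch of the statistics above (values are hypothetical): a region
+ # covering 120 patches whose heatmap values sum to 3.0 gets
+ # expectation = 120 / 3.0 = 40.0, i.e. roughly 40 patch visits expected per
+ # target under the exponential waiting-time view; if 15 of the unique visited
+ # indices fall inside that region, its patches_visited is 15.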
390
+ def plot_cluster_map(self, cluster_map, heatmap, path_taken, region_stats_dict, episode_num, step_num, cmap='tab20'):
391
+
392
+ # 4) Plot (side-by-side) if requested
393
+ fig, axes = plt.subplots(1, 3, figsize=(12, 6))
394
+
395
+ axes[0].imshow(cluster_map, cmap='tab20')
396
+ axes[0].set_title(f"Raw KMeans Clusters")
397
+ axes[0].axis('off')
398
+
399
+ axes[1].imshow(heatmap, cmap="viridis")
400
+ axes[1].set_title("Heatmap")
401
+ axes[1].axis('off')
402
+
403
+ axes[2].imshow(cluster_map, cmap='tab20')
404
+ axes[2].set_title("Raw KMeans Clusters")
405
+ axes[2].axis('off')
406
+
407
+ path_rows, path_cols = [], []
408
+ for i, idx in enumerate(path_taken):
409
+ rr = idx // cluster_map.shape[1]
410
+ cc = idx % cluster_map.shape[1]
411
+ path_rows.append(rr)
412
+ path_cols.append(cc)
413
+ axes[2].plot(path_cols, path_rows, c="r", linewidth=2)
414
+ axes[2].plot(path_cols[-1], path_rows[-1], markersize=12, zorder=99, marker="^", ls="-", c="r", mec="black")
415
+ axes[2].plot(path_cols[0], path_rows[0], 'co', c="r", markersize=8, zorder=5)
416
+
417
+ # Create legend patches for each region.
418
+ # We assume region labels are non-negative integers.
419
+ unique_labels = sorted(region_stats_dict.keys())
420
+ # For normalization, use max label value (or 1 if max==0)
421
+ max_label = max(unique_labels) if unique_labels else 1
422
+ cm = plt.get_cmap(cmap)
423
+ legend_patches = []
424
+ for lbl in unique_labels:
425
+ # Normalize the label to [0,1] for colormap lookup.
426
+ norm_value = lbl / max_label if max_label > 0 else 0.5
427
+ color = cm(norm_value)
428
+ patch = mpatches.Patch(color=color, label=f"R{lbl}")
429
+ legend_patches.append(patch)
430
+
431
+ # Add legends to both subplots.
432
+ axes[0].legend(handles=legend_patches, title="Regions", loc='upper right')
433
+ axes[2].legend(handles=legend_patches, title="Regions", loc='upper right')
434
+
435
+ # Build the legend text for each region using the provided format:
436
+ # "R{label}: patches={num_patches}, E={expectation:.3f}, visited={num_visited}"
437
+ legend_lines = []
438
+ for label, stats in region_stats_dict.items():
439
+ line = f"R{label}: patches={stats['num_patches']}, E={stats['expectation']:.3f}, visited={stats['patches_visited']}"
440
+ legend_lines.append(line)
441
+ legend_text = "\n".join(legend_lines)
442
+
443
+ # Add the legend text as a subtitle at the bottom of the figure
444
+ # Using fig.text to place the text at the center bottom with a background box
445
+ fig.text(0.5, 0.05, legend_text, ha='center', va='bottom', fontsize=10,
446
+ bbox=dict(facecolor='white', alpha=0.8, edgecolor='gray'))
447
+
448
+ # Adjust layout to reserve space for the legend text
449
+ plt.tight_layout()
450
+ plt.subplots_adjust(bottom=0.1) # Prevent overlap
451
+ # plt.tight_layout(rect=[0, 0.05, 1, 1])
452
+ # plt.show()
453
+
454
+
455
+ # gifs_folder = '/home/user/VLM-Search/inference/test_results/gifs'
456
+ if not os.path.exists(self.gifs_dir):
457
+ os.makedirs(self.gifs_dir)
458
+
459
+ plt.savefig(f'{self.gifs_dir}/kmeans_{episode_num}_{step_num}.png', dpi=150)
460
+ self.kmeans_frame_files.append(f'{self.gifs_dir}/kmeans_{episode_num}_{step_num}.png')
461
+ plt.close()
462
+
463
+
464
+ # ------ OTHER HELPER FUNCTIONS -------#
465
+ def get_label_id(self, patch_idx):
466
+ """
467
+ Given a flattened index (row-major order) within the 24×24 grid,
468
+ return the cluster label of the corresponding patch in the merged label map.
469
+
470
+ :param patch_idx: Flattened index (0 to H×W-1).
471
+ :return: The integer cluster label stored in `self.smoothed_labels_2d`
+ at that patch.
473
+ """
474
+ # Guard: check we have a merged label map
475
+ if self.smoothed_labels_2d is None:
476
+ raise ValueError("Merged markers do not exist. "
+ "Please call `fit_predict(...)` first.")
478
+
479
+ return self.smoothed_labels_2d.ravel()[patch_idx]
480
+
481
+
482
+ # ------ OTHER HELPER FUNCTIONS -------#
483
+ def get_probs(self, patch_idx, heatmap):
484
+ """
485
+ Given a flattened index (row-major order) within the 24×24 grid,
486
+ return the heatmap probability at the corresponding patch.
487
+
488
+ :param patch_idx: Flattened index (0 to H×W-1).
489
+ :return: The heatmap value at that patch.
491
+ """
492
+
493
+ return heatmap.ravel()[patch_idx]
494
+
495
+ ##############################
496
+ # Main functions
497
+ ##############################
498
+ def fit_predict(self, patch_embeds, map_shape):
499
+ """
500
+ Main entry point.
501
+
502
+ Parameters
503
+ ----------
504
+ patch_embeds : ndarray, shape (N, D)
505
+ The satellite patch embeddings to be clustered.
506
+ map_shape : tuple of (H, W)
507
+ The 2D layout of patches. Must satisfy H * W = N.
508
510
+
511
+ Returns
512
+ -------
513
+ smoothed_labels_2d : ndarray, shape (H, W)
+ The cluster assignments reshaped to the patch grid (smoothing is
+ currently disabled, so these are the raw KMeans labels).
515
+ """
516
+ # 1) Run combined silhouette & inertia
517
+ best_k, final_labels, silhouettes, inertias = self.combined_silhouette_inertia_clustering(
518
+ X=patch_embeds,
519
+ k_min=self.k_min,
520
+ k_max=self.k_max,
521
+ k_avg_max=self.k_avg_max,
522
+ silhouette_threshold=self.silhouette_threshold,
523
+ relative_threshold=self.relative_threshold,
524
+ random_state=self.random_state
525
+ )
526
+ self.final_k = best_k
527
+ self.final_labels_1d = final_labels.copy() # store raw labels
528
+
529
+ # 2) Reshape for display
530
+ H, W = map_shape
531
+ cluster_map = final_labels.reshape(H, W)
532
+
533
+ # 3) Apply smoothing
534
+ # cluster_map_smoothed = self.smooth_small_patches(
535
+ # labels_2d=cluster_map,
536
+ # min_patch_size=self.min_patch_size,
537
+ # n_iter=self.n_smooth_iter,
538
+ # ignore_label=self.ignore_label
539
+ # )
540
+ cluster_map_smoothed = cluster_map
541
+ self.smoothed_labels_2d = cluster_map_smoothed.copy()
542
+
543
+ # 5) Return the smoothed 2D labels
544
+ return self.smoothed_labels_2d
545
+
546
+
547
+ if __name__ == "__main__":
548
+
549
+ file_path="./expt/patch_embeds_99.npy"
550
+ patch_embeds = np.load(file_path)
551
+ map_shape = (int(np.sqrt(patch_embeds.shape[0])), int(np.sqrt(patch_embeds.shape[0])))
552
+
553
+ # 1) Load a 24x24 heatmap
554
+ file_path="./expt/seg_mask_step0_v1.npy"
555
+ heatmap = np.load(file_path)
556
+ heatmap = np.clip(heatmap,0,100)
557
+
558
+ # Normalize heatmap
559
+ num_targets = 19
560
+ scale = num_targets / (heatmap.sum())
561
+ heatmap *= scale
562
+
563
+ # Instantiate our clusterer
564
+ clusterer = CombinedSilhouetteInertiaClusterer(
565
+ k_min=1,
566
+ k_max=8,
567
+ k_avg_max=4,
568
+ silhouette_threshold=0.15,
569
+ relative_threshold=0.15,
570
+ random_state=0,
571
+ min_patch_size=5, # smoothing parameter
572
+ n_smooth_iter=2, # smoothing parameter
573
+ ignore_label=-1,
+ plot=True # enable the cluster/heatmap figure inside compute_region_statistics
574
+ )
575
+
576
+ # Fit & predict (returns the 2D cluster label map; plotting is controlled by the constructor's `plot` flag)
577
+ smoothed_labels_2d = clusterer.fit_predict(
578
+ patch_embeds=patch_embeds,
579
+ map_shape=map_shape,
580
+ )
581
+
582
+ # 2) Simulate a random walk
583
+ max_r, max_c=heatmap.shape[0]-1, heatmap.shape[1]-1
584
+ steps=100
585
+ current_pos=(0,0)
586
+ moves_8=[(-1,-1),(-1,0),(-1,1),
587
+ ( 0,-1), ( 0,1),
588
+ ( 1,-1),( 1,0), ( 1,1)]
589
+ path_coords=[current_pos]
590
+ # num_found_list=[0]
591
+ for _ in range(steps):
592
+ r,c=current_pos
593
+ dr,dc=random.choice(moves_8)
594
+ nr, nc=r+dr,c+dc
595
+ if 0<=nr<=max_r and 0<=nc<=max_c:
596
+ current_pos=(nr,nc)
597
+ path_coords.append(current_pos)
598
+ # 10% chance for new target
599
+ # num_found_list.append(random.random()<0.1)
600
+ # num_found_list.append(0)
601
+ # num_found_list[0]=1
602
+ # num_found_list[1]=2
603
+ # num_found_list[2]=3
604
+
605
+ # Flatten
606
+ width=heatmap.shape[1]
607
+ flattened_visits=[]
608
+ for (rr,cc) in path_coords:
609
+ idx=rr*width+cc
610
+ flattened_visits.append(idx)
611
+
612
+
613
+ # Input heatmap (for region statistics)
614
+ region_dict = clusterer.compute_region_statistics(smoothed_labels_2d, heatmap, flattened_visits)
615
+
616
+ # # save the smoother labels as .npy
617
+ # np.save('smoothed_labels_2d.npy', smoothed_labels_2d)
618
+
619
+ # print("Chosen k:", clusterer.final_k)
620
+ # print("Smoothed labels shape:", smoothed_labels_2d.shape)
Taxabind/Taxabind/SatBind/model.py ADDED
@@ -0,0 +1,226 @@
1
+ import os
2
+ import sys
3
+ # Ensure the correct directory is at the front of sys.path
4
+ sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
5
+
6
+ import open_clip
7
+ import pytorch_lightning as pl
8
+ import torch
9
+ import torch.nn as nn
10
+ import numpy as np
11
+ from torch.utils.data import DataLoader
12
+ from transformers import CLIPVisionModelWithProjection
13
+ from dataset import SatNatDataset
14
+
15
+ from pytorch_lightning.loggers import WandbLogger
16
+ from pytorch_lightning.callbacks import ModelCheckpoint
17
+ from config import config
18
+
19
+ ######################################
20
+ class UpdateDatasetEpochCallback(pl.Callback):
21
+ def on_train_epoch_start(self, trainer, pl_module):
22
+ """Called at the start of each training epoch."""
23
+ print("!!!trainer.current_epoch: ", trainer.current_epoch)
24
+ current_epoch = trainer.current_epoch
25
+
26
+ # Access the dataset and set the epoch
27
+ train_loader = trainer.train_dataloader
28
+ # trainer.train_dataloader can be a list in some configs, handle that if needed
29
+ dataset = train_loader.dataset
30
+
31
+ # Now set the epoch:
32
+ if hasattr(dataset, 'set_epoch'):
33
+ dataset.set_epoch(current_epoch)
34
+ ######################################
35
+
36
+
37
+ def create_pairwise_mask(labels):
38
+ labels = labels.reshape(-1)
39
+ num_samples = len(labels)
40
+ pairwise_mask = torch.zeros(num_samples, num_samples).to(labels.device)
41
+
42
+ for i in range(num_samples):
43
+ pairwise_mask[i, :] = (labels == labels[i])
44
+
45
+ return pairwise_mask
46
+
47
+ def clip_loss(similarity: torch.Tensor, label) -> torch.Tensor:
48
+ overhead_img_loss = contrastive_loss(similarity, label)
49
+ ground_img_loss = contrastive_loss(similarity.t(), label.t())
50
+ return 0.5*torch.mean(torch.sum(overhead_img_loss, dim=-1)) + 0.5*torch.mean(torch.sum(ground_img_loss, dim=-1))
51
+
52
+ def contrastive_loss(logits: torch.Tensor, label) -> torch.Tensor:
53
+ gt = create_pairwise_mask(label)
54
+ return -gt*torch.log(logits.softmax(-1)+1e-6)
55
+
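+ # Minimal sketch of the supervised-contrastive target (hypothetical labels):
+ # with label = torch.tensor([0, 0, 1]), create_pairwise_mask returns
+ #   [[1, 1, 0],
+ #    [1, 1, 0],
+ #    [0, 0, 1]]
+ # so every ground/overhead pair sharing a category id counts as a positive,
+ # and clip_loss averages the masked cross-entropy in both directions.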
56
+ class SatBind(pl.LightningModule):
57
+ def __init__(self, train_dataset, val_dataset, **kwargs):
58
+ super().__init__()
59
+ self.train_dataset = train_dataset
60
+ self.val_dataset = val_dataset
61
+
62
+ #initialize bio CLIP with frozen weights
63
+ self.bio_model, *_ = open_clip.create_model_and_transforms('hf-hub:imageomics/bioclip')
64
+ if config.locked_tuning:
65
+ for param in self.bio_model.parameters():
66
+ param.requires_grad = False
67
+
68
+ #initialize CLIP with trainable weights
69
+ self.imo_encoder = CLIPVisionModelWithProjection.from_pretrained(config.sat_encoder).train()
70
+ for layer in self.imo_encoder.children():
71
+ if hasattr(layer, 'reset_parameters'):
72
+ layer.reset_parameters()
73
+
74
+ self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))
75
+ self.batch_size = kwargs.get('batch_size', config.batch_size)
76
+ self.lr = kwargs.get('lr', config.lr)
77
+
78
+ # # ADDED
79
+ clip_cfg = self.imo_encoder.config
80
+ self.visual_projection_custom = nn.Linear(clip_cfg.hidden_size, 512, bias=False) # clip_cfg.projection_dim)
81
+ # print("clip_cfg.hidden_size: ", clip_cfg.hidden_size)
82
+ # print("clip_cfg.projection_dim: ", clip_cfg.projection_dim)
83
+
84
+ # # Since CLIP's vision projection is not used (prevent pytorch lightning issues)
85
+ # for param in self.imo_encoder.visual_projection.parameters():
86
+ # param.requires_grad = False
87
+ # for param in self.imo_encoder.transformer.parameters(): # If the transformer is unused
88
+ # param.requires_grad = False
89
+
90
+ def forward(self, batch):
91
+ img, imo, label, patch_idx, *_ = batch
92
+ batch_size = img.shape[0]
93
+
94
+ #compute bioclip embeddings
95
+ img_embeds, *_ = self.bio_model(img) # (batch_size, proj_dim)
96
+
97
+ # ## ORIGINAL: compute overhead embeddings
98
+ # imo_embeds = self.imo_encoder(imo).image_embeds # (batch, proj_dim)
99
+ # print("[ORIG] imo_embeds.shape: ", imo_embeds.shape)
100
+ # print("[ORIG] torch.min(imo_embeds): ", torch.min(imo_embeds))
101
+ # print("[ORIG] torch.max(imo_embeds): ", torch.max(imo_embeds))
102
+
103
+ # NEW:
104
+ imo_embeds = self.imo_encoder(imo).last_hidden_state # (batch, Patches, hidden_dim)
105
+ imo_embeds = imo_embeds[torch.arange(batch_size), patch_idx] # (batch, hidden_dim)
106
+ # imo_embeds = self.imo_encoder.vision_model.post_layernorm(imo_embeds) # (batch, hidden_dim)
107
+ # imo_embeds = self.imo_encoder.visual_projection(imo_embeds) # (batch_size, proj_dim)
108
+ imo_embeds = self.visual_projection_custom(imo_embeds) # (batch_size, proj_dim)
109
+
110
+ return img_embeds, imo_embeds, label
111
+
112
+
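+ # Shape sketch for the patch-level selection above (sizes depend on
+ # config.sat_encoder; the values below are assumptions for a standard CLIP ViT):
+ #   last_hidden_state: (B, 1 + P, hidden)  # [CLS] token followed by P patches
+ #   patch_idx:         (B,)                # already offset by +1 for [CLS]
+ #   gathered:          (B, hidden)         # one patch embedding per sample
+ #   projected:         (B, 512)            # via visual_projection_custom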
113
+ def shared_step(self, batch, return_sim_matrix=False):
114
+
115
+ img_embeds, imo_embeds, label, *_ = self(batch)
116
+ # normalize embeddings (img_embeds is already L2-normalized by BioCLIP, so it is left unchanged)
119
+ imo_embeds = torch.nn.functional.normalize(imo_embeds, dim=-1)
120
+ # print("[NORM] torch.min(imo_embeds): ", torch.min(imo_embeds))
121
+ # print("[NORM] torch.max(imo_embeds): ", torch.max(imo_embeds))
122
+
123
+ # exponentiate the log of the temperature
124
+ logit_scale = self.logit_scale.exp()
125
+
126
+ #compute similarity
127
+ img_to_imo_sim = img_embeds @ imo_embeds.t() * logit_scale
128
+
129
+ if return_sim_matrix:
130
+ img_to_imo_sim_copy = img_to_imo_sim.clone().detach()
131
+
132
+ loss = clip_loss(img_to_imo_sim, label)
133
+
134
+ if return_sim_matrix:
135
+ return loss, img_to_imo_sim_copy
136
+ else:
137
+ return loss
138
+
139
+
140
+ def training_step(self, batch, batch_idx):
141
+ loss = self.shared_step(batch)
142
+ self.log('train_loss', loss, sync_dist=True, prog_bar=True, on_epoch=True, batch_size=self.batch_size)
143
+ self.log('temperature', self.logit_scale.data, prog_bar=True, on_epoch=True, batch_size=self.batch_size)
144
+ return loss
145
+
146
+ def validation_step(self, batch, batch_idx):
147
+ loss = self.shared_step(batch)
148
+ self.log('val_loss', loss, sync_dist=True, prog_bar=True, on_epoch=True, batch_size=self.batch_size)
149
+ return loss
150
+
151
+ def train_dataloader(self):
152
+ return DataLoader(self.train_dataset,
153
+ batch_size=self.batch_size,
154
+ num_workers=config.num_workers,
155
+ shuffle=True, # True
156
+ persistent_workers=False)
157
+
158
+ def val_dataloader(self):
159
+ return DataLoader(self.val_dataset,
160
+ batch_size=self.batch_size,
161
+ num_workers=config.num_workers,
162
+ shuffle=False,
163
+ persistent_workers=False)
164
+
165
+ def configure_optimizers(self):
166
+ params = self.parameters()
167
+ self.optim = torch.optim.AdamW(params,
168
+ lr=self.lr,
169
+ betas=(0.9,0.98),
170
+ eps=1e-6
171
+ )
172
+ self.scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
173
+ optimizer=self.optim,
174
+ T_0=20,
175
+ eta_min=1e-6
176
+ )
177
+ return [self.optim], [self.scheduler]
178
+
179
+
180
+ if __name__ == '__main__':
181
+ img_dir = config.img_dir
182
+ imo_dir = config.imo_dir
183
+ imo_dir_val = config.imo_dir_val
184
+ train_json_path = config.train_json_path
185
+ val_json_path = config.val_json_path
186
+ # filtered_train_json_path = config.filtered_train_json_path
187
+ # filtered_val_json_path = config.filtered_val_json_path
188
+ sat_to_img_ids_train_json_path = config.sat_to_img_ids_train_json_path
189
+ sat_to_img_ids_val_json_path = config.sat_to_img_ids_val_json_path
190
+ patch_size = config.patch_size
191
+
192
+ #define dataset
193
+ train_dataset = SatNatDataset(img_dir, imo_dir, train_json_path, sat_to_img_ids_train_json_path, patch_size)
194
+ val_dataset = SatNatDataset(img_dir, imo_dir_val, val_json_path, sat_to_img_ids_val_json_path, patch_size, mode='val')
195
+
196
+ #define model
197
+ model = SatBind(train_dataset=train_dataset, val_dataset=val_dataset)
198
+ torch.cuda.empty_cache()
199
+
200
+ checkpoint = ModelCheckpoint(
201
+ monitor='val_loss',
202
+ dirpath=config.save_dir,
203
+ filename=config.filename,
204
+ mode='min',
205
+ save_top_k=1, # save only the best checkpoint (according to val_loss)
206
+ save_last=True # also save the last checkpoint every epoch
207
+ # last_filename=config.filename + "-LAST"
208
+ )
209
+ checkpoint.CHECKPOINT_NAME_LAST = config.filename + "-LAST" # Rename last checkpoint name
210
+
211
+ trainer = pl.Trainer(
212
+ accelerator='gpu',
213
+ strategy='ddp_find_unused_parameters_true', # ddp (orig), ddp_find_unused_parameters_true (suppress pl issues with unused trainable params)
214
+ devices=config.devices,
215
+ max_epochs=config.max_epochs,
216
+ num_nodes=1,
217
+ callbacks=[checkpoint, UpdateDatasetEpochCallback()],
218
+ accumulate_grad_batches=config.accumulate_grad_batches,
219
+ log_every_n_steps=1,
220
+ val_check_interval=config.val_check_interval,
221
+ )
222
+
223
+ if config.resume_from_checkpoint:
224
+ trainer.fit(model, ckpt_path=f"{config.save_dir}/{config.resume_checkpoint_name}.ckpt")
225
+ else:
226
+ trainer.fit(model)
Taxabind/Taxabind/SatBind/watershed_segmentation.py ADDED
@@ -0,0 +1,1094 @@
1
+ import os
2
+ import numpy as np
3
+ import matplotlib.pyplot as plt
4
+ import cv2
5
+ import random
6
+ import math
8
+ from collections import Counter
9
+ from scipy import ndimage
10
+
11
+ # import matplotlib
12
+ # matplotlib.use("Agg") # <-- key line to avoid tkinter dependency
13
+
14
+ class WatershedBinomial:
15
+ def __init__(
16
+ self,
17
+ N=5.0, # total expected # of targets across entire map
18
+ blur_kernel=(5, 5),
19
+ sure_fg_factor=0.7,
20
+ dilation_kernel_size=(3, 3),
21
+ dilation_iters=3,
22
+ region_size_thresh=10,
23
+ plot_img=False,
24
+ gifs_dir="./gifs",
25
+ colormap=plt.cm.tab10,
26
+ title="Watershed + Binomial Steps"
27
+ ):
28
+ """
29
+ A binomial viewpoint:
30
+ 1) We run watershed on a 24x24 heatmap.
31
+ 2) We scale each region's probability p_i so sum of expected targets across
32
+ all regions = N. Then expected # in region i is E_i = size_i * p_i.
33
+ 3) We track how many targets each region has found so far (#found).
34
+ 4) The average steps to find the next target is size_i/E_i = 1/p_i.
35
+ 5) We store these as class variables so they can be accessed later:
36
+ - region_prob[lab]
37
+ - region_visited_str[lab]
38
+ - region_expected[lab]
39
+ - region_found[lab]
40
+ - region_steps_next[lab]
41
+ 6) The final legend includes all this info for each region.
42
+ """
43
+
44
+ self.N = N
45
+ self.blur_kernel = blur_kernel
46
+ self.sure_fg_factor = sure_fg_factor
47
+ self.dilation_kernel_size = dilation_kernel_size
48
+ self.dilation_iters = dilation_iters
49
+ self.region_size_thresh = region_size_thresh
50
+ self.plot_img = plot_img
51
+ self.gifs_dir = gifs_dir
52
+ self.colormap = colormap
53
+ self.title = title
54
+
55
+ self.raw_heatmap = None
56
+ self.flattened_visits = None
57
+ self.num_found_list = None
58
+
59
+ # --------------------------
60
+ # Intermediate images
61
+ # --------------------------
62
+ self.input_heatmap_24x24 = None
63
+ self.padded_heatmap_26x26 = None
64
+ self.blurred_heatmap_26x26 = None
65
+ self.binary_mask_26x26 = None
66
+ self.dist_transform_26x26 = None
67
+ self.sure_fg_mask_26x26 = None
68
+ self.watershed_markers_26x26 = None
69
+ self.color_watershed_raw_26x26 = None
70
+
71
+ # --------------------------
72
+ # Final label map + overlays
73
+ # --------------------------
74
+ self.final_markers_24x24 = None
75
+ self.final_output_no_vis_24x24 = None
76
+ self.final_output_with_vis_24x24 = None
77
+
78
+ # --------------------------
79
+ # Class variables of interest
80
+ # --------------------------
81
+ # region_prob[lab] -> scaled probability p_i
82
+ # region_visited_str[lab]-> visited fraction as "visited_count/size_i"
83
+ # region_expected[lab] -> E_i = size_i * p_i
84
+ # region_found[lab] -> number of targets found so far
85
+ # region_steps_next[lab] -> size_i/E_i (1/p_i)
86
+ self.region_prob = {}
87
+ self.visited_num = {}
88
+ self.region_visited_str = {}
89
+ self.region_centroids = {}
90
+ self.region_expected = {}
91
+ self.region_found = {}
92
+ self.region_steps_next = {}
93
+
94
+ # --------------------------
95
+ # For MERGED segmentation with k-means boundaries
96
+ # We'll store them in separate variables
97
+ # --------------------------
98
+ self.merged_markers_24x24 = None
99
+ self.merged_output_no_vis = None
100
+ self.merged_output_with_vis = None
101
+
102
+ self.merged_region_prob = {}
103
+ self.merged_visited_num = {}
104
+ self.merged_region_visited_str = {}
105
+ self.merged_region_expected = {}
106
+ self.merged_region_found = {}
107
+ self.merged_region_steps_next = {}
108
+ self.merged_region_centroids = {}
109
+
110
+ # OTHERS
111
+ self.segmentor_frame_files = []
112
+ self.kmeans_merged_frame_files = []
113
+
114
+ # Persistent label-color mapping (Preassign colors for labels)
115
+ self.label_to_color = {}
116
+ for i in range(1, 101):
117
+ color = np.array(self.colormap(i % 10)[:3]) * 255
118
+ self.label_to_color[i] = color.astype(np.uint8)
119
+
120
+ # --------------------------
121
+ # Public Entry
122
+ # --------------------------
123
+
124
+ def run(self, raw_heatmap, flattened_visits, num_found_list, step=0):
125
+ """
126
+ 1) Watershed pipeline
127
+ 2) Scale probabilities => sum_i #expected = N
128
+ 3) Build final overlays & store region stats
129
+ 4) Plot subplots
130
+ """
131
+ self.raw_heatmap = raw_heatmap
132
+ self.flattened_visits = flattened_visits
133
+ self.num_found_list = num_found_list
134
+ self.input_heatmap_24x24 = self.raw_heatmap.copy()
135
+
136
+ # Step A) Watershed + merges
137
+ self._watershed_on_padded()
138
+ self.final_markers_24x24 = self._postprocess_cropped_markers()
139
+
140
+ # Step B) Build final binomial-based stats
141
+ self._build_final_outputs()
142
+
143
+ # Step C) Plot
144
+ if self.plot_img:
145
+ self._plot_all_steps(step)
146
+ return self.final_markers_24x24
147
+
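+ # Hedged usage sketch (shapes and values are illustrative only):
+ #   heatmap = np.random.rand(24, 24)           # per-patch probability map
+ #   visits = [0, 1, 25]                        # flattened row-major patch indices
+ #   found = [0, 0, 1]                          # targets found at each visit
+ #   ws = WatershedBinomial(N=5.0, plot_img=False)
+ #   markers = ws.run(heatmap, visits, found)   # (24, 24) region label map
+ #   ws.region_expected, ws.region_steps_next   # per-region binomial statistics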
148
+ # --------------------------
149
+ # Watershed pipeline
150
+ # --------------------------
151
+
152
+ def _pad_image(self, image, pad_size=1):
153
+ return np.pad(image, pad_size, mode='edge')
154
+
155
+ def _crop_image(self, image, pad_size=1):
156
+ return image[pad_size:-pad_size, pad_size:-pad_size]
157
+
158
+ def _watershed_on_padded(self):
159
+ padded = self._pad_image(self.raw_heatmap, pad_size=1)
160
+ self.padded_heatmap_26x26 = padded
161
+
162
+ # Normalize
163
+ norm_padded = (
164
+ (padded - padded.min()) / (padded.max() - padded.min() + 1e-9)
165
+ * 255
166
+ ).astype(np.uint8)
167
+
168
+ # (2) Blur
169
+ if self.blur_kernel != (0,0):
170
+ self.blurred_heatmap_26x26 = cv2.GaussianBlur(norm_padded, self.blur_kernel, 0)
171
+ else:
172
+ self.blurred_heatmap_26x26 = norm_padded.copy()
173
+
174
+ # (3) Binary
175
+ _, bin_mask = cv2.threshold(
176
+ self.blurred_heatmap_26x26,
177
+ 0, 255,
178
+ cv2.THRESH_BINARY + cv2.THRESH_OTSU
179
+ )
180
+ self.binary_mask_26x26 = bin_mask
181
+
182
+ # (4) Distance transform
183
+ dist_transform = cv2.distanceTransform(bin_mask, cv2.DIST_L2, 5)
184
+ self.dist_transform_26x26 = dist_transform
185
+
186
+ # (5) Sure FG
187
+ _, sure_fg = cv2.threshold(
188
+ dist_transform,
189
+ self.sure_fg_factor * dist_transform.max(),
190
+ 255, 0
191
+ )
192
+ sure_fg = np.uint8(sure_fg)
193
+ self.sure_fg_mask_26x26 = sure_fg
194
+
195
+ # sure BG
196
+ kernel = np.ones(self.dilation_kernel_size, np.uint8)
197
+ sure_bg = cv2.dilate(bin_mask, kernel, iterations=self.dilation_iters)
198
+ unknown = cv2.subtract(sure_bg, sure_fg)
199
+
200
+ # Markers
201
+ _, markers = cv2.connectedComponents(sure_fg)
202
+ markers += 1
203
+ markers[unknown==255] = 0
204
+
205
+ # (6) Watershed
206
+ color_img = cv2.applyColorMap(self.blurred_heatmap_26x26, cv2.COLORMAP_VIRIDIS)
207
+ markers = cv2.watershed(color_img, markers)
208
+ self.watershed_markers_26x26 = markers
209
+
210
+ self.color_watershed_raw_26x26 = self._color_label_map(markers)
211
+
212
+ def _postprocess_cropped_markers(self):
213
+ cropped = self._crop_image(self.watershed_markers_26x26, 1)
214
+ cropped = self._fix_boundary_labels(cropped)
215
+ cropped = self._separate_subregions(cropped)
216
+ cropped = self._merge_small_regions(cropped, self.region_size_thresh)
217
+ return cropped
218
+
219
+ def _fix_boundary_labels(self, markers):
220
+ b_idx = np.argwhere(markers==-1)
221
+ neighbors_8 = [
222
+ (-1,-1), (-1,0), (-1,1),
223
+ ( 0,-1), ( 0,1),
224
+ ( 1,-1), ( 1,0), ( 1,1)
225
+ ]
226
+ for (r,c) in b_idx:
227
+ neigh_labs=[]
228
+ for dr,dc in neighbors_8:
229
+ rr,cc=r+dr,c+dc
230
+ if 0<=rr<markers.shape[0] and 0<=cc<markers.shape[1]:
231
+ lb=markers[rr,cc]
232
+ if lb>0:
233
+ neigh_labs.append(lb)
234
+ if neigh_labs:
235
+ markers[r,c]=np.bincount(neigh_labs).argmax()
236
+ else:
237
+ markers[r,c]=0
238
+ return markers
239
+
240
+ def _separate_subregions(self, markers):
241
+ refined = np.zeros_like(markers, dtype=np.int32)
242
+ refined[markers==-1] = -1
243
+ refined[markers==0] = 0
244
+
245
+ next_label=1
246
+ labs = np.unique(markers)
247
+ for lb in labs:
248
+ if lb<=0:
249
+ continue
250
+ region_mask=(markers==lb).astype(np.uint8)
251
+ n_sub, sub_markers = cv2.connectedComponents(region_mask)
252
+ for sub_id in range(1,n_sub):
253
+ subregion_mask=(sub_markers==sub_id)
254
+ refined[subregion_mask]=next_label
255
+ next_label+=1
256
+ return refined
257
+
258
+
259
+ def _merge_small_regions(self, markers, min_patch_size, n_iter=1, ignore_label=-99):
260
+ """
261
+ Combined approach using:
262
+ (A) Pixel-wise smoothing of small patches (iterative),
263
+ but now using 8-connected neighbors.
264
+ (B) After that, remove any connected component whose
265
+ size < self.region_size_thresh by merging it into
266
+ the majority label among its 8-connected boundary.
267
+
268
+ 1) We do not change the pixel-wise smoothing logic, except
269
+ that we consider 8 neighbors instead of 4.
270
+ 2) Then we do a region-based check for all subregions whose
271
+ size < region_size_thresh. Each such region is assigned
272
+ to the 8-neighbor majority among that region's boundary
273
+ pixels. If a tie occurs, we pick the largest label ID
274
+ or keep the original? (You can define your tie logic
275
+ as you wish, shown here picking the top or random.)
276
+ """
277
+
278
+ H, W = markers.shape
279
+
280
+ # -----------------------------------------------------------
281
+ # (A) Pixel-wise smoothing with 8-connected neighbors
282
+ # for small sub-patches of each label
283
+ # -----------------------------------------------------------
284
+
285
+ markers = ndimage.median_filter(markers, size=3)
286
+
287
+ # -----------------------------------------------------------
288
+ # (B) Now remove entire subregions < min_patch_size by
289
+ # reassigning them to the 8-neighbor majority of boundary
290
+ # -----------------------------------------------------------
291
+
292
+ # smoothed = markers.copy()
293
+ structure_8 = np.array([[1,1,1],
294
+ [1,1,1],
295
+ [1,1,1]], dtype=np.uint8)
296
+
297
+ # identify connected components of each label
298
+ # step: we'll do a multi-pass until no small region is left
299
+ changed = True
300
+ while changed:
301
+ changed = False
302
+ label_map = markers
303
+ labels = np.unique(label_map[label_map>0])
304
+ for label in labels:
305
+ # If label patches is > threshold
306
+ if label < 0 or np.sum(label_map == label) >= min_patch_size:
307
+ continue
308
+
309
+ # find connected components of 'label'
310
+ mask = (label_map == label)
311
+ labeled_cc, num_cc = ndimage.label(mask, structure=structure_8)
312
+ if num_cc < 1:
313
+ continue
314
+
315
+ for cc_id in range(1, num_cc+1):
316
+ cc_mask = (labeled_cc == cc_id)
317
+ # size_cc = np.sum(cc_mask)
318
+ # if 0<size_cc<min_patch_size:
319
+ # compute the 8-neighbor majority from boundary
320
+ boundary_pixels = []
321
+ coords = np.argwhere(cc_mask)
322
+ for (r,c) in coords:
323
+ # check if (r,c) is on boundary => or we can check neighbors
324
+ for dr in (-1,0,1):
325
+ for dc in (-1,0,1):
326
+ if dr==0 and dc==0:
327
+ continue
328
+ rr,cc2 = r+dr, c+dc
329
+ if 0<=rr<H and 0<=cc2<W:
330
+ # If neighbor label != label, it's a boundary
331
+ neighbor_label = label_map[rr, cc2]
332
+ if neighbor_label != label and neighbor_label > 0:
333
+ boundary_pixels.append(neighbor_label)
334
+
335
+ if boundary_pixels:
336
+ counter = Counter(boundary_pixels)
337
+ # remove any '0' or negative labels
338
+ # for k in list(counter.keys()):
339
+ # if k<=0:
340
+ # del counter[k]
341
+ if len(counter)>0:
342
+ # pick the label with the largest count
343
+ (new_lab, _cnt) = counter.most_common(1)[0]
344
+ label_map[cc_mask] = new_lab
345
+ changed = True
346
+ else:
347
+ # no valid neighbor => keep the label, or reassign to 0
348
+ label_map[cc_mask] = 0
349
+ changed = True
350
+
351
+ return label_map
352
+
353
+ # def largest_component_mask(self, label_map, label):
354
+ # """Find largest connected component for 'label' using 8-neighbor structure."""
355
+ # mask = (label_map == label)
356
+ # structure_8 = np.array([[1,1,1],
357
+ # [1,1,1],
358
+ # [1,1,1]], dtype=np.uint8)
359
+ # labeled, num = ndimage.label(mask, structure=structure_8)
360
+ # if num < 1:
361
+ # return np.zeros_like(mask, dtype=bool)
362
+ # # measure region sizes
363
+ # sizes = ndimage.sum(mask, labeled, range(1,num+1))
364
+ # largest_id = (np.argmax(sizes) + 1)
365
+ # return (labeled == largest_id)
366
+
367
+ # NOT IN USE
368
+ def _merge_small_regions_smoothing(self, markers):
369
+ label_sizes={}
370
+ labs=np.unique(markers)
371
+ for lb in labs:
372
+ if lb>0:
373
+ label_sizes[lb]=np.sum(markers==lb)
374
+ sorted_labels=sorted(label_sizes, key=lambda x: label_sizes[x])
375
+
376
+ neighbors_8=[
377
+ (-1,-1), (-1,0), (-1,1),
378
+ ( 0,-1), ( 0,1),
379
+ ( 1,-1), ( 1,0), ( 1,1)
380
+ ]
381
+ for lb in sorted_labels:
382
+ size=label_sizes[lb]
383
+ if 0<size<self.region_size_thresh:
384
+ coords=np.argwhere(markers==lb)
385
+ nbr_set=set()
386
+ for (r,c) in coords:
387
+ for dr,dc in neighbors_8:
388
+ rr,cc=r+dr,c+dc
389
+ if 0<=rr<markers.shape[0] and 0<=cc<markers.shape[1]:
390
+ nl=markers[rr,cc]
391
+ if nl>0 and nl!=lb:
392
+ nbr_set.add(nl)
393
+ if not nbr_set:
394
+ continue
395
+ max_lb=None
396
+ max_sz=-1
397
+ for nlb in nbr_set:
398
+ s_nlb=label_sizes.get(nlb,0)
399
+ if s_nlb>max_sz:
400
+ max_sz=s_nlb
401
+ max_lb=nlb
402
+ markers[markers==lb]=max_lb
403
+ label_sizes[max_lb]+=size
404
+ label_sizes[lb]=0
405
+ return markers
406
+
407
+ # --------------------------
408
+ # Build binomial-based stats
409
+ # --------------------------
410
+
411
+ def _build_final_outputs(self):
412
+ """
413
+ 1) color-labeled final overlay
414
+ 2) visited overlay
415
+ 3) scale region probabilities => sum of E_i = N
416
+ 4) store visited fraction, expected #targets, #found, steps next
417
+ """
418
+ markers=self.final_markers_24x24
419
+ valid_labels=np.unique(markers[markers>0])
420
+
421
+ # color-labeled overlay
422
+ color_map=self._color_label_map(markers)
423
+ self.final_output_no_vis_24x24=self._overlay_heatmap_and_labels(self.raw_heatmap, color_map)
424
+
425
+ # visited
426
+ visited_mask=np.zeros(markers.shape,dtype=bool)
427
+ h,w=markers.shape
428
+ for idx in self.flattened_visits:
429
+ rr=idx//w
430
+ cc=idx%w
431
+ if 0<=rr<h and 0<=cc<w:
432
+ visited_mask[rr,cc]=True
433
+
434
+ with_vis=self.final_output_no_vis_24x24.copy()
435
+ red_overlay=np.zeros_like(with_vis)
436
+ alpha=0.3
437
+ for idx in self.flattened_visits:
438
+ rr=idx//w
439
+ cc=idx%w
440
+ if 0<=rr<h and 0<=cc<w:
441
+ red_overlay[rr,cc]=[255,0,0]
442
+ self.final_output_with_vis_24x24=cv2.addWeighted(with_vis,1.0,red_overlay,alpha,0.0)
443
+
444
+ # gather region sums
445
+ label_sizes={}
446
+ label_sums={}
447
+ total_mass=0.0
448
+ for lb in valid_labels:
449
+ mask=(markers==lb)
450
+ size_i=np.sum(mask)
451
+ label_sizes[lb]=size_i
452
+ s=np.sum(self.raw_heatmap[mask])
453
+ label_sums[lb]=s
454
+ total_mass+=s
455
+
456
+ # scale factor
457
+ scale=self.N/(total_mass+1e-9)
458
+
459
+ # how many visited patches in each region
460
+ region_visited={}
461
+ for lb in valid_labels:
462
+ region_visited[lb]=np.sum((markers==lb) & visited_mask)
463
+
464
+ # how many targets found in each region
465
+ region_found_count={lb:0 for lb in valid_labels}
466
+ for idx, num_found in zip(self.flattened_visits,self.num_found_list):
467
+ rr=idx//w
468
+ cc=idx%w
469
+ lb=markers[rr,cc]
470
+ if lb>0 and num_found>0:
471
+ region_found_count[lb]+=num_found
472
+
473
+ # store stats
474
+ for lb in valid_labels:
475
+ size_i=label_sizes[lb]
476
+ raw_sum=label_sums[lb]
477
+ if size_i>0:
478
+ p_i=scale*(raw_sum/size_i)
479
+ else:
480
+ p_i=0.0
481
+ self.region_prob[lb]=p_i
482
+
483
+ visited_num=region_visited[lb]
484
+ # visited fraction as a string, e.g. "23/100"
485
+ self.visited_num[lb]=visited_num
486
+ self.region_visited_str[lb]=f"{visited_num}/{size_i}"
487
+
488
+ E_i=size_i*p_i
489
+ self.region_expected[lb]=E_i
490
+
491
+ fcount=region_found_count[lb]
492
+ self.region_found[lb]=fcount
493
+
494
+ self.region_centroids[lb] = self._find_centroid_in_region(markers, lb)
495
+
496
+ # steps for the (found+1)-th target => size_i/E_i = 1/p_i
497
+ if E_i>0:
498
+ next_steps=(fcount+1)*size_i/E_i
499
+ else:
500
+ next_steps=float('inf')
501
+ self.region_steps_next[lb]=next_steps
502
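A small worked example of the statistics above, with made-up numbers (two regions, N = 10 expected targets over the whole map):

    N = 10.0
    size = {1: 100, 2: 50}             # patches per region
    raw_sum = {1: 300.0, 2: 100.0}     # heatmap mass per region
    scale = N / sum(raw_sum.values())  # 10 / 400 = 0.025

    p = {lb: scale * raw_sum[lb] / size[lb] for lb in size}   # {1: 0.075, 2: 0.05}
    E = {lb: size[lb] * p[lb] for lb in size}                 # {1: 7.5, 2: 2.5}, sums to N
    found = {1: 2, 2: 0}
    steps_next = {lb: (found[lb] + 1) * size[lb] / E[lb] for lb in size}
    # {1: 40.0, 2: 20.0}: expected number of patches to visit for the (found+1)-th target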
+
503
+
504
+ def _find_centroid_in_region(self, markers, label_id):
505
+ coords = np.argwhere(markers == label_id)
506
+ if coords.size == 0:
507
+ return None
508
+ r_mean = np.mean(coords[:, 0])
509
+ c_mean = np.mean(coords[:, 1])
510
+ dists = (coords[:, 0] - r_mean)**2 + (coords[:, 1] - c_mean)**2
511
+ idx = np.argmin(dists)
512
+ return (coords[idx][0], coords[idx][1])
513
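A hedged toy example of why the centroid is snapped to an in-region pixel: for a ring-shaped region the arithmetic mean lands on the hole, which is not part of the region, so the nearest in-region pixel is returned instead.

    import numpy as np

    coords = np.array([[0, 0], [0, 1], [0, 2],
                       [1, 0],         [1, 2],
                       [2, 0], [2, 1], [2, 2]])         # ring, hole at (1, 1)
    r_mean, c_mean = coords.mean(axis=0)                # (1.0, 1.0): not a region pixel
    dists = (coords[:, 0] - r_mean) ** 2 + (coords[:, 1] - c_mean) ** 2
    print(coords[np.argmin(dists)].tolist())            # [0, 1], a pixel inside the region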
+
514
+
515
+ # --------------------------
516
+ # Visualization
517
+ # --------------------------
518
+
519
+ def _color_label_map(self, markers):
520
+ out_img=np.zeros((*markers.shape,3),dtype=np.uint8)
521
+ labs=np.unique(markers[markers>0])
522
+ # col_array=self.colormap(np.linspace(0,1,len(labs)))
523
+
524
+ for r in range(markers.shape[0]):
525
+ for c in range(markers.shape[1]):
526
+ lb = markers[r, c]
527
+ if lb in self.label_to_color:
528
+ out_img[r, c] = self.label_to_color[lb]
529
+
530
+ # OLD: Color keeps jumping
531
+ # lab2col={}
532
+ # for i,lb in enumerate(labs):
533
+ # lab2col[lb]=(col_array[i][:3]*255).astype(np.uint8)
534
+
535
+ # for r in range(markers.shape[0]):
536
+ # for c in range(markers.shape[1]):
537
+ # lb=markers[r,c]
538
+ # if lb in lab2col:
539
+ # out_img[r,c]=lab2col[lb]
540
+ return out_img
541
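As an aside, the per-pixel loop above can also be written as a single lookup-table indexing (a sketch, assuming labels are small non-negative integers and that `label_to_color` maps label to an RGB triple as in the class):

    import numpy as np

    def color_label_map_vectorized(markers, label_to_color):
        # Build a (max_label + 1, 3) color table, then index it with the marker array.
        lut = np.zeros((markers.max() + 1, 3), dtype=np.uint8)
        for lb, col in label_to_color.items():
            if 0 <= lb <= markers.max():
                lut[lb] = col
        return lut[markers]   # (H, W, 3) image; label 0 stays black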
+
542
+ def _overlay_heatmap_and_labels(self, heatmap_24x24, color_map):
543
+ heatmap_rgb=plt.cm.gray((heatmap_24x24/100.0))[:,:,:3]
544
+ heatmap_rgb=(heatmap_rgb*255).astype(np.uint8)
545
+ alpha=0.5
546
+ blended=cv2.addWeighted(heatmap_rgb,1.0,color_map,alpha,0.0)
547
+ return blended
548
+
549
+ def _plot_all_steps(self, step):
550
+ fig, axs = plt.subplots(2,4, figsize=(20,10))
551
+ fig.suptitle(self.title, fontsize=16)
552
+ fig.text(0.5, 0.05, self._get_subtitles(), ha="center", va="center", fontsize=12)
553
+ plt.tight_layout()
554
+ plt.subplots_adjust(bottom=0.1) # Prevent overlap
555
+
556
+ # (1) Input
557
+ axs[0,0].imshow(self.input_heatmap_24x24, cmap='viridis')
558
+ axs[0,0].set_title("1) Input Heatmap")
559
+ axs[0,0].axis("off")
560
+
561
+ # (2) After Blur
562
+ if self.blurred_heatmap_26x26 is not None:
563
+ axs[0,1].imshow(self.blurred_heatmap_26x26, cmap='gray')
564
+ else:
565
+ axs[0,1].text(0.2,0.5,"No blur", fontsize=12)
566
+ axs[0,1].set_title("2) After Gaussian Blur")
567
+ axs[0,1].axis("off")
568
+
569
+ # (3) Binary Mask
570
+ if self.binary_mask_26x26 is not None:
571
+ axs[0,2].imshow(self.binary_mask_26x26, cmap='gray')
572
+ else:
573
+ axs[0,2].text(0.2,0.5,"No binary", fontsize=12)
574
+ axs[0,2].set_title("3) Binary Mask")
575
+ axs[0,2].axis("off")
576
+
577
+ # (4) Distance Transform
578
+ if self.dist_transform_26x26 is not None:
579
+ axs[0,3].imshow(self.dist_transform_26x26, cmap='jet')
580
+ else:
581
+ axs[0,3].text(0.2,0.5,"No dist transform", fontsize=12)
582
+ axs[0,3].set_title("4) Distance Transform")
583
+ axs[0,3].axis("off")
584
+
585
+ # (5) Sure Foreground
586
+ if self.sure_fg_mask_26x26 is not None:
587
+ axs[1,0].imshow(self.sure_fg_mask_26x26, cmap='gray')
588
+ else:
589
+ axs[1,0].text(0.2,0.5,"No sure-FG", fontsize=12)
590
+ axs[1,0].set_title("5) Sure Foreground Mask")
591
+ axs[1,0].axis("off")
592
+
593
+ # (6) Raw Watershed
594
+ if self.color_watershed_raw_26x26 is not None:
595
+ axs[1,1].imshow(self.color_watershed_raw_26x26)
596
+ else:
597
+ axs[1,1].text(0.2,0.5,"No watershed", fontsize=12)
598
+ axs[1,1].set_title("6) Raw Watershed Output")
599
+ axs[1,1].axis("off")
600
+
601
+ # (7) Final (No Vis)
602
+ if self.final_output_no_vis_24x24 is not None:
603
+ axs[1,2].imshow(self.final_output_no_vis_24x24)
604
+ self._plot_centroids(axs[1,2])
605
+ self._add_legend(axs[1,2])
606
+ else:
607
+ axs[1,2].text(0.2,0.5,"No final", fontsize=12)
608
+ axs[1,2].set_title("7) Final Output (No Vis)")
609
+ axs[1,2].axis("off")
610
+
611
+ # (8) Final (With Vis)
612
+ if self.final_output_no_vis_24x24 is not None:
613
+ axs[1,3].imshow(self.final_output_no_vis_24x24) # final_output_with_vis_24x24
614
+ # Red line for path
615
+ path_rows, path_cols = [], []
616
+ targets = []
617
+ for i, idx in enumerate(self.flattened_visits):
618
+ rr = idx // self.final_output_no_vis_24x24.shape[1]
619
+ cc = idx % self.final_output_no_vis_24x24.shape[1]
620
+ path_rows.append(rr)
621
+ path_cols.append(cc)
622
+ if self.num_found_list[i] > 0:
623
+ targets.append((rr, cc))
624
+ axs[1,3].plot(path_cols, path_rows, c="r", linewidth=2)
625
+ axs[1,3].plot(path_cols[-1], path_rows[-1], markersize=12, zorder=99, marker="^", ls="-", c="r", mec="black")
626
+ axs[1,3].plot(path_cols[0], path_rows[0], 'co', c="r", markersize=8, zorder=5)
627
+ for target in targets:
628
+ axs[1,3].plot(target[1], target[0], color='g', marker='x', linestyle='-', mec="black", markersize=12, markeredgewidth=4, zorder=999)
629
+ self._plot_centroids(axs[1,3])
630
+ self._add_legend(axs[1,3])
631
+ else:
632
+ axs[1,3].text(0.2,0.5,"No visited out", fontsize=12)
633
+ axs[1,3].set_title("8) Final Output (With Vis)")
634
+ axs[1,3].axis("off")
635
+
636
+ # plt.show()
637
+
638
+ if not os.path.exists(self.gifs_dir):
639
+ os.makedirs(self.gifs_dir)
640
+
641
+ plt.savefig(f'{self.gifs_dir}/watershed_{step}.png', dpi=150)
642
+ self.segmentor_frame_files.append(f'{self.gifs_dir}/watershed_{step}.png')
643
+ plt.close()
644
+
645
+
646
+ # def _plot_centroids(self, ax):
647
+ # markers=self.final_markers_24x24
648
+ # labs=np.unique(markers[markers>0])
649
+ # for lb in labs:
650
+ # coords=np.argwhere(markers==lb)
651
+ # if coords.size>0:
652
+ # r_mean=np.mean(coords[:,0])
653
+ # c_mean=np.mean(coords[:,1])
654
+ # ax.plot(c_mean, r_mean, 'x', color='white', markersize=7)
655
+ def _plot_centroids(self, ax):
656
+ markers = self.final_markers_24x24
657
+ labs = np.unique(markers[markers > 0])
658
+ for lb in labs:
659
+ # Use the stored centroid to ensure it lies within the region.
660
+ cent = self.region_centroids.get(lb, None)
661
+ if cent is not None:
662
+ ax.plot(cent[1], cent[0], 'x', color='white', markersize=7)
663
+
664
+ def _add_legend(self, ax):
665
+ markers=self.final_markers_24x24
666
+ labs=np.unique(markers[markers>0])
667
+ # col_array=self.colormap(np.linspace(0,1,len(labs)))
668
+
669
+ handles=[]
670
+ for i,lab in enumerate(labs):
671
+ color=self.label_to_color[lab] / 255.0
672
+ label_str = f"R{lab}"
673
+ h=plt.Line2D(
674
+ [0],[0],
675
+ marker='o',
676
+ color='w',
677
+ markerfacecolor=color, # col_array[i][:3],
678
+ markersize=10,
679
+ label=label_str
680
+ )
681
+ handles.append(h)
682
+
683
+ ax.legend(handles=handles, loc='upper right', fontsize=8, frameon=True)
684
+
685
+
686
+ def _get_subtitles(self):
687
+ markers=self.final_markers_24x24
688
+ labs=np.unique(markers[markers>0])
689
+
690
+ suptitle_str=""
691
+ for i,lab in enumerate(labs):
692
+ prob_val=self.region_prob.get(lab,0.0)
693
+ visit_str=self.region_visited_str.get(lab,"0/0")
694
+ E_i=self.region_expected.get(lab,0.0)
695
+ num_found=self.region_found.get(lab,0)
696
+ steps_next=self.region_steps_next.get(lab,float('inf'))
697
+
698
+ if math.isinf(steps_next):
699
+ steps_str="∞"
700
+ else:
701
+ steps_str=f"{steps_next:.2f}"
702
+
703
+ label_str = (
704
+ f"R{lab}: p={prob_val:.3f}, Vis={visit_str}, "
705
+ f"E_tgts={E_i:.2f}, #found={num_found}, "
706
+ f"E_steps({num_found+1}th)={steps_str}"
707
+ )
708
+ suptitle_str+=label_str+"\n"
709
+
710
+ return suptitle_str
711
+
712
+
713
+ def incorporate_kmeans_boundaries(self, kmeans_label_map, step=0):
714
+ """
715
+ Merges the existing final watershed map (self.final_markers_24x24) with a
716
+ k-means label map of the same shape. Each watershed region is subdivided by
717
+ the boundaries from the k-means map to produce a new, finer segmentation.
718
+
719
+ Then we recompute region stats (probabilities, centroids, etc.) and plot a
720
+ 2×2 figure:
721
+ 1) Original Heatmap
722
+ 2) Old Watershed Segmentation
723
+ 3) k-Means Label Map
724
+ 4) Merged Segmentation
725
+ """
726
+
727
+ # ---------------------------
728
+ # 1) Ensure shapes match
729
+ # ---------------------------
730
+ if kmeans_label_map.shape != self.final_markers_24x24.shape:
731
+ raise ValueError("kmeans_label_map must match the shape of the final watershed map.")
732
+
733
+ # We produce a new merged label map (24×24) subdividing each watershed region
734
+ # by the kmeans labels. For example, if watershed label L intersects multiple
735
+ # kmeans labels, that region is split into multiple subregions.
736
+
737
+ # We'll store it in self.merged_markers_24x24
738
+ self.merged_markers_24x24 = np.zeros_like(self.final_markers_24x24, dtype=np.int32)
739
+
740
+ watershed_labels = np.unique(self.final_markers_24x24[self.final_markers_24x24 > 0])
741
+ next_label = 1
742
+
743
+ for wlab in watershed_labels:
744
+ # Mask for watershed label wlab
745
+ wmask = (self.final_markers_24x24 == wlab)
746
+
747
+ # Among those pixels, we see which kmeans labels appear
748
+ kmeans_vals = np.unique(kmeans_label_map[wmask])
749
+ for klabel in kmeans_vals:
750
+ # Intersection => subregion
751
+ subregion_mask = wmask & (kmeans_label_map == klabel)
752
+ if not np.any(subregion_mask):
753
+ continue
754
+ # Assign a new label
755
+ self.merged_markers_24x24[subregion_mask] = next_label
756
+ next_label += 1
757
+
758
+ # Optional: If you want to merge small subregions again:
759
+ self.merged_markers_24x24 = self._merge_small_regions(self.merged_markers_24x24, self.region_size_thresh, n_iter=1)
760
+
761
+ # Recompute all stats for the merged segmentation:
762
+ self._build_merged_outputs()
763
+
764
+ # Finally, plot the 2×2 figure:
765
+ if self.plot_img:
766
+ self._plot_kmeans_merge(kmeans_label_map, step)
767
+
768
+ return self.merged_markers_24x24
769
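A minimal sketch of the intersection step above on hypothetical 2x4 maps: every overlapping (watershed, k-means) label pair becomes its own new region.

    import numpy as np

    watershed = np.array([[1, 1, 2, 2],
                          [1, 1, 2, 2]])
    kmeans    = np.array([[7, 7, 7, 8],
                          [7, 9, 8, 8]])

    merged, next_label = np.zeros_like(watershed), 1
    for wlab in np.unique(watershed[watershed > 0]):
        wmask = (watershed == wlab)
        for klab in np.unique(kmeans[wmask]):
            merged[wmask & (kmeans == klab)] = next_label
            next_label += 1
    # merged:
    # [[1 1 3 4]
    #  [1 2 4 4]]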
+
770
+
771
+ def _build_merged_outputs(self):
772
+ """
773
+ Re-run binomial logic on self.merged_markers_24x24:
774
+ 1) build color overlay
775
+ 2) scale probabilities
776
+ 3) compute visited fraction, found targets, centroids, etc.
777
+ We'll store them in new variables (prefixed 'merged_') so we don't overwrite
778
+ the original watershed data.
779
+ """
780
+
781
+ markers = self.merged_markers_24x24
782
+ valid_labels = np.unique(markers[markers > 0])
783
+
784
+ # 1) color-labeled overlay
785
+ self.merged_output_color = self._color_label_map(markers)
786
+ self.merged_output_no_vis = self._overlay_heatmap_and_labels(self.raw_heatmap, self.merged_output_color)
787
+
788
+ # 2) visited overlay
789
+ h, w = markers.shape
790
+ visited_mask = np.zeros(markers.shape, dtype=bool)
791
+ for idx in self.flattened_visits:
792
+ rr = idx // w
793
+ cc = idx % w
794
+ if 0 <= rr < h and 0 <= cc < w:
795
+ visited_mask[rr, cc] = True
796
+
797
+ merged_with_vis = self.merged_output_no_vis.copy()
798
+ red_overlay = np.zeros_like(merged_with_vis)
799
+ alpha = 0.3
800
+ for idx in self.flattened_visits:
801
+ rr = idx // w
802
+ cc = idx % w
803
+ if 0 <= rr < h and 0 <= cc < w:
804
+ red_overlay[rr, cc] = [255, 0, 0]
805
+ self.merged_output_with_vis = cv2.addWeighted(merged_with_vis, 1.0, red_overlay, alpha, 0.0)
806
+
807
+ # 3) compute scale factor
808
+ label_sizes = {}
809
+ label_sums = {}
810
+ total_mass = 0.0
811
+ for lb in valid_labels:
812
+ mask = (markers == lb)
813
+ size_i = np.sum(mask)
814
+ label_sizes[lb] = size_i
815
+ s = np.sum(self.raw_heatmap[mask])
816
+ label_sums[lb] = s
817
+ total_mass += s
818
+
819
+ scale = self.N/(total_mass + 1e-9)
820
+
821
+ # 4) visited fraction, found count, etc.
822
+ merged_visited = {}
823
+ for lb in valid_labels:
824
+ merged_visited[lb] = np.sum((markers == lb) & visited_mask)
825
+
826
+ # found_count
827
+ merged_found_count = {lb:0 for lb in valid_labels}
828
+ for idx, num_found in zip(self.flattened_visits, self.num_found_list):
829
+ rr = idx // w
830
+ cc = idx % w
831
+ lb = markers[rr, cc]
832
+ if lb > 0 and num_found > 0:
833
+ merged_found_count[lb] += num_found
834
+
835
+ # store them
836
+ self.merged_region_prob = {}
837
+ self.merged_region_visited_str = {}
838
+ self.merged_region_expected = {}
839
+ self.merged_region_found = {}
840
+ self.merged_region_steps_next = {}
841
+ self.merged_region_centroids = {}
+ self.merged_visited_num = {}  # initialize here; filled per-label below
842
+
843
+ for lb in valid_labels:
844
+ size_i = label_sizes[lb]
845
+ raw_sum = label_sums[lb]
846
+ if size_i > 0:
847
+ p_i = scale*(raw_sum/size_i)
848
+ else:
849
+ p_i = 0.0
850
+ self.merged_region_prob[lb] = p_i
851
+
852
+ visited_num = merged_visited[lb]
853
+ self.merged_visited_num[lb]=visited_num
854
+ self.merged_region_visited_str[lb] = f"{visited_num}/{size_i}"
855
+
856
+ E_i = size_i*p_i
857
+ self.merged_region_expected[lb] = E_i
858
+
859
+ fcount = merged_found_count[lb]
860
+ self.merged_region_found[lb] = fcount
861
+
862
+ # recompute centroid for merged label
863
+ cent = self._find_centroid_in_region(markers, lb)
864
+ self.merged_region_centroids[lb] = cent
865
+
866
+ if E_i > 0:
867
+ next_steps = (fcount+1)*size_i/E_i
868
+ else:
869
+ next_steps = float('inf')
870
+ self.merged_region_steps_next[lb] = next_steps
871
+
872
+
873
+
874
+ def _plot_kmeans_merge(self, kmeans_label_map, step):
875
+ """
876
+ Plots a 2×2 figure:
877
+ (1) Original Heatmap
878
+ (2) Old Final Markers (Watershed)
879
+ (3) k-means Label Map
880
+ (4) Merged Markers
881
+ """
882
+
883
+
884
+ fig, axs = plt.subplots(2, 2, figsize=(12, 12))
885
+ fig.suptitle("Merging K-Means Boundaries", fontsize=16)
886
+ fig.text(0.5, 0.05, self._get_merged_subtitles(), ha="center", va="center", fontsize=12)
887
+ plt.tight_layout()
888
+ plt.subplots_adjust(bottom=0.1) # Prevent overlap
889
+
890
+ # (1) Original heatmap
891
+ axs[0,0].imshow(self.input_heatmap_24x24, cmap='viridis')
892
+ axs[0,0].set_title("1) Original Heatmap")
893
+ axs[0,0].axis("off")
894
+
895
+ # (2) Watershed Segmentation
896
+ old_color = self._color_label_map(self.final_markers_24x24)
897
+ old_overlay = self._overlay_heatmap_and_labels(self.raw_heatmap, old_color)
898
+ axs[0,1].imshow(old_overlay)
899
+ axs[0,1].set_title("2) Watershed Segmentation")
900
+ axs[0,1].axis("off")
901
+ # -------------- NEW: Add the merged legend to the watershed subplot --------------
902
+ self._add_merged_legend(axs[0,1])
903
+ # -------------------------------------------------------------------------------
904
+
905
+ # (3) k-Means Label Map
906
+ kmeans_color = self._color_label_map(kmeans_label_map)
907
+ kmeans_overlay = self._overlay_heatmap_and_labels(self.raw_heatmap, kmeans_color)
908
+ axs[1,0].imshow(kmeans_overlay)
909
+ axs[1,0].set_title("3) K-Means Label Map")
910
+ axs[1,0].axis("off")
911
+
912
+ # (4) Merged Markers
913
+ # We already have self.merged_output_with_vis, but we'll now add a pink overlay for the path
914
+ # merged_with_vis = self.merged_output_with_vis.copy()
915
+
916
+ # Red line for path
917
+ axs[1,1].imshow(self.merged_output_no_vis)
918
+ path_rows, path_cols = [], []
919
+ targets = []
920
+ for i, idx in enumerate(self.flattened_visits):
921
+ rr = idx // self.merged_markers_24x24.shape[1]
922
+ cc = idx % self.merged_markers_24x24.shape[1]
923
+ path_rows.append(rr)
924
+ path_cols.append(cc)
925
+ if self.num_found_list[i] > 0:
926
+ targets.append((rr, cc))
927
+ axs[1,1].plot(path_cols, path_rows, c="r", linewidth=2)
928
+ axs[1,1].plot(path_cols[-1], path_rows[-1], markersize=12, zorder=99, marker="^", ls="-", c="r", mec="black")
929
+ axs[1,1].plot(path_cols[0], path_rows[0], 'co', c="r", markersize=8, zorder=5)
930
+ for target in targets:
931
+ axs[1,1].plot(target[1], target[0], color='g', marker='x', linestyle='-', mec="black", markersize=12, markeredgewidth=4, zorder=999)
932
+
933
+ axs[1,1].set_title("4) Merged Map")
934
+ axs[1,1].axis("off")
935
+
936
+ # Mark centroids for merged subregions
937
+ merged_labs = np.unique(self.merged_markers_24x24[self.merged_markers_24x24 > 0])
938
+ for lab in merged_labs:
939
+ cent = self.merged_region_centroids.get(lab, None)
940
+ if cent is not None:
941
+ axs[1,1].plot(cent[1], cent[0], 'x', color='white', markersize=7)
942
+
943
+ # Add the merged legend to the final subplot as well
944
+ self._add_merged_legend(axs[1,1])
945
+
946
+ # Save figure (same naming logic as your code)
947
+ if not os.path.exists(self.gifs_dir):
948
+ os.makedirs(self.gifs_dir)
949
+
950
+ plt.savefig(f'{self.gifs_dir}/kmeans_merged_{step}.png', dpi=150)
951
+ self.kmeans_merged_frame_files.append(f'{self.gifs_dir}/kmeans_merged_{step}.png')
952
+ plt.close()
953
+
954
+
955
+ def _add_merged_legend(self, ax):
956
+ """
957
+ Similar to _add_legend, but pulling data from self.merged_region_* variables.
958
+ """
959
+ markers = self.merged_markers_24x24
960
+ labs = np.unique(markers[markers>0])
961
+ # col_array = self.colormap(np.linspace(0,1,len(labs)))
962
+
963
+ handles = []
964
+ for i,lab in enumerate(labs):
965
+ color = self.label_to_color[lab] / 255.0
966
+ label_str = f"R{lab}"
967
+ h=plt.Line2D(
968
+ [0],[0],
969
+ marker='o',
970
+ color='w',
971
+ markerfacecolor=color, #col_array[i][:3],
972
+ markersize=10,
973
+ label=label_str
974
+ )
975
+ handles.append(h)
976
+
977
+ ax.legend(handles=handles, loc='upper right', fontsize=8, frameon=True)
978
+
979
+ # TODO: Merge with _get_subtitles
980
+ def _get_merged_subtitles(self):
981
+ markers=self.merged_markers_24x24
982
+ labs=np.unique(markers[markers>0])
983
+
984
+ suptitle_str=""
985
+ for i,lab in enumerate(labs):
986
+ prob_val=self.merged_region_prob.get(lab,0.0)
987
+ visit_str=self.merged_region_visited_str.get(lab,"0/0")
988
+ E_i=self.merged_region_expected.get(lab,0.0)
989
+ num_found=self.merged_region_found.get(lab,0)
990
+ steps_next=self.merged_region_steps_next.get(lab,float('inf'))
991
+
992
+ if math.isinf(steps_next):
993
+ steps_str="∞"
994
+ else:
995
+ steps_str=f"{steps_next:.2f}"
996
+
997
+ label_str = (
998
+ f"R{lab}: p={prob_val:.3f}, Vis={visit_str}, "
999
+ f"E_tgts={E_i:.2f}, #found={num_found}, "
1000
+ f"E_steps({num_found+1}th)={steps_str}"
1001
+ )
1002
+ suptitle_str+=label_str+"\n"
1003
+
1004
+ return suptitle_str
1005
+
1006
+
1007
+ # ------ OTHER HELPER FUNCTIONS -------#
1008
+ def get_label_id(self, patch_idx):
1009
+ """
1010
+ Given a flattened index (row-major order) within the 24×24 grid,
1011
+ return the label id of the corresponding merged region.
1012
+
1013
+ :param patch_idx: Flattened index (0 to H×W-1).
1014
+ :return: The merged region's label id, or 0.0 if the patch
1015
+ is out of bounds (0 means the patch is not in a labeled region).
1016
+ """
1017
+ # Guard: check we have a merged label map
1018
+ if self.merged_markers_24x24 is None:
1019
+ raise ValueError("Merged markers do not exist. "
1020
+ "Please call `incorporate_kmeans_boundaries(...)` first.")
1021
+
1022
+ h, w = self.merged_markers_24x24.shape
1023
+ # Derive row, col
1024
+ row = patch_idx // w
1025
+ col = patch_idx % w
1026
+
1027
+ if row < 0 or row >= h or col < 0 or col >= w:
1028
+ return 0.0
1029
+
1030
+ # Find which merged label region that pixel belongs to
1031
+ label_id = self.merged_markers_24x24[row, col]
1032
+
1033
+ return label_id
1034
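Hypothetical usage, assuming the `segmenter` built in the __main__ block below and that `incorporate_kmeans_boundaries` has already been called:

    row, col = 5, 7                                   # a patch in the 24x24 grid
    w = segmenter.merged_markers_24x24.shape[1]       # grid width (24)
    patch_idx = row * w + col                         # flattened row-major index
    region_label = segmenter.get_label_id(patch_idx)
    region_prob = segmenter.merged_region_prob.get(region_label, 0.0)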
+
1035
+ # ----------------------------
1036
+ # Example MAIN usage
1037
+ # ----------------------------
1038
+ if __name__=="__main__":
1039
+ # 1) Load a 24x24 heatmap
1040
+ file_path="./expt/seg_mask_step0_v1.npy"
1041
+ heatmap_24x24=np.load(file_path)
1042
+ heatmap_24x24=np.clip(heatmap_24x24,0,100)
1043
+
1044
+ # 2) Simulate a random walk
1045
+ max_r, max_c=heatmap_24x24.shape[0]-1, heatmap_24x24.shape[1]-1
1046
+ steps=100
1047
+ current_pos=(0,0)
1048
+ moves_8=[(-1,-1),(-1,0),(-1,1),
1049
+ ( 0,-1), ( 0,1),
1050
+ ( 1,-1),( 1,0), ( 1,1)]
1051
+ path_coords=[current_pos]
1052
+ num_found_list=[0]
1053
+ for _ in range(steps):
1054
+ r,c=current_pos
1055
+ dr,dc=random.choice(moves_8)
1056
+ nr, nc=r+dr,c+dc
1057
+ if 0<=nr<=max_r and 0<=nc<=max_c:
1058
+ current_pos=(nr,nc)
1059
+ path_coords.append(current_pos)
1060
+ # 10% chance for new target
1061
+ # num_found_list.append(random.random()<0.1)
1062
+ num_found_list.append(0)
1063
+ num_found_list[0]=1
1064
+ num_found_list[1]=2
1065
+ num_found_list[2]=3
1066
+
1067
+ # Flatten
1068
+ width=heatmap_24x24.shape[1]
1069
+ flattened_visits=[]
1070
+ for (rr,cc) in path_coords:
1071
+ idx=rr*width+cc
1072
+ flattened_visits.append(idx)
1073
+
1074
+ # 3) Instantiate & run
1075
+ segmenter=WatershedBinomial(
1076
+ N=10.0, # total expected # across map
1077
+ blur_kernel=(5,5),
1078
+ sure_fg_factor=0.5,
1079
+ dilation_kernel_size=(2,2),
1080
+ dilation_iters=3,
1081
+ region_size_thresh=9,
1082
+ plot_img=True,
1083
+ gifs_dir="/home/user/VLM-Search/inference/test_results/gifs",
1084
+ colormap=plt.cm.tab20,
1085
+ title="Watershed + Binomial Steps"
1086
+ )
1087
+ final=segmenter.run(heatmap_24x24, flattened_visits, num_found_list, step=0)
1088
+
1089
+ # Suppose you load a kmeans_label_map in the same shape (24×24)
1090
+ file_path="./expt/smoothed_labels_2d.npy"
1091
+ kmeans_label_map = np.load(file_path)
1092
+
1093
+ # Now incorporate it into the final segmentation:
1094
+ segmenter.incorporate_kmeans_boundaries(kmeans_label_map, step=0)
Taxabind/Taxabind/SoundBind/config.py ADDED
@@ -0,0 +1,30 @@
1
+ from easydict import EasyDict as edict
2
+
3
+ config = edict()
4
+ config.train_df = '/mnt/hdd/inat2021_ds/sound_train/sound_image_pairs_filtered.csv' # 'train_df.json'
5
+ config.val_df= '/mnt/hdd/inat2021_ds/sound_val/sound_image_pairs_filtered.csv' # 'val_df.csv'
6
+ config.data_path = '/mnt/hdd/inat2021_ds/sound_train'
7
+ # HOW ABOUT VAL?
8
+
9
+ # # Toy Example
10
+ # config.train_df = '../scripts/preprocess/expt/sound_train/sound_image_pairs_filtered.csv' # 'train_df.json'
11
+ # config.val_df= '../scripts/preprocess/expt/sound_train/sound_image_pairs_filtered.csv' # 'val_df.csv'
12
+ # config.data_path = '../scripts/preprocess/expt/sound_train'
13
+
14
+ config.batch_size = 256
15
+ config.lr = 1e-4
16
+ config.accumulate_grad_batches = 8
17
+ config.max_epochs = 20
18
+ config.num_workers = 16
19
+ config.devices = 2
20
+ config.val_check_interval = 0.5
21
+
22
+
23
+ config.save_dir = 'checkpoints'
24
+ config.filename = 'soundbind-{epoch:02d}-{val_loss:.2f}'
25
+
26
+ config.locked_tuning = True
27
+
28
+ # TEMP
29
+ config.sat_encoder = 'openai/clip-vit-large-patch14-336' # openai/clip-vit-base-patch16, openai/clip-vit-large-patch14-336
30
+ config.patch_size = 14
Taxabind/Taxabind/SoundBind/dataloader.py ADDED
@@ -0,0 +1,57 @@
1
+ from torch.utils.data import Dataset
2
+ from torchvision import transforms
3
+ import os
4
+ from PIL import Image
5
+ import pandas as pd
6
+ from config import config
7
+ from sound_encoder import get_audio_clap
8
+ import torch
9
+ from tqdm import tqdm
10
+
11
+ #img, sound, year, month, day
12
+ class INatDataset(Dataset):
13
+ def __init__(self,
14
+ data_file,
15
+ mode='train'):
16
+ self.data_file = pd.read_csv(data_file)
17
+ if mode=='train':
18
+ self.transform = transforms.Compose([
19
+ transforms.Resize((256, 256)),
20
+ transforms.RandomCrop((224, 224)),
21
+ transforms.RandomHorizontalFlip(0.5),
22
+ transforms.GaussianBlur(5, (0.01, 1.0)),
23
+ transforms.ToTensor(),
24
+ transforms.Normalize(mean=[0.485, 0.456, 0.406],
25
+ std=[0.229, 0.224, 0.225])
26
+ ])
27
+ else:
28
+ self.transform = transforms.Compose([
29
+ transforms.Resize((256, 256)),
30
+ transforms.CenterCrop((224, 224)),
31
+ transforms.ToTensor(),
32
+ transforms.Normalize(mean=[0.485, 0.456, 0.406],
33
+ std=[0.229, 0.224, 0.225])
34
+ ])
35
+ self.species_text = self.data_file['scientific_name'].tolist()
36
+ self.species_classes = list(set(self.species_text))
37
+
38
+ def __len__(self):
39
+ return len(self.data_file)
40
+
41
+ def get_sample(self,idx):
42
+ sample = self.data_file.iloc[idx]
43
+ id = sample.id
44
+ sound_format = sample.sound_format
45
+ image_path = os.path.join(config['data_path'],"images",str(id)+".jpg")
46
+ sound_path = os.path.join(config['data_path'],"sounds_mp3",str(id)+"."+'mp3')
47
+ sound = get_audio_clap(sound_path) # , sound_format)
48
+
49
+ for k in sound.keys():
50
+ sound[k] = sound[k].squeeze(0)
51
+ image = self.transform(Image.open(image_path))
52
+
53
+ return image, sound
54
+
55
+ def __getitem__(self, idx):
56
+ image, sound = self.get_sample(idx)
57
+ return image, sound, self.species_classes.index(self.data_file.iloc[idx]['scientific_name'])
Taxabind/Taxabind/SoundBind/model.py ADDED
@@ -0,0 +1,141 @@
1
+ import os
2
+ import sys
3
+ # Ensure the correct directory is at the front of sys.path
4
+ sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
5
+
6
+ import open_clip
7
+ import pytorch_lightning as pl
8
+ import torch
9
+ import torch.nn as nn
10
+ from sound_encoder import CLAP_audiomodel_withProjection as AudioEncoder
11
+ import numpy as np
12
+ from torch.utils.data import DataLoader
13
+ from config import config
14
+ import random
15
+ from dataloader import INatDataset
16
+ from pytorch_lightning.callbacks import ModelCheckpoint
17
+
18
+ def clip_loss(similarity: torch.Tensor) -> torch.Tensor:
19
+ audio_loss = contrastive_loss(similarity)
20
+ ground_img_loss = contrastive_loss(similarity.t())
21
+ return 0.5*audio_loss + 0.5*ground_img_loss
22
+
23
+ def contrastive_loss(logits: torch.Tensor) -> torch.Tensor:
24
+
25
+ return nn.functional.cross_entropy(logits[:logits.shape[1]], torch.arange(logits.shape[1], device=logits.device))
26
+
27
+ class AudioBind(pl.LightningModule):
28
+ def __init__(self, train_dataset, val_dataset, **kwargs):
29
+ super().__init__()
30
+ self.train_dataset = train_dataset
31
+ self.val_dataset = val_dataset
32
+ self.model, *_ = open_clip.create_model_and_transforms('hf-hub:imageomics/bioclip')
33
+ if config.locked_tuning:
34
+ for param in self.model.parameters():
35
+ param.requires_grad = False
36
+ self.audio_encoder = AudioEncoder(freeze=False)
37
+ self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))
38
+ self.batch_size = kwargs.get('batch_size')
39
+ self.num_workers = kwargs.get('num_workers')
40
+ self.lr = kwargs.get('lr', 1e-4)
41
+
42
+ def forward(self, image, audio):
43
+ with torch.no_grad():
44
+ image_embeds, *_ = self.model(image)
45
+ unnormalized_audio_embeds = self.audio_encoder(audio)
46
+ audio_embeds = torch.nn.functional.normalize(unnormalized_audio_embeds, dim=-1)
47
+ return image_embeds, audio_embeds
48
+
49
+ def shared_step(self, batch):
50
+ image, audio, *_ = batch
51
+ image_embeds, audio_embeds = self(image, audio)
52
+ logit_scale = self.logit_scale.exp()
53
+ logits_per_img = torch.matmul(image_embeds,audio_embeds.t())*logit_scale
54
+ cross_contrastive_loss = clip_loss(logits_per_img)
55
+ return cross_contrastive_loss
56
+
57
+ def training_step(self, batch, batch_idx):
58
+ loss = self.shared_step(batch)
59
+ self.log('train_loss', loss, sync_dist=True, prog_bar=True, on_epoch=True, batch_size=self.batch_size)
60
+ self.log('temperature', self.logit_scale.data, prog_bar=True, on_epoch=True, batch_size=self.batch_size)
61
+ return loss
62
+
63
+ def on_train_batch_end(self,outputs,batch, batch_idx):
64
+ if self.logit_scale.data > np.log(100):
65
+ self.logit_scale.data = torch.clamp(self.logit_scale.data, 0, np.log(100))
66
+
67
+ def validation_step(self, batch, batch_idx):
68
+ loss = self.shared_step(batch)
69
+ self.log('val_loss', loss, sync_dist=True, prog_bar=True, on_epoch=True, batch_size=self.batch_size)
70
+ return loss
71
+
72
+ def train_dataloader(self):
73
+ return DataLoader(self.train_dataset,
74
+ batch_size=self.batch_size,
75
+ num_workers=self.num_workers,
76
+ shuffle=True,
77
+ persistent_workers=False)
78
+
79
+ def val_dataloader(self):
80
+ return DataLoader(self.val_dataset,
81
+ batch_size=self.batch_size,
82
+ num_workers=self.num_workers,
83
+ shuffle=False,
84
+ persistent_workers=False)
85
+
86
+ def configure_optimizers(self):
87
+ params = self.parameters()
88
+ self.optim = torch.optim.AdamW(params,
89
+ lr=self.lr,
90
+ betas=(0.9,0.98),
91
+ eps=1e-6
92
+ )
93
+ self.scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
94
+ optimizer=self.optim,
95
+ T_0=20
96
+ )
97
+ return [self.optim], [self.scheduler]
98
+
99
+ def seed_everything(seed=42):
100
+ """
101
+ seed: int
102
+ """
103
+ torch.manual_seed(seed)
104
+ torch.cuda.manual_seed_all(seed)
105
+ np.random.seed(seed)
106
+ random.seed(seed)
107
+ torch.backends.cudnn.deterministic = True
108
+ torch.backends.cudnn.benchmark = False
109
+ os.environ["PYTHONHASHSEED"] = str(seed)
110
+
111
+ if __name__=='__main__':
112
+ import warnings
113
+ warnings.filterwarnings("ignore")
114
+ torch.set_warn_always(False)
115
+
116
+ seed_everything()
117
+ train_dataset = INatDataset(data_file=config.train_df, mode='train')
118
+ val_dataset = INatDataset(data_file=config.val_df, mode='val')
119
+ kwargs = {'batch_size':config.batch_size, 'num_workers': config.num_workers}
120
+
121
+ model = AudioBind(train_dataset, val_dataset, **kwargs)
122
+ torch.cuda.empty_cache()
123
+
124
+ checkpoint = ModelCheckpoint(
125
+ monitor='val_loss',
126
+ dirpath=config.save_dir,
127
+ filename=config.filename,
128
+ mode='min',
129
+ save_top_k=3
130
+ )
131
+ trainer = pl.Trainer(
132
+ accelerator='gpu',
133
+ devices=config.devices,
134
+ strategy='ddp',
135
+ max_epochs=config.max_epochs,
136
+ num_nodes=1,
137
+ callbacks=[checkpoint],
138
+ accumulate_grad_batches=config.accumulate_grad_batches,
139
+ log_every_n_steps=1
140
+ )
141
+ trainer.fit(model)
Taxabind/Taxabind/SoundBind/sound_encoder.py ADDED
@@ -0,0 +1,45 @@
1
+ #Hugging face way of loading AudioCLAP model
2
+ from transformers import ClapProcessor
3
+ from transformers import ClapAudioModelWithProjection
4
+ import pytorch_lightning as pl
5
+ import torch.nn as nn
6
+ import torch
7
+ import numpy as np
8
+ import torchaudio
9
+ import os
10
+
11
+ processor = ClapProcessor.from_pretrained("laion/clap-htsat-fused")
12
+ SAMPLE_RATE = 48000
13
+
14
+ def get_audio_clap(path_to_audio,format="mp3",padding="repeatpad",truncation="fusion"):
15
+ track, sr = torchaudio.load(path_to_audio, format=format) # torchaudio.load(path_to_audio)
16
+ track = track.mean(axis=0)
17
+ track = torchaudio.functional.resample(track, orig_freq=sr, new_freq=SAMPLE_RATE)
18
+ output = processor(audios=track, sampling_rate=SAMPLE_RATE, max_length_s=10, return_tensors="pt",padding=padding,truncation=truncation)
19
+ return output
20
+
21
+
22
+ class CLAP_audiomodel_withProjection(pl.LightningModule):
23
+ def __init__(self,freeze=False):
24
+ super().__init__()
25
+ if freeze:
26
+ self.model = ClapAudioModelWithProjection.from_pretrained("laion/clap-htsat-fused").eval()
27
+ for params in self.model.parameters():
28
+ params.requires_grad=False
29
+ else:
30
+ self.model = ClapAudioModelWithProjection.from_pretrained("laion/clap-htsat-fused").train()
31
+ def forward(self,audio):
32
+ batch_embeddings_audio = self.model(**audio)['audio_embeds']
33
+ return batch_embeddings_audio
34
+
35
+ if __name__ == '__main__':
36
+ path_to_audio ="/mnt/hdd/inat2021_ds/sound_train/sounds_mp3/165878447.mp3"
37
+ sample = get_audio_clap(path_to_audio)
38
+ print(sample.keys())
39
+
40
+ sample['input_features'] = torch.concat([sample['input_features'],sample['input_features']],axis=0)
41
+ sample['is_longer'] = torch.concat([sample['is_longer'],sample['is_longer']],axis=0)
42
+ print(sample['input_features'].shape,sample['is_longer'].shape) #torch.Size([2, 4, 1001, 64]), torch.Size([2, 1])
43
+ model = CLAP_audiomodel_withProjection(freeze=False)
44
+ audio_feat = model(sample)
45
+ print(audio_feat.shape) #torch.Size([2, 512])
app.py CHANGED
@@ -1,222 +1,123 @@
1
  """
2
- Search-TTA demo
 
 
 
 
 
 
3
  """
4
 
5
  # ────────────────────────── imports ───────────────────────────────────
6
- import cv2
 
7
  import gradio as gr
8
  import torch
9
- import numpy as np
10
  from PIL import Image
11
- import matplotlib.pyplot as plt
12
- import io
13
- import torchaudio
14
- import spaces # integration with ZeroGPU on hf
15
-
16
- from torchvision import transforms
17
- import open_clip
18
- from clip_vision_per_patch_model import CLIPVisionPerPatchModel
19
- from transformers import ClapAudioModelWithProjection
20
- from transformers import ClapProcessor
21
-
22
- # ────────────────────────── global config & models ────────────────────
23
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
24
-
25
- # BioCLIP (ground-image & text encoder)
26
- bio_model, _, _ = open_clip.create_model_and_transforms("hf-hub:imageomics/bioclip")
27
- bio_model = bio_model.to(device).eval()
28
- bio_tokenizer = open_clip.get_tokenizer("hf-hub:imageomics/bioclip")
29
-
30
- # Satellite patch encoder CLIP-L-336 per-patch)
31
- sat_model: CLIPVisionPerPatchModel = (
32
- CLIPVisionPerPatchModel.from_pretrained("derektan95/search-tta-sat")
33
- .to(device)
34
- .eval()
35
- )
36
-
37
- # Sound CLAP model
38
- sound_model: ClapAudioModelWithProjection = (
39
- ClapAudioModelWithProjection.from_pretrained("derektan95/search-tta-sound")
40
- .to(device)
41
- .eval()
42
- )
43
- sound_processor: ClapProcessor = ClapProcessor.from_pretrained("derektan95/search-tta-sound")
44
- SAMPLE_RATE = 48000
45
-
46
- logit_scale = torch.nn.Parameter(torch.ones([]) * np.log(1 / 0.07))
47
- logit_scale = logit_scale.exp()
48
- blur_kernel = (5,5)
49
-
50
- # ────────────────────────── transforms (exact spec) ───────────────────
51
- img_transform = transforms.Compose(
52
- [
53
- transforms.Resize((256, 256)),
54
- transforms.CenterCrop((224, 224)),
55
- transforms.ToTensor(),
56
- transforms.Normalize(
57
- mean=[0.485, 0.456, 0.406],
58
- std=[0.229, 0.224, 0.225],
59
- ),
60
- ]
61
- )
62
 
63
- imo_transform = transforms.Compose(
64
- [
65
- transforms.Resize((336, 336)),
66
- transforms.ToTensor(),
67
- transforms.Normalize(
68
- mean=[0.485, 0.456, 0.406],
69
- std=[0.229, 0.224, 0.225],
70
- ),
71
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72
  )
73
 
74
- def get_audio_clap(path_to_audio,format="mp3",padding="repeatpad",truncation="fusion"):
75
- track, sr = torchaudio.load(path_to_audio, format=format) # torchaudio.load(path_to_audio)
76
- track = track.mean(axis=0)
77
- track = torchaudio.functional.resample(track, orig_freq=sr, new_freq=SAMPLE_RATE)
78
- output = sound_processor(audios=track, sampling_rate=SAMPLE_RATE, max_length_s=10, return_tensors="pt",padding=padding,truncation=truncation)
79
- return output
80
-
81
- # ────────────────────────── helpers ───────────────────────────────────
82
-
83
- @torch.no_grad()
84
- def _encode_ground(img_pil: Image.Image) -> torch.Tensor:
85
- img = img_transform(img_pil).unsqueeze(0).to(device)
86
- img_embeds, *_ = bio_model(img)
87
- return img_embeds
88
-
89
-
90
- @torch.no_grad()
91
- def _encode_text(text: str) -> torch.Tensor:
92
- toks = bio_tokenizer(text).to(device)
93
- _, txt_embeds, _ = bio_model(text=toks)
94
- return txt_embeds
95
-
96
-
97
- @torch.no_grad()
98
- def _encode_sat(img_pil: Image.Image) -> torch.Tensor:
99
- imo = imo_transform(img_pil).unsqueeze(0).to(device)
100
- imo_embeds = sat_model(imo)
101
- return imo_embeds
102
-
103
-
104
- @torch.no_grad()
105
- def _encode_sound(sound) -> torch.Tensor:
106
- processed_sound = get_audio_clap(sound)
107
- for k in processed_sound.keys():
108
- processed_sound[k] = processed_sound[k].to(device)
109
- unnormalized_audio_embeds = sound_model(**processed_sound).audio_embeds
110
- sound_embeds = torch.nn.functional.normalize(unnormalized_audio_embeds, dim=-1)
111
- return sound_embeds
112
-
113
-
114
- def _similarity_heatmap(query: torch.Tensor, patches: torch.Tensor) -> np.ndarray:
115
- sims = torch.matmul(query, patches.t()) * logit_scale
116
- sims = sims.t().sigmoid()
117
- sims = sims[1:].squeeze() # drop CLS token
118
- side = int(np.sqrt(len(sims)))
119
- sims = sims.reshape(side, side)
120
- return sims.cpu().detach().numpy()
121
-
122
-
123
- def _array_to_pil(arr: np.ndarray) -> Image.Image:
124
- """
125
- Render arr with viridis, automatically stretching its own min→max to 0→1
126
- so that the most-similar patches appear yellow.
127
- """
128
-
129
- # Gausian Smoothing
130
- if blur_kernel != (0,0):
131
- arr = cv2.GaussianBlur(arr, blur_kernel, 0)
132
 
133
- # --- contrast-stretch to local 0-1 range --------------------------
134
- arr_min, arr_max = float(arr.min()), float(arr.max())
135
- if arr_max - arr_min < 1e-6: # avoid /0 when the heat-map is flat
136
- arr_scaled = np.zeros_like(arr)
137
- else:
138
- arr_scaled = (arr - arr_min) / (arr_max - arr_min)
139
- # ------------------------------------------------------------------
140
- fig, ax = plt.subplots(figsize=(2.6, 2.6), dpi=96)
141
- ax.imshow(arr_scaled, cmap="viridis", vmin=0.0, vmax=1.0)
142
- ax.axis("off")
143
- buf = io.BytesIO()
144
- plt.tight_layout(pad=0)
145
- fig.savefig(buf, format="png", bbox_inches="tight", pad_inches=0)
146
- plt.close(fig)
147
- buf.seek(0)
148
- return Image.open(buf)
149
-
150
- # ────────────────────────── main inference ────────────────────────────
151
- # integration with ZeroGPU on hf
152
- @spaces.GPU
153
  def process(
154
- sat_img: Image.Image,
155
- taxonomy: str,
156
  ground_img: Image.Image | None,
157
- sound: torch.Tensor | None,
158
  ):
159
- if sat_img is None:
160
- return None, None
161
-
162
- patches = _encode_sat(sat_img)
163
-
164
- heat_ground, heat_text, heat_sound = None, None, None
165
 
166
- if ground_img is not None:
167
- q_img = _encode_ground(ground_img)
168
- heat_ground = _array_to_pil(_similarity_heatmap(q_img, patches))
 
 
 
 
 
169
 
170
- if taxonomy.strip():
171
- q_txt = _encode_text(taxonomy.strip())
172
- heat_text = _array_to_pil(_similarity_heatmap(q_txt, patches))
173
 
174
- if sound is not None:
175
- q_sound = _encode_sound(sound)
176
- heat_sound = _array_to_pil(_similarity_heatmap(q_sound, patches))
177
 
178
- return heat_ground, heat_text, heat_sound
 
179
 
180
 
181
  # ────────────────────────── Gradio UI ─────────────────────────────────
182
- with gr.Blocks(title="Search-TTA", theme=gr.themes.Base()) as demo:
183
 
184
- with gr.Row():
185
- gr.Markdown(
186
  """
187
- <div style="display: flex; justify-content: center; align-items: center; text-align: center;">
188
- <div>
189
- <h1>Search-TTA: A Multimodal Test-Time Adaptation Framework for Visual Search in the Wild</h1>
190
- <span></span>
191
- <h2 style='font-weight: 450; font-size: 1rem; margin: 0rem'>\
192
- <a href="https://search-tta.github.io">Project Website</a>
193
- </h2>
194
- <span></span>
195
- <h2 style='font-weight: 450; font-size: 0.5rem; margin: 0rem'>[Work in Progress]</h2>
196
- </div>
197
- </div>
198
  """
199
- # <h2 style='font-weight: 450; font-size: 1rem; margin: 0rem'>WACV 2025</h2>
200
-
201
- # <h2 style='font-weight: 450; font-size: 1rem; margin: 0rem'>\
202
- # <a href="https://derektan95.github.io">Derek M. S. Tan</a>,
203
- # <a href="https://chinchinati.github.io/">Shailesh</a>,
204
- # <a href="https://www.linkedin.com/in/boyang-liu-nus">Boyang Liu</a>,
205
- # <a href="https://www.linkedin.com/in/loki-silvres">Alok Raj</a>,
206
- # <a href="https://www.linkedin.com/in/ang-qi-xuan-714347142">Qi Xuan Ang</a>,
207
- # <a href="https://weihengdai.top">Weiheng Dai</a>,
208
- # <a href="https://www.linkedin.com/in/tanishqduhan">Tanishq Duhan</a>,
209
- # <a href="https://www.linkedin.com/in/jimmychiun">Jimmy Chiun</a>,
210
- # <a href="https://www.yuhongcao.online/">Yuhong Cao</a>,
211
- # <a href="https://www.cs.toronto.edu/~florian/">Florian Shkurti</a>,
212
- # <a href="https://www.marmotlab.org/bio.html">Guillaume Sartoretti</a>
213
- # </h2>
214
- # <h2 style='font-weight: 450; font-size: 1rem; margin: 0rem'>National University of Singapore, University of Toronto, IIT-Dhanbad, Singapore Technologies Engineering</h2>
215
- )
216
 
217
  with gr.Row(variant="panel"):
218
-
219
- # LEFT COLUMN (satellite, taxonomy, run)
220
  with gr.Column():
221
  sat_input = gr.Image(
222
  label="Satellite Image",
@@ -228,54 +129,21 @@ with gr.Blocks(title="Search-TTA", theme=gr.themes.Base()) as demo:
228
  label="Full Taxonomy Name (optional)",
229
  placeholder="e.g. Animalia Chordata Mammalia Rodentia Sciuridae Marmota marmota",
230
  )
231
-
232
- # ─── NEW: sound input ───────────────────────────
233
- sound_input = gr.Audio(
234
- label="Sound Input (optional)",
235
- sources=["upload"], # or "microphone" / "url" as you prefer
236
- type="filepath", # or "numpy" if you want raw arrays
237
- )
238
- run_btn = gr.Button("Run", variant="primary")
239
-
240
- # RIGHT COLUMN (ground image + two heat-maps)
241
- with gr.Column():
242
  ground_input = gr.Image(
243
  label="Ground-level Image (optional)",
244
  sources=["upload"],
245
  type="pil",
246
  height=320,
247
  )
248
- gr.Markdown("### Heat-map Results")
249
- with gr.Row():
250
- # Separate label and image to avoid overlap
251
- with gr.Column(scale=1, min_width=100):
252
- gr.Markdown("**Ground Image Query**", elem_id="label-ground")
253
- heat_ground_out = gr.Image(
254
- show_label=False,
255
- height=160,
256
- # width=160,
257
- )
258
- with gr.Column(scale=1, min_width=100):
259
- gr.Markdown("**Text Query**", elem_id="label-text")
260
- heat_text_out = gr.Image(
261
- show_label=False,
262
- height=160,
263
- # width=160,
264
- )
265
- with gr.Column(scale=1, min_width=100):
266
- gr.Markdown("**Sound Query**", elem_id="label-sound")
267
- heat_sound_out = gr.Image(
268
- show_label=False,
269
- height=160,
270
- # width=160,
271
- )
272
- # ─── NEW: sound output ─────────────────────────
273
- # sound_output = gr.Audio(
274
- # label="Playback",
275
- # )
276
-
277
 
278
- # EXAMPLES
279
  with gr.Row():
280
  gr.Markdown("### In-Domain Taxonomy")
281
  with gr.Row():
@@ -285,40 +153,34 @@ with gr.Blocks(title="Search-TTA", theme=gr.themes.Base()) as demo:
285
  "examples/Animalia_Chordata_Aves_Charadriiformes_Laridae_Larus_marinus/80645_39.76079_-74.10316.jpg",
286
  "examples/Animalia_Chordata_Aves_Charadriiformes_Laridae_Larus_marinus/cc1ebaf9-899d-49f2-81c8-d452249a8087.jpg",
287
  "Animalia Chordata Aves Charadriiformes Laridae Larus marinus",
288
- "examples/Animalia_Chordata_Aves_Charadriiformes_Laridae_Larus_marinus/89758229.mp3"
289
  ],
290
  [
291
  "examples/Animalia_Chordata_Mammalia_Rodentia_Caviidae_Hydrochoerus_hydrochaeris/28871_-12.80255_-69.29999.jpg",
292
  "examples/Animalia_Chordata_Mammalia_Rodentia_Caviidae_Hydrochoerus_hydrochaeris/1b8064f8-7deb-4b30-98cd-69da98ba6a3d.jpg",
293
  "Animalia Chordata Mammalia Rodentia Caviidae Hydrochoerus hydrochaeris",
294
- "examples/Animalia_Chordata_Mammalia_Rodentia_Caviidae_Hydrochoerus_hydrochaeris/166631961.mp3"
295
  ],
296
  [
297
  "examples/Animalia_Arthropoda_Malacostraca_Decapoda_Ocypodidae_Ocypode_quadrata/277303_38.72364_-75.07749.jpg",
298
  "examples/Animalia_Arthropoda_Malacostraca_Decapoda_Ocypodidae_Ocypode_quadrata/0b9cc264-a2ba-44bd-8e41-0d01a6edd1e8.jpg",
299
  "Animalia Arthropoda Malacostraca Decapoda Ocypodidae Ocypode quadrata",
300
- "examples/Animalia_Arthropoda_Malacostraca_Decapoda_Ocypodidae_Ocypode_quadrata/12372063.mp3"
301
  ],
302
  [
303
  "examples/Animalia_Chordata_Mammalia_Rodentia_Sciuridae_Marmota_marmota/388246_45.49036_7.14796.jpg",
304
  "examples/Animalia_Chordata_Mammalia_Rodentia_Sciuridae_Marmota_marmota/327e1f07-692b-4140-8a3e-bd098bc064ff.jpg",
305
  "Animalia Chordata Mammalia Rodentia Sciuridae Marmota marmota",
306
- "examples/Animalia_Chordata_Mammalia_Rodentia_Sciuridae_Marmota_marmota/59677071.mp3"
307
  ],
308
  [
309
  "examples/Animalia_Chordata_Reptilia_Squamata_Varanidae_Varanus_salvator/410613_5.35573_100.28948.jpg",
310
  "examples/Animalia_Chordata_Reptilia_Squamata_Varanidae_Varanus_salvator/461d8e6c-0e66-4acc-8ecd-bfd9c218bc14.jpg",
311
  "Animalia Chordata Reptilia Squamata Varanidae Varanus salvator",
312
- None
313
  ],
314
  ],
315
- inputs=[sat_input, ground_input, taxonomy_input, sound_input],
316
- outputs=[heat_ground_out, heat_text_out, heat_sound_out],
317
- fn=process,
318
  cache_examples=False,
319
  )
320
 
321
- # EXAMPLES
322
  with gr.Row():
323
  gr.Markdown("### Out-Domain Taxonomy")
324
  with gr.Row():
@@ -328,48 +190,45 @@ with gr.Blocks(title="Search-TTA", theme=gr.themes.Base()) as demo:
328
  "examples/Animalia_Chordata_Mammalia_Carnivora_Phocidae_Mirounga_angustirostris/27423_35.64005_-121.17595.jpg",
329
  "examples/Animalia_Chordata_Mammalia_Carnivora_Phocidae_Mirounga_angustirostris/3aac526d-c921-452a-af6a-cb4f2f52e2c4.jpg",
330
  "Animalia Chordata Mammalia Carnivora Phocidae Mirounga angustirostris",
331
- "examples/Animalia_Chordata_Mammalia_Carnivora_Phocidae_Mirounga_angustirostris/3123948.mp3"
332
  ],
333
  [
334
  "examples/Animalia_Chordata_Mammalia_Carnivora_Canidae_Canis_aureus/1528408_13.00422_80.23033.jpg",
335
  "examples/Animalia_Chordata_Mammalia_Carnivora_Canidae_Canis_aureus/37faabd2-a613-4461-b27e-82fe5955ecaf.jpg",
336
  "Animalia Chordata Mammalia Carnivora Canidae Canis aureus",
337
- "examples/Animalia_Chordata_Mammalia_Carnivora_Canidae_Canis_aureus/189318716.mp3"
338
  ],
339
  [
340
  "examples/Animalia_Chordata_Mammalia_Carnivora_Ursidae_Ursus_americanus/yosemite_v3_resized.png",
341
  "examples/Animalia_Chordata_Mammalia_Carnivora_Ursidae_Ursus_americanus/248820933.jpeg",
342
  "Animalia Chordata Mammalia Carnivora Ursidae Ursus americanus",
343
- None
344
  ],
345
  [
346
  "examples/Animalia_Chordata_Mammalia_Carnivora_Canidae_Urocyon_littoralis/304160_34.0144_-119.54417.jpg",
347
  "examples/Animalia_Chordata_Mammalia_Carnivora_Canidae_Urocyon_littoralis/0cbdfbf2-6cfe-4d61-9602-c949f24d0293.jpg",
348
  "Animalia Chordata Mammalia Carnivora Canidae Urocyon littoralis",
349
- None
350
  ],
351
  ],
352
- inputs=[sat_input, ground_input, taxonomy_input, sound_input],
353
- outputs=[heat_ground_out, heat_text_out, heat_sound_out],
354
- fn=process,
355
  cache_examples=False,
356
  )
357
 
358
- # CALLBACK
359
  run_btn.click(
360
  fn=process,
361
- inputs=[sat_input, taxonomy_input, ground_input, sound_input],
362
- outputs=[heat_ground_out, heat_text_out, heat_sound_out],
363
  )
364
 
365
- # Footer to point out to model and data from app page.
366
- gr.Markdown(
367
- """
368
- The satellite image CLIP encoder is fine-tuned using [Sentinel-2 Level 2A](https://docs.sentinel-hub.com/api/latest/data/sentinel-2-l2a/) satellite image and taxonomy images (with GPS locations) from [iNaturalist](https://inaturalist.org/). The sound CLIP encoder is fine-tuned with a subset of the same taxonomy images and their corresponding sounds from [iNaturalist](https://inaturalist.org/). Note that while some of the examples above result in poor probability distributions, they will be improved using our test-time adaptation framework during the search process.
369
- """
370
- )
371
 
372
- # LAUNCH
373
  if __name__ == "__main__":
 
 
374
  demo.queue(max_size=15)
375
  demo.launch(share=True)
 
1
  """
2
+ Simplified Gradio demo for Search-TTA evaluation.
3
+ This version mirrors the layout of `app_BACKUP.py` but:
4
+ 1. Does not load the OpenCLIP / CLAP / satellite encoders at import time.
5
+ 2. Keeps only the Satellite and Ground-level image inputs.
6
+ 3. Exposes the high-level wrapper classes `ClipSegTTA` and
7
+ `TestWorker` and calls `TestWorker.run_episode` inside the
8
+ `process` callback.
9
  """
10
 
11
  # ────────────────────────── imports ───────────────────────────────────
12
+ from pathlib import Path
13
+
14
  import gradio as gr
15
  import torch
 
16
  from PIL import Image
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
 
18
+ # Import configuration & RL / TTA utilities -------------------------------------------------
19
+ # NOTE: we import * so that the global names (e.g. USE_GPU, MODEL_NAME, etc.)
20
+ # are available exactly as referenced later in the unchanged snippet.
21
+ from test_parameter import * # noqa: F403, F401 (wild-import is intentional here)
22
+
23
+ from model import PolicyNet # noqa: E402 – after wild import on purpose
24
+ from test_multi_robot_worker import TestWorker # noqa: E402
25
+ # from Taxabind.TaxaBind.SatBind.clip_seg_tta import ClipSegTTA # noqa: E402
26
+ from Taxabind.Taxabind.SatBind.clip_seg_tta import ClipSegTTA
27
+
28
+
29
+ # CHANGE ME!
30
+ currEpisode = 0
31
+
32
+ # Prepare the model
33
+ # device = torch.device('cpu') #if USE_GPU_TRAINING else torch.device('cpu')
34
+ device = torch.device('cuda') if USE_GPU else torch.device('cpu')
35
+ policy_net = PolicyNet(INPUT_DIM, EMBEDDING_DIM).to(device)
36
+ # script_dir = os.path.dirname(os.path.abspath(__file__))
37
+ script_dir = Path(__file__).resolve().parent
38
+ print("real_script_dir: ", script_dir)
39
+ # checkpoint = torch.load(f'{script_dir}/modules/vlm_search/{model_path}/{MODEL_NAME}')
40
+ checkpoint = torch.load(f'{model_path}/{MODEL_NAME}')
41
+ policy_net.load_state_dict(checkpoint['policy_model'])
42
+ print('Model loaded!')
43
+ # print(next(policy_net.parameters()).device)
44
+
45
+ # Init Taxabind here (only need to init once)
46
+ if TAXABIND_TTA:
47
+ # self.clip_seg_tta = None
48
+ clip_seg_tta = ClipSegTTA(
49
+ img_dir=TAXABIND_IMG_DIR,
50
+ imo_dir=TAXABIND_IMO_DIR,
51
+ json_path=TAXABIND_INAT_JSON_PATH,
52
+ sat_to_img_ids_json_path=TAXABIND_SAT_TO_IMG_IDS_JSON_PATH,
53
+ patch_size=TAXABIND_PATCH_SIZE,
54
+ sat_checkpoint_path=TAXABIND_SAT_CHECKPOINT_PATH,
55
+ sample_index = 0, # Set using 'reset' in worker
56
+ blur_kernel = TAXABIND_GAUSSIAN_BLUR_KERNEL,
57
+ device=device,
58
+ sat_to_img_ids_json_is_train_dict=False, # for search ds val
59
+ tax_to_filter_val=QUERY_TAX,
60
+ load_model=USE_CLIP_PREDS,
61
+ initial_modality=INITIAL_MODALITY,
62
+ sound_data_path = TAXABIND_SOUND_DATA_PATH,
63
+ sound_checkpoint_path=TAXABIND_SOUND_CHECKPOINT_PATH,
64
+ # sat_filtered_json_path=TAXABIND_FILTERED_INAT_JSON_PATH,
65
+ )
66
+ print("ClipSegTTA Loaded!")
67
+ else:
68
+ clip_seg_tta = None
69
+
70
+ # Define TestWorker
71
+ planner = TestWorker(
72
+ meta_agent_id=0,
73
+ n_agent=1,
74
+ policy_net=policy_net,
75
+ global_step=3,
76
+ device='cuda',
77
+ greedy=True,
78
+ save_image=SAVE_GIFS,
79
+ clip_seg_tta=clip_seg_tta
80
  )
81
 
82
+ # ────────────────────────── Gradio process fn ─────────────────────────
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
  def process(
85
+ sat_img: Image.Image | None,
 
86
  ground_img: Image.Image | None,
87
+ taxonomy: str | None = None,
88
  ):
89
+ """Callback executed when the user presses **Run** in the UI.
 
 
 
 
 
90
 
91
+ At test-time we simply trigger the RL search episode via
92
+ ``planner.run_episode`` and return its performance metrics.
93
+ The image inputs are currently *not* used directly here but are
94
+ retained to conform to the requested interface.
95
+ """
96
+ # If no satellite image is provided we bail out early.
97
+ if sat_img is None:
98
+ return {"error": "Please provide a satellite image."}
99
 
100
+ # Optionally you may want to reset episode index or make it configurable.
101
+ # For now we hard-code episode 0, mirroring the snippet.
102
+ planner.run_episode(currEpisode)
103
 
104
+ print("planner.perf_metrics: ", planner.perf_metrics)
 
 
105
 
106
+ # Return the collected performance metrics so they can be inspected.
107
+ return planner.perf_metrics
108
 
109
 
110
  # ────────────────────────── Gradio UI ─────────────────────────────────
111
+ with gr.Blocks(title="Search-TTA (Simplified)", theme=gr.themes.Base()) as demo:
112
 
113
+ gr.Markdown(
 
114
  """
115
+ # Search-TTA Simplified Demo
116
+ **Satellite ↔ Ground-level Visual Search** via RL Test-Time Adaptation.
 
 
 
 
 
 
 
 
 
117
  """
118
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
119
 
120
  with gr.Row(variant="panel"):
 
 
121
  with gr.Column():
122
  sat_input = gr.Image(
123
  label="Satellite Image",
 
129
  label="Full Taxonomy Name (optional)",
130
  placeholder="e.g. Animalia Chordata Mammalia Rodentia Sciuridae Marmota marmota",
131
  )
 
 
 
 
 
 
 
 
 
 
 
132
  ground_input = gr.Image(
133
  label="Ground-level Image (optional)",
134
  sources=["upload"],
135
  type="pil",
136
  height=320,
137
  )
138
+ run_btn = gr.Button("Run", variant="primary")
139
+
140
+ with gr.Column():
141
+ gr.Markdown("### Episode Metrics")
142
+ metrics_out = gr.JSON(label="Performance Metrics")
143
+
144
+ # Bind callback
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
145
 
146
+ # EXAMPLES – copied from original demo (satellite, ground, taxonomy only)
147
  with gr.Row():
148
  gr.Markdown("### In-Domain Taxonomy")
149
  with gr.Row():
 
153
  "examples/Animalia_Chordata_Aves_Charadriiformes_Laridae_Larus_marinus/80645_39.76079_-74.10316.jpg",
154
  "examples/Animalia_Chordata_Aves_Charadriiformes_Laridae_Larus_marinus/cc1ebaf9-899d-49f2-81c8-d452249a8087.jpg",
155
  "Animalia Chordata Aves Charadriiformes Laridae Larus marinus",
 
156
  ],
157
  [
158
  "examples/Animalia_Chordata_Mammalia_Rodentia_Caviidae_Hydrochoerus_hydrochaeris/28871_-12.80255_-69.29999.jpg",
159
  "examples/Animalia_Chordata_Mammalia_Rodentia_Caviidae_Hydrochoerus_hydrochaeris/1b8064f8-7deb-4b30-98cd-69da98ba6a3d.jpg",
160
  "Animalia Chordata Mammalia Rodentia Caviidae Hydrochoerus hydrochaeris",
 
161
  ],
162
  [
163
  "examples/Animalia_Arthropoda_Malacostraca_Decapoda_Ocypodidae_Ocypode_quadrata/277303_38.72364_-75.07749.jpg",
164
  "examples/Animalia_Arthropoda_Malacostraca_Decapoda_Ocypodidae_Ocypode_quadrata/0b9cc264-a2ba-44bd-8e41-0d01a6edd1e8.jpg",
165
  "Animalia Arthropoda Malacostraca Decapoda Ocypodidae Ocypode quadrata",
 
166
  ],
167
  [
168
  "examples/Animalia_Chordata_Mammalia_Rodentia_Sciuridae_Marmota_marmota/388246_45.49036_7.14796.jpg",
169
  "examples/Animalia_Chordata_Mammalia_Rodentia_Sciuridae_Marmota_marmota/327e1f07-692b-4140-8a3e-bd098bc064ff.jpg",
170
  "Animalia Chordata Mammalia Rodentia Sciuridae Marmota marmota",
 
171
  ],
172
  [
173
  "examples/Animalia_Chordata_Reptilia_Squamata_Varanidae_Varanus_salvator/410613_5.35573_100.28948.jpg",
174
  "examples/Animalia_Chordata_Reptilia_Squamata_Varanidae_Varanus_salvator/461d8e6c-0e66-4acc-8ecd-bfd9c218bc14.jpg",
175
  "Animalia Chordata Reptilia Squamata Varanidae Varanus salvator",
 
176
  ],
177
  ],
178
+ inputs=[sat_input, ground_input, taxonomy_input],
179
+ outputs=[metrics_out],
180
+ fn=lambda sat, grd, tax: process(sat, grd),
181
  cache_examples=False,
182
  )
183
 
 
184
  with gr.Row():
185
  gr.Markdown("### Out-Domain Taxonomy")
186
  with gr.Row():
 
190
  "examples/Animalia_Chordata_Mammalia_Carnivora_Phocidae_Mirounga_angustirostris/27423_35.64005_-121.17595.jpg",
191
  "examples/Animalia_Chordata_Mammalia_Carnivora_Phocidae_Mirounga_angustirostris/3aac526d-c921-452a-af6a-cb4f2f52e2c4.jpg",
192
  "Animalia Chordata Mammalia Carnivora Phocidae Mirounga angustirostris",
 
193
  ],
194
  [
195
  "examples/Animalia_Chordata_Mammalia_Carnivora_Canidae_Canis_aureus/1528408_13.00422_80.23033.jpg",
196
  "examples/Animalia_Chordata_Mammalia_Carnivora_Canidae_Canis_aureus/37faabd2-a613-4461-b27e-82fe5955ecaf.jpg",
197
  "Animalia Chordata Mammalia Carnivora Canidae Canis aureus",
 
198
  ],
199
  [
200
  "examples/Animalia_Chordata_Mammalia_Carnivora_Ursidae_Ursus_americanus/yosemite_v3_resized.png",
201
  "examples/Animalia_Chordata_Mammalia_Carnivora_Ursidae_Ursus_americanus/248820933.jpeg",
202
  "Animalia Chordata Mammalia Carnivora Ursidae Ursus americanus",
 
203
  ],
204
  [
205
  "examples/Animalia_Chordata_Mammalia_Carnivora_Canidae_Urocyon_littoralis/304160_34.0144_-119.54417.jpg",
206
  "examples/Animalia_Chordata_Mammalia_Carnivora_Canidae_Urocyon_littoralis/0cbdfbf2-6cfe-4d61-9602-c949f24d0293.jpg",
207
  "Animalia Chordata Mammalia Carnivora Canidae Urocyon littoralis",
 
208
  ],
209
  ],
210
+ inputs=[sat_input, ground_input, taxonomy_input],
211
+ outputs=[metrics_out],
212
+ fn=lambda sat, grd, tax: process(sat, grd),
213
  cache_examples=False,
214
  )
215
 
216
+
217
  run_btn.click(
218
  fn=process,
219
+ inputs=[sat_input, ground_input, taxonomy_input],
220
+ outputs=metrics_out,
221
  )
222
 
223
+ # ────────────────────────── unchanged worker initialisation ───────────
224
+ # NOTE: **Do NOT modify the code below.** It is copied verbatim from the
225
+ # user-provided snippet so that the exact same objects are created.
226
+ # The variables referenced here come from `test_parameter` which we
227
+ # imported with a wildcard earlier.
 
228
 
229
+ # Main entry point
230
  if __name__ == "__main__":
231
+
232
+ # Finally launch the Gradio interface (queue for concurrency).
233
  demo.queue(max_size=15)
234
  demo.launch(share=True)
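For reference, a minimal sketch of exercising the simplified `process` callback above without the Gradio UI. It assumes app.py's module-level objects (`planner`, `process`) are already loaded (e.g. in an interactive session) and reuses one of the bundled example satellite images purely for illustration.

from PIL import Image

# Illustrative input: one of the example satellite images listed above.
sat = Image.open(
    "examples/Animalia_Chordata_Mammalia_Rodentia_Sciuridae_Marmota_marmota/"
    "388246_45.49036_7.14796.jpg"
)

# Same call the Run button triggers: runs one RL search episode via
# planner.run_episode(...) and returns planner.perf_metrics for inspection.
metrics = process(sat_img=sat, ground_img=None, taxonomy=None)
print(metrics)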
app_BACKUP.py ADDED
@@ -0,0 +1,375 @@
1
+ """
2
+ Search-TTA demo
3
+ """
4
+
5
+ # ────────────────────────── imports ───────────────────────────────────
6
+ import cv2
7
+ import gradio as gr
8
+ import torch
9
+ import numpy as np
10
+ from PIL import Image
11
+ import matplotlib.pyplot as plt
12
+ import io
13
+ import torchaudio
14
+ import spaces # integration with ZeroGPU on hf
15
+
16
+ from torchvision import transforms
17
+ import open_clip
18
+ from clip_vision_per_patch_model import CLIPVisionPerPatchModel
19
+ from transformers import ClapAudioModelWithProjection
20
+ from transformers import ClapProcessor
21
+
22
+ # ────────────────────────── global config & models ────────────────────
23
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
24
+
25
+ # BioCLIP (ground-image & text encoder)
26
+ bio_model, _, _ = open_clip.create_model_and_transforms("hf-hub:imageomics/bioclip")
27
+ bio_model = bio_model.to(device).eval()
28
+ bio_tokenizer = open_clip.get_tokenizer("hf-hub:imageomics/bioclip")
29
+
30
+ # Satellite patch encoder (CLIP-L-336, per-patch)
31
+ sat_model: CLIPVisionPerPatchModel = (
32
+ CLIPVisionPerPatchModel.from_pretrained("derektan95/search-tta-sat")
33
+ .to(device)
34
+ .eval()
35
+ )
36
+
37
+ # Sound CLAP model
38
+ sound_model: ClapAudioModelWithProjection = (
39
+ ClapAudioModelWithProjection.from_pretrained("derektan95/search-tta-sound")
40
+ .to(device)
41
+ .eval()
42
+ )
43
+ sound_processor: ClapProcessor = ClapProcessor.from_pretrained("derektan95/search-tta-sound")
44
+ SAMPLE_RATE = 48000
45
+
46
+ logit_scale = torch.nn.Parameter(torch.ones([]) * np.log(1 / 0.07))
47
+ logit_scale = logit_scale.exp()
48
+ blur_kernel = (5,5)
49
+
50
+ # ────────────────────────── transforms (exact spec) ───────────────────
51
+ img_transform = transforms.Compose(
52
+ [
53
+ transforms.Resize((256, 256)),
54
+ transforms.CenterCrop((224, 224)),
55
+ transforms.ToTensor(),
56
+ transforms.Normalize(
57
+ mean=[0.485, 0.456, 0.406],
58
+ std=[0.229, 0.224, 0.225],
59
+ ),
60
+ ]
61
+ )
62
+
63
+ imo_transform = transforms.Compose(
64
+ [
65
+ transforms.Resize((336, 336)),
66
+ transforms.ToTensor(),
67
+ transforms.Normalize(
68
+ mean=[0.485, 0.456, 0.406],
69
+ std=[0.229, 0.224, 0.225],
70
+ ),
71
+ ]
72
+ )
73
+
74
+ def get_audio_clap(path_to_audio,format="mp3",padding="repeatpad",truncation="fusion"):
75
+ track, sr = torchaudio.load(path_to_audio, format=format) # torchaudio.load(path_to_audio)
76
+ track = track.mean(axis=0)
77
+ track = torchaudio.functional.resample(track, orig_freq=sr, new_freq=SAMPLE_RATE)
78
+ output = sound_processor(audios=track, sampling_rate=SAMPLE_RATE, max_length_s=10, return_tensors="pt",padding=padding,truncation=truncation)
79
+ return output
80
+
81
+ # ────────────────────────── helpers ───────────────────────────────────
82
+
83
+ @torch.no_grad()
84
+ def _encode_ground(img_pil: Image.Image) -> torch.Tensor:
85
+ img = img_transform(img_pil).unsqueeze(0).to(device)
86
+ img_embeds, *_ = bio_model(img)
87
+ return img_embeds
88
+
89
+
90
+ @torch.no_grad()
91
+ def _encode_text(text: str) -> torch.Tensor:
92
+ toks = bio_tokenizer(text).to(device)
93
+ _, txt_embeds, _ = bio_model(text=toks)
94
+ return txt_embeds
95
+
96
+
97
+ @torch.no_grad()
98
+ def _encode_sat(img_pil: Image.Image) -> torch.Tensor:
99
+ imo = imo_transform(img_pil).unsqueeze(0).to(device)
100
+ imo_embeds = sat_model(imo)
101
+ return imo_embeds
102
+
103
+
104
+ @torch.no_grad()
105
+ def _encode_sound(sound) -> torch.Tensor:
106
+ processed_sound = get_audio_clap(sound)
107
+ for k in processed_sound.keys():
108
+ processed_sound[k] = processed_sound[k].to(device)
109
+ unnormalized_audio_embeds = sound_model(**processed_sound).audio_embeds
110
+ sound_embeds = torch.nn.functional.normalize(unnormalized_audio_embeds, dim=-1)
111
+ return sound_embeds
112
+
113
+
114
+ def _similarity_heatmap(query: torch.Tensor, patches: torch.Tensor) -> np.ndarray:
115
+ sims = torch.matmul(query, patches.t()) * logit_scale
116
+ sims = sims.t().sigmoid()
117
+ sims = sims[1:].squeeze() # drop CLS token
118
+ side = int(np.sqrt(len(sims)))
119
+ sims = sims.reshape(side, side)
120
+ return sims.cpu().detach().numpy()
121
+
122
+
123
+ def _array_to_pil(arr: np.ndarray) -> Image.Image:
124
+ """
125
+ Render arr with viridis, automatically stretching its own min→max to 0→1
126
+ so that the most-similar patches appear yellow.
127
+ """
128
+
129
+ # Gaussian smoothing
130
+ if blur_kernel != (0,0):
131
+ arr = cv2.GaussianBlur(arr, blur_kernel, 0)
132
+
133
+ # --- contrast-stretch to local 0-1 range --------------------------
134
+ arr_min, arr_max = float(arr.min()), float(arr.max())
135
+ if arr_max - arr_min < 1e-6: # avoid /0 when the heat-map is flat
136
+ arr_scaled = np.zeros_like(arr)
137
+ else:
138
+ arr_scaled = (arr - arr_min) / (arr_max - arr_min)
139
+ # ------------------------------------------------------------------
140
+ fig, ax = plt.subplots(figsize=(2.6, 2.6), dpi=96)
141
+ ax.imshow(arr_scaled, cmap="viridis", vmin=0.0, vmax=1.0)
142
+ ax.axis("off")
143
+ buf = io.BytesIO()
144
+ plt.tight_layout(pad=0)
145
+ fig.savefig(buf, format="png", bbox_inches="tight", pad_inches=0)
146
+ plt.close(fig)
147
+ buf.seek(0)
148
+ return Image.open(buf)
149
+
150
+ # ────────────────────────── main inference ────────────────────────────
151
+ # integration with ZeroGPU on hf
152
+ @spaces.GPU
153
+ def process(
154
+ sat_img: Image.Image,
155
+ taxonomy: str,
156
+ ground_img: Image.Image | None,
157
+ sound: torch.Tensor | None,
158
+ ):
159
+ if sat_img is None:
160
+ return None, None
161
+
162
+ patches = _encode_sat(sat_img)
163
+
164
+ heat_ground, heat_text, heat_sound = None, None, None
165
+
166
+ if ground_img is not None:
167
+ q_img = _encode_ground(ground_img)
168
+ heat_ground = _array_to_pil(_similarity_heatmap(q_img, patches))
169
+
170
+ if taxonomy.strip():
171
+ q_txt = _encode_text(taxonomy.strip())
172
+ heat_text = _array_to_pil(_similarity_heatmap(q_txt, patches))
173
+
174
+ if sound is not None:
175
+ q_sound = _encode_sound(sound)
176
+ heat_sound = _array_to_pil(_similarity_heatmap(q_sound, patches))
177
+
178
+ return heat_ground, heat_text, heat_sound
179
+
180
+
181
+ # ────────────────────────── Gradio UI ─────────────────────────────────
182
+ with gr.Blocks(title="Search-TTA", theme=gr.themes.Base()) as demo:
183
+
184
+ with gr.Row():
185
+ gr.Markdown(
186
+ """
187
+ <div style="display: flex; justify-content: center; align-items: center; text-align: center;">
188
+ <div>
189
+ <h1>Search-TTA: A Multimodal Test-Time Adaptation Framework for Visual Search in the Wild</h1>
190
+ <span></span>
191
+ <h2 style='font-weight: 450; font-size: 1rem; margin: 0rem'>\
192
+ <a href="https://search-tta.github.io">Project Website</a>
193
+ </h2>
194
+ <span></span>
195
+ <h2 style='font-weight: 450; font-size: 0.5rem; margin: 0rem'>[Work in Progress]</h2>
196
+ </div>
197
+ </div>
198
+ """
199
+ # <h2 style='font-weight: 450; font-size: 1rem; margin: 0rem'>WACV 2025</h2>
200
+
201
+ # <h2 style='font-weight: 450; font-size: 1rem; margin: 0rem'>\
202
+ # <a href="https://derektan95.github.io">Derek M. S. Tan</a>,
203
+ # <a href="https://chinchinati.github.io/">Shailesh</a>,
204
+ # <a href="https://www.linkedin.com/in/boyang-liu-nus">Boyang Liu</a>,
205
+ # <a href="https://www.linkedin.com/in/loki-silvres">Alok Raj</a>,
206
+ # <a href="https://www.linkedin.com/in/ang-qi-xuan-714347142">Qi Xuan Ang</a>,
207
+ # <a href="https://weihengdai.top">Weiheng Dai</a>,
208
+ # <a href="https://www.linkedin.com/in/tanishqduhan">Tanishq Duhan</a>,
209
+ # <a href="https://www.linkedin.com/in/jimmychiun">Jimmy Chiun</a>,
210
+ # <a href="https://www.yuhongcao.online/">Yuhong Cao</a>,
211
+ # <a href="https://www.cs.toronto.edu/~florian/">Florian Shkurti</a>,
212
+ # <a href="https://www.marmotlab.org/bio.html">Guillaume Sartoretti</a>
213
+ # </h2>
214
+ # <h2 style='font-weight: 450; font-size: 1rem; margin: 0rem'>National University of Singapore, University of Toronto, IIT-Dhanbad, Singapore Technologies Engineering</h2>
215
+ )
216
+
217
+ with gr.Row(variant="panel"):
218
+
219
+ # LEFT COLUMN (satellite, taxonomy, run)
220
+ with gr.Column():
221
+ sat_input = gr.Image(
222
+ label="Satellite Image",
223
+ sources=["upload"],
224
+ type="pil",
225
+ height=320,
226
+ )
227
+ taxonomy_input = gr.Textbox(
228
+ label="Full Taxonomy Name (optional)",
229
+ placeholder="e.g. Animalia Chordata Mammalia Rodentia Sciuridae Marmota marmota",
230
+ )
231
+
232
+ # ─── NEW: sound input ───────────────────────────
233
+ sound_input = gr.Audio(
234
+ label="Sound Input (optional)",
235
+ sources=["upload"], # or "microphone" / "url" as you prefer
236
+ type="filepath", # or "numpy" if you want raw arrays
237
+ )
238
+ run_btn = gr.Button("Run", variant="primary")
239
+
240
+ # RIGHT COLUMN (ground image + two heat-maps)
241
+ with gr.Column():
242
+ ground_input = gr.Image(
243
+ label="Ground-level Image (optional)",
244
+ sources=["upload"],
245
+ type="pil",
246
+ height=320,
247
+ )
248
+ gr.Markdown("### Heat-map Results")
249
+ with gr.Row():
250
+ # Separate label and image to avoid overlap
251
+ with gr.Column(scale=1, min_width=100):
252
+ gr.Markdown("**Ground Image Query**", elem_id="label-ground")
253
+ heat_ground_out = gr.Image(
254
+ show_label=False,
255
+ height=160,
256
+ # width=160,
257
+ )
258
+ with gr.Column(scale=1, min_width=100):
259
+ gr.Markdown("**Text Query**", elem_id="label-text")
260
+ heat_text_out = gr.Image(
261
+ show_label=False,
262
+ height=160,
263
+ # width=160,
264
+ )
265
+ with gr.Column(scale=1, min_width=100):
266
+ gr.Markdown("**Sound Query**", elem_id="label-sound")
267
+ heat_sound_out = gr.Image(
268
+ show_label=False,
269
+ height=160,
270
+ # width=160,
271
+ )
272
+ # ─── NEW: sound output ─────────────────────────
273
+ # sound_output = gr.Audio(
274
+ # label="Playback",
275
+ # )
276
+
277
+
278
+ # EXAMPLES
279
+ with gr.Row():
280
+ gr.Markdown("### In-Domain Taxonomy")
281
+ with gr.Row():
282
+ gr.Examples(
283
+ examples=[
284
+ [
285
+ "examples/Animalia_Chordata_Aves_Charadriiformes_Laridae_Larus_marinus/80645_39.76079_-74.10316.jpg",
286
+ "examples/Animalia_Chordata_Aves_Charadriiformes_Laridae_Larus_marinus/cc1ebaf9-899d-49f2-81c8-d452249a8087.jpg",
287
+ "Animalia Chordata Aves Charadriiformes Laridae Larus marinus",
288
+ "examples/Animalia_Chordata_Aves_Charadriiformes_Laridae_Larus_marinus/89758229.mp3"
289
+ ],
290
+ [
291
+ "examples/Animalia_Chordata_Mammalia_Rodentia_Caviidae_Hydrochoerus_hydrochaeris/28871_-12.80255_-69.29999.jpg",
292
+ "examples/Animalia_Chordata_Mammalia_Rodentia_Caviidae_Hydrochoerus_hydrochaeris/1b8064f8-7deb-4b30-98cd-69da98ba6a3d.jpg",
293
+ "Animalia Chordata Mammalia Rodentia Caviidae Hydrochoerus hydrochaeris",
294
+ "examples/Animalia_Chordata_Mammalia_Rodentia_Caviidae_Hydrochoerus_hydrochaeris/166631961.mp3"
295
+ ],
296
+ [
297
+ "examples/Animalia_Arthropoda_Malacostraca_Decapoda_Ocypodidae_Ocypode_quadrata/277303_38.72364_-75.07749.jpg",
298
+ "examples/Animalia_Arthropoda_Malacostraca_Decapoda_Ocypodidae_Ocypode_quadrata/0b9cc264-a2ba-44bd-8e41-0d01a6edd1e8.jpg",
299
+ "Animalia Arthropoda Malacostraca Decapoda Ocypodidae Ocypode quadrata",
300
+ "examples/Animalia_Arthropoda_Malacostraca_Decapoda_Ocypodidae_Ocypode_quadrata/12372063.mp3"
301
+ ],
302
+ [
303
+ "examples/Animalia_Chordata_Mammalia_Rodentia_Sciuridae_Marmota_marmota/388246_45.49036_7.14796.jpg",
304
+ "examples/Animalia_Chordata_Mammalia_Rodentia_Sciuridae_Marmota_marmota/327e1f07-692b-4140-8a3e-bd098bc064ff.jpg",
305
+ "Animalia Chordata Mammalia Rodentia Sciuridae Marmota marmota",
306
+ "examples/Animalia_Chordata_Mammalia_Rodentia_Sciuridae_Marmota_marmota/59677071.mp3"
307
+ ],
308
+ [
309
+ "examples/Animalia_Chordata_Reptilia_Squamata_Varanidae_Varanus_salvator/410613_5.35573_100.28948.jpg",
310
+ "examples/Animalia_Chordata_Reptilia_Squamata_Varanidae_Varanus_salvator/461d8e6c-0e66-4acc-8ecd-bfd9c218bc14.jpg",
311
+ "Animalia Chordata Reptilia Squamata Varanidae Varanus salvator",
312
+ None
313
+ ],
314
+ ],
315
+ inputs=[sat_input, ground_input, taxonomy_input, sound_input],
316
+ outputs=[heat_ground_out, heat_text_out, heat_sound_out],
317
+ fn=process,
318
+ cache_examples=False,
319
+ )
320
+
321
+ # EXAMPLES
322
+ with gr.Row():
323
+ gr.Markdown("### Out-Domain Taxonomy")
324
+ with gr.Row():
325
+ gr.Examples(
326
+ examples=[
327
+ [
328
+ "examples/Animalia_Chordata_Mammalia_Carnivora_Phocidae_Mirounga_angustirostris/27423_35.64005_-121.17595.jpg",
329
+ "examples/Animalia_Chordata_Mammalia_Carnivora_Phocidae_Mirounga_angustirostris/3aac526d-c921-452a-af6a-cb4f2f52e2c4.jpg",
330
+ "Animalia Chordata Mammalia Carnivora Phocidae Mirounga angustirostris",
331
+ "examples/Animalia_Chordata_Mammalia_Carnivora_Phocidae_Mirounga_angustirostris/3123948.mp3"
332
+ ],
333
+ [
334
+ "examples/Animalia_Chordata_Mammalia_Carnivora_Canidae_Canis_aureus/1528408_13.00422_80.23033.jpg",
335
+ "examples/Animalia_Chordata_Mammalia_Carnivora_Canidae_Canis_aureus/37faabd2-a613-4461-b27e-82fe5955ecaf.jpg",
336
+ "Animalia Chordata Mammalia Carnivora Canidae Canis aureus",
337
+ "examples/Animalia_Chordata_Mammalia_Carnivora_Canidae_Canis_aureus/189318716.mp3"
338
+ ],
339
+ [
340
+ "examples/Animalia_Chordata_Mammalia_Carnivora_Ursidae_Ursus_americanus/yosemite_v3_resized.png",
341
+ "examples/Animalia_Chordata_Mammalia_Carnivora_Ursidae_Ursus_americanus/248820933.jpeg",
342
+ "Animalia Chordata Mammalia Carnivora Ursidae Ursus americanus",
343
+ None
344
+ ],
345
+ [
346
+ "examples/Animalia_Chordata_Mammalia_Carnivora_Canidae_Urocyon_littoralis/304160_34.0144_-119.54417.jpg",
347
+ "examples/Animalia_Chordata_Mammalia_Carnivora_Canidae_Urocyon_littoralis/0cbdfbf2-6cfe-4d61-9602-c949f24d0293.jpg",
348
+ "Animalia Chordata Mammalia Carnivora Canidae Urocyon littoralis",
349
+ None
350
+ ],
351
+ ],
352
+ inputs=[sat_input, ground_input, taxonomy_input, sound_input],
353
+ outputs=[heat_ground_out, heat_text_out, heat_sound_out],
354
+ fn=process,
355
+ cache_examples=False,
356
+ )
357
+
358
+ # CALLBACK
359
+ run_btn.click(
360
+ fn=process,
361
+ inputs=[sat_input, taxonomy_input, ground_input, sound_input],
362
+ outputs=[heat_ground_out, heat_text_out, heat_sound_out],
363
+ )
364
+
365
+ # Footer to point out to model and data from app page.
366
+ gr.Markdown(
367
+ """
368
+ The satellite image CLIP encoder is fine-tuned using [Sentinel-2 Level 2A](https://docs.sentinel-hub.com/api/latest/data/sentinel-2-l2a/) satellite image and taxonomy images (with GPS locations) from [iNaturalist](https://inaturalist.org/). The sound CLIP encoder is fine-tuned with a subset of the same taxonomy images and their corresponding sounds from [iNaturalist](https://inaturalist.org/). Note that while some of the examples above result in poor probability distributions, they will be improved using our test-time adaptation framework during the search process.
369
+ """
370
+ )
371
+
372
+ # LAUNCH
373
+ if __name__ == "__main__":
374
+ demo.queue(max_size=15)
375
+ demo.launch(share=True)
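As a sanity check, the patch-similarity logic in `_similarity_heatmap` above can be exercised in isolation; a minimal sketch with random stand-in embeddings (the 577 = 1 CLS + 24x24 patch tokens and the 512-dim width are illustrative assumptions, not the encoders' exact output shapes):

import numpy as np
import torch

torch.manual_seed(0)
query = torch.randn(1, 512)      # stand-in query embedding (ground image, text or sound)
patches = torch.randn(577, 512)  # stand-in per-patch satellite embeddings: 1 CLS + 24*24 patches
logit_scale = (torch.ones([]) * np.log(1 / 0.07)).exp()

sims = torch.matmul(query, patches.t()) * logit_scale  # (1, 577) similarity logits
sims = sims.t().sigmoid()[1:].squeeze()                # drop the CLS token -> (576,)
side = int(np.sqrt(len(sims)))                         # 24
heatmap = sims.reshape(side, side).cpu().numpy()       # same kind of array fed to _array_to_pil
print(heatmap.shape)                                   # (24, 24)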
env.py ADDED
@@ -0,0 +1,811 @@
1
+ #######################################################################
2
+ # Name: env.py
3
+ #
4
+ # - Reads and processes training and test maps (E.g. DungeonMaps)
5
+ # - Processes rewards, new frontiers given action
6
+ # - Updates a graph representation of environment for input into network
7
+ #######################################################################
8
+
9
+ import sys
10
+
11
+ import cv2
12
+ from matplotlib.colors import LogNorm, PowerNorm
13
+ if sys.modules['TRAINING']:
14
+ from parameter import *
15
+ else:
16
+ from test_parameter import *
17
+
18
+ import copy
19
+ import pandas as pd
20
+ import rasterio
21
+ from skimage import io
22
+ import matplotlib.pyplot as plt
23
+ import os
24
+ from skimage.measure import block_reduce
25
+ from sensor import *
26
+ from graph_generator import *
27
+ from node import *
28
+ from scipy.ndimage import label, find_objects
29
+ import matplotlib.image as mpimg
30
+ from matplotlib.colors import Normalize
31
+
32
+ # import matplotlib
33
+ # matplotlib.use("Agg") # <-- key line to avoid tkinter dependency
34
+
35
+
36
+ class Env():
37
+ def __init__(self, map_index, n_agent, k_size=20, plot=False, test=False, mask_index=None):
38
+ self.n_agent = n_agent
39
+ self.test = test
40
+ self.map_dir = GRIDMAP_SET_DIR
41
+
42
+ # Import environment gridmap
43
+ self.map_list = os.listdir(self.map_dir)
44
+ self.map_list.sort(reverse=True)
45
+
46
+ # NEW: Import segmentation utility map
47
+ self.seg_dir = MASK_SET_DIR
48
+ self.segmentation_mask, self.target_positions, self.target_found_idxs = None, [], []
49
+ self.segmentation_mask_list = os.listdir(self.seg_dir)
50
+ self.segmentation_mask_list.sort(reverse=True)
51
+
52
+ # Import target maps (if relevant)
53
+ if TARGETS_SET_DIR != "":
54
+ self.targets_map_list = os.listdir(TARGETS_SET_DIR)
55
+ self.targets_map_list.sort(reverse=True)
56
+
57
+ # # NEW: Find common files in both directories
58
+ # if TARGETS_SET_DIR != "":
59
+ # common_files = [file for file in self.map_list if file in self.segmentation_mask_list and file in self.targets_map_list]
60
+ # else:
61
+ # common_files = [file for file in self.map_list if file in self.segmentation_mask_list]
62
+ self.map_index = map_index % len(self.map_list)
63
+ if mask_index is not None:
64
+ self.mask_index = mask_index % len(self.segmentation_mask_list)
65
+ else:
66
+ self.mask_index = map_index % len(self.segmentation_mask_list)
67
+ # self.common_map_file = common_files[self.map_index]
68
+ # print("self.common_map_file: ", self.common_map_file)
69
+
70
+ # Import ground truth and segmentation mask
71
+ self.ground_truth, self.map_start_position = self.import_ground_truth(
72
+ os.path.join(self.map_dir, self.map_list[self.map_index]))# self.common_map_file))
73
+ self.ground_truth_size = np.shape(self.ground_truth) # (480, 640)
74
+ self.robot_belief = np.ones(self.ground_truth_size) * 127 # unexplored 127
75
+ self.downsampled_belief = None
76
+ self.old_robot_belief = copy.deepcopy(self.robot_belief)
77
+ self.coverage_belief = np.ones(self.ground_truth_size) * 127 # unexplored 127
78
+
79
+ # Import segmentation mask
80
+ mask_filename = self.segmentation_mask_list[self.mask_index]
81
+ self.segmentation_mask = self.import_segmentation_mask(
82
+ os.path.join(self.seg_dir, mask_filename))# self.common_map_file))
83
+ # print("mask_filename: ", mask_filename)
84
+
85
+ # Overwrite target positions if directory specified
86
+ self.gt_segmentation_mask = None
87
+ if self.test and TARGETS_SET_DIR != "":
88
+ self.gt_segmentation_mask = self.import_segmentation_mask(
89
+ os.path.join(TARGETS_SET_DIR, self.map_list[self.map_index])) # UNUSED - self.common_map_file))
90
+ # print("target_positions: ", self.target_positions)
91
+ # print("np.unique(self.segmentation_mask): ", np.unique(self.segmentation_mask))
92
+
93
+ self.segmentation_info_mask = None
94
+ self.gt_segmentation_info_mask = None
95
+ self.segmentation_info_mask_unnormalized = None
96
+ self.filtered_seg_info_mask = None
97
+ self.num_targets_found = 0
98
+ self.num_new_targets_found = 0
99
+
100
+ # # Link score masks to raw image files
101
+ # csv_file = pd.read_csv(RAW_IMG_PATH_DICT, header=None)
102
+ # img_score_paths = csv_file.iloc[:,2].tolist()
103
+ # raw_img_paths = csv_file.iloc[:,0].tolist()
104
+ # self.score_to_img_dict = {os.path.basename(img_score_path): raw_img_path for img_score_path, raw_img_path in zip(img_score_paths, raw_img_paths)}
105
+
106
+ self.resolution = 4
107
+ self.sensor_range = SENSOR_RANGE
108
+ self.explored_rate = 0
109
+ self.targets_found_rate = 0
110
+ self.info_gain = 0
111
+ self.total_info = 0
112
+
113
+ self.graph_generator = Graph_generator(map_size=self.ground_truth_size, sensor_range=self.sensor_range, k_size=k_size, plot=plot)
114
+ self.node_coords, self.graph, self.node_utility, self.guidepost = None, None, None, None
115
+
116
+ self.frontiers = None
117
+
118
+ self.start_positions = []
119
+ self.begin(self.map_start_position)
120
+
121
+ self.plot = plot
122
+ self.frame_files = []
123
+
124
+ def find_index_from_coords(self, position):
125
+ index = np.argmin(np.linalg.norm(self.node_coords - position, axis=1))
126
+ return index
127
+
128
+ def begin(self, start_position):
129
+ # self.robot_belief = self.update_robot_belief(robot_position, self.sensor_range, self.robot_belief,
130
+ # self.ground_truth)
131
+ self.robot_belief = self.ground_truth
132
+
133
+ self.downsampled_belief = block_reduce(self.robot_belief.copy(), block_size=(self.resolution, self.resolution),
134
+ func=np.min)
135
+ self.frontiers = self.find_frontier()
136
+ self.old_robot_belief = copy.deepcopy(self.robot_belief)
137
+
138
+ self.node_coords, self.graph, self.node_utility, self.guidepost = self.graph_generator.generate_graph(
139
+ self.robot_belief, self.frontiers)
140
+
141
+ # Find non-conflicting start positions
142
+ if FIX_START_POSITION:
143
+ coords_res_row = int(self.robot_belief.shape[0]/NUM_COORDS_HEIGHT)
144
+ coords_res_col = int(self.robot_belief.shape[1]/NUM_COORDS_WIDTH)
145
+ self.start_positions = [(int(self.robot_belief.shape[1]/2)-coords_res_col/2,int(self.robot_belief.shape[0]/2)-coords_res_row/2) for _ in range(self.n_agent)] # bottom-left corner
146
+ else:
147
+ nearby_coords = self.graph_generator.get_neighbors_grid_coords(start_position)
148
+ itr = 0
149
+ for i in range(self.n_agent):
150
+ if i == 0 or len(nearby_coords) == 0:
151
+ self.start_positions.append(start_position)
152
+ else:
153
+ idx = min(itr, len(nearby_coords)-1)
154
+ self.start_positions.append(nearby_coords[idx])
155
+ itr += 1
156
+
157
+ for i in range(len(self.start_positions)):
158
+ self.start_positions[i] = self.node_coords[self.find_index_from_coords(self.start_positions[i])]
159
+ self.coverage_belief = self.update_robot_belief(self.start_positions[i], self.sensor_range, self.coverage_belief,
160
+ self.ground_truth)
161
+
162
+ for start_position in self.start_positions:
163
+ self.graph_generator.route_node.append(start_position)
164
+
165
+ # Info map from ground truth
166
+ rng_x = 0.5 * (self.ground_truth.shape[1] / NUM_COORDS_WIDTH)
167
+ rng_y = 0.5 * (self.ground_truth.shape[0] / NUM_COORDS_HEIGHT)
168
+ self.segmentation_info_mask = np.zeros((len(self.node_coords), 1))
169
+ self.gt_segmentation_info_mask = np.zeros((len(self.node_coords), 1))
170
+ for i, node_coord in enumerate(self.node_coords):
171
+ max_x = min(node_coord[0] + int(math.ceil(rng_x)), self.ground_truth.shape[1])
172
+ min_x = max(node_coord[0] - int(math.ceil(rng_x)), 0)
173
+ max_y = min(node_coord[1] + int(math.ceil(rng_y)), self.ground_truth.shape[0])
174
+ min_y = max(node_coord[1] - int(math.ceil(rng_y)), 0)
175
+
176
+ # if np.any(self.segmentation_mask[min_y:max_y, min_x:max_x] == 255):
177
+ # self.segmentation_info_mask[i] = 1.0
178
+ # else:
179
+ # self.segmentation_info_mask[i] = 0.0
180
+ # self.segmentation_info_mask[i] = np.mean(self.segmentation_mask[min_y:max_y, min_x:max_x])
181
+ # self.segmentation_info_mask[i] = np.max(self.segmentation_mask[min_y:max_y, min_x:max_x])
182
+ if TARGETS_SET_DIR == "": # If targets combined with segmentation mask
183
+ exclude = {208} # Exclude target positions
184
+ else:
185
+ exclude = {}
186
+ self.segmentation_info_mask[i] = max(x for x in self.segmentation_mask[min_y:max_y, min_x:max_x].flatten() if x not in exclude) / 100.0
187
+ if self.gt_segmentation_mask is not None:
188
+ self.gt_segmentation_info_mask[i] = max(x for x in self.gt_segmentation_mask[min_y:max_y, min_x:max_x].flatten() if x not in exclude) / 100.0
189
+ # print("np.unique(self.segmentation_info_mask): ", np.unique(self.segmentation_info_mask))
190
+ self.filtered_seg_info_mask = copy.deepcopy(self.segmentation_info_mask)
191
+
192
+ # In case targets found at beginning...
193
+ done, num_targets_found = self.check_done()
194
+ self.num_targets_found = num_targets_found
195
+
196
+
197
+ def multi_robot_step(self, next_position_list, dist_list, travel_dist_list):
198
+ temp_frontiers = copy.deepcopy(self.frontiers)
199
+ reward_list = []
200
+ for dist, robot_position in zip(dist_list, next_position_list):
201
+ self.graph_generator.route_node.append(robot_position)
202
+ next_node_index = self.find_index_from_coords(robot_position)
203
+ self.graph_generator.nodes_list[next_node_index].set_visited()
204
+ # self.robot_belief = self.update_robot_belief(robot_position, self.sensor_range, self.robot_belief,
205
+ # self.ground_truth)
206
+ self.coverage_belief = self.update_robot_belief(robot_position, self.sensor_range, self.coverage_belief,
207
+ self.ground_truth)
208
+ self.robot_belief = self.ground_truth
209
+
210
+ self.downsampled_belief = block_reduce(self.robot_belief.copy(),
211
+ block_size=(self.resolution, self.resolution),
212
+ func=np.min)
213
+
214
+ frontiers = self.find_frontier()
215
+ #num_observed_frontiers = self.calculate_num_observed_frontiers(temp_frontiers, frontiers)
216
+ #temp_frontiers = frontiers
217
+
218
+ num_observed_frontiers = self.node_utility[next_node_index]
219
+
220
+ # individual_reward = num_observed_frontiers / 50 - dist / 64
221
+ individual_reward = -dist / 32 # 64
222
+
223
+ info_gain_reward = 0
224
+ robot_position_idx = self.find_index_from_coords(robot_position)
225
+ # if self.segmentation_info_mask[robot_position_idx] == 1.0 and self.guidepost[robot_position_idx] == 0.0:
226
+ # # print("High Info (Unvisited)")
227
+ # info_gain_reward = (HIGH_INFO_REWARD_RATIO * 1.5)
228
+ # elif self.segmentation_info_mask[robot_position_idx] == 0.0 and self.guidepost[robot_position_idx] == 0.0:
229
+ # # print("Low Info (Unvisited)")
230
+ # info_gain_reward = ((1-HIGH_INFO_REWARD_RATIO) * 1.5)
231
+ info_gain_reward = self.filtered_seg_info_mask[robot_position_idx][0] * 1.5
232
+ if self.guidepost[robot_position_idx] == 0.0:
233
+ info_gain_reward += 0.2
234
+ # print("info_gain_reward: ", info_gain_reward)
235
+ individual_reward += info_gain_reward
236
+
237
+ # print("dist / 64: ", dist / 64)
238
+ # print("info gain reward: ", info_gain_reward)
239
+
240
+ reward_list.append(individual_reward)
241
+
242
+ self.node_coords, self.graph, self.node_utility, self.guidepost = self.graph_generator.update_graph(self.robot_belief, self.old_robot_belief, frontiers, self.frontiers)
243
+ self.old_robot_belief = copy.deepcopy(self.robot_belief)
244
+
245
+ self.filtered_seg_info_mask = [info[0] if self.guidepost[i] == 0.0 else 0.0 for i, info in enumerate(self.segmentation_info_mask)]
246
+ self.filtered_seg_info_mask = np.expand_dims(np.array(self.filtered_seg_info_mask), axis=1)
247
+
248
+ num_observed_frontiers = self.calculate_num_observed_frontiers(self.frontiers, frontiers)
249
+ self.frontiers = frontiers
250
+ self.explored_rate = self.evaluate_exploration_rate()
251
+
252
+ done, num_targets_found = self.check_done()
253
+ self.num_new_targets_found = num_targets_found - self.num_targets_found
254
+ # #team_reward = sum(reward_list) / len(reward_list)
255
+ # # team_reward = num_observed_frontiers / 50
256
+ # team_reward = self.num_new_targets_found * 5.0
257
+ team_reward = 0.0
258
+ # # print("target found reward: ", self.num_new_targets_found * 5.0)
259
+
260
+ self.num_targets_found = num_targets_found
261
+ self.targets_found_rate = self.evaluate_targets_found_rate()
262
+ self.info_gain, self.total_info = self.evaluate_info_gain()
263
+
264
+ if done:
265
+ # team_reward += np.sum(self.robot_belief == 255) / sum(travel_dist_list)
266
+ team_reward += 40 # 20
267
+ for i in range(len(reward_list)):
268
+ reward_list[i] += team_reward
269
+
270
+ return reward_list, done
271
+
272
+
273
+ def import_ground_truth(self, map_index):
274
+ # occupied 1, free 255, unexplored 127
275
+
276
+ try:
277
+ # ground_truth = (io.imread(map_index, 1) * 255).astype(int)
278
+ ground_truth = (io.imread(map_index, 1)).astype(int)
279
+ if np.all(ground_truth == 0):
280
+ ground_truth = (io.imread(map_index, 1) * 255).astype(int)
281
+ except:
282
+ new_map_index = self.map_dir + '/' + self.map_list[0]
283
+ ground_truth = (io.imread(new_map_index, 1)).astype(int)
284
+ print('could not read the map_path ({}), hence skipping it and using ({}).'.format(map_index, new_map_index))
285
+
286
+ robot_location = np.nonzero(ground_truth == 208)
287
+
288
+ # print("robot_location: ", robot_location)
289
+ # print("np.array(robot_location)[1, 127]: ", np.array(robot_location)[1, 127])
290
+
291
+ robot_location = np.array([np.array(robot_location)[1, 127], np.array(robot_location)[0, 127]])
292
+ ground_truth = (ground_truth > 150)
293
+ ground_truth = ground_truth * 254 + 1
294
+ return ground_truth, robot_location
295
+
296
+
297
+ def import_segmentation_mask(self, map_index):
298
+ # occupied 1, free 255, unexplored 127
299
+
300
+ # mask = (io.imread(map_index, 1) * 255).astype(int) # NOTE: Cannot work well with seg mask self-generated
301
+ mask = cv2.imread(map_index).astype(int)
302
+ # print("np.unique(segmentation_mask): ", np.unique(mask))
303
+
304
+ # NOTE: Could contain multiple start positions
305
+ # target_position = np.nonzero(mask == 208)
306
+ # target_positions = self.find_target_locations(mask)
307
+
308
+ # target_position = np.array([np.array(target_position)[1, 127], np.array(target_position)[0, 127]])
309
+ return mask #, target_positions
310
+
311
+
312
+ def find_target_locations(self, image_array, grey_value=208):
313
+ # Load the image
314
+ # image = Image.open(image_path)
315
+ # image_array = np.array(image)
316
+
317
+ # Identify pixels equal to the grey value
318
+ grey_pixels = np.where(image_array == grey_value)
319
+
320
+ # Create a binary array where grey pixels are marked as True
321
+ binary_array = np.zeros_like(image_array, dtype=bool)
322
+ binary_array[grey_pixels] = True
323
+
324
+ # Label connected components
325
+ labeled_array, num_features = label(binary_array)
326
+
327
+ # Find objects returns slices for each connected component
328
+ slices = find_objects(labeled_array)
329
+
330
+ # Calculate the center of each box
331
+ centers = []
332
+ for slice in slices:
333
+ row_center = (slice[0].start + slice[0].stop - 1) // 2
334
+ col_center = (slice[1].start + slice[1].stop - 1) // 2
335
+ centers.append((col_center, row_center)) # (y,x)
336
+
337
+ return centers
338
+
339
+ def free_cells(self):
340
+ index = np.where(self.ground_truth == 255)
341
+ free = np.asarray([index[1], index[0]]).T
342
+ return free
343
+
344
+ def update_robot_belief(self, robot_position, sensor_range, robot_belief, ground_truth):
345
+ robot_belief = sensor_work(robot_position, sensor_range, robot_belief, ground_truth)
346
+ return robot_belief
347
+
348
+
349
+ def check_done(self, robot_id=0):
350
+ """ All agnets to have explored most of the env map """
351
+ done = False
352
+ # for idx in range(self.n_agent):
353
+ # if np.sum(self.ground_truth == 255) - np.sum(self.all_robot_belief[idx][idx] == 255) > 40:
354
+ # done = False
355
+
356
+ # NEW: ADDITIONAL VLM SEARCH CRITERIA
357
+ num_targets_found = 0
358
+ self.target_found_idxs = []
359
+ for i, target in enumerate(self.target_positions):
360
+ if self.coverage_belief[target[1], target[0]] == 255: # 255:
361
+ num_targets_found += 1
362
+ self.target_found_idxs.append(i)
363
+ # free_cells_mask = self.all_robot_belief[robot_id][robot_id] == 255
364
+ # filtered_segmentation_mask = np.where(free_cells_mask, self.segmentation_mask, 0)
365
+ # targets = self.find_target_locations(filtered_segmentation_mask)
366
+ # print("num_targets_found: ", num_targets_found)
367
+
368
+ if TERMINATE_ON_TGTS_FOUND and num_targets_found >= len(self.target_positions):
369
+ done = True
370
+ if not TERMINATE_ON_TGTS_FOUND and np.sum(self.coverage_belief == 255) / np.sum(self.ground_truth == 255) >= 0.99:
371
+ done = True
372
+
373
+ return done, num_targets_found
374
+
375
+
376
+ def calculate_num_observed_frontiers(self, old_frontiers, frontiers):
377
+ frontiers_to_check = frontiers[:, 0] + frontiers[:, 1] * 1j
378
+ pre_frontiers_to_check = old_frontiers[:, 0] + old_frontiers[:, 1] * 1j
379
+ frontiers_num = np.intersect1d(frontiers_to_check, pre_frontiers_to_check).shape[0]
380
+ pre_frontiers_num = pre_frontiers_to_check.shape[0]
381
+ delta_num = pre_frontiers_num - frontiers_num
382
+
383
+ return delta_num
384
+
385
+ def calculate_reward(self, dist, frontiers):
386
+ reward = 0
387
+ reward -= dist / 64
388
+
389
+ frontiers_to_check = frontiers[:, 0] + frontiers[:, 1] * 1j
390
+ pre_frontiers_to_check = self.frontiers[:, 0] + self.frontiers[:, 1] * 1j
391
+ frontiers_num = np.intersect1d(frontiers_to_check, pre_frontiers_to_check).shape[0]
392
+ pre_frontiers_num = pre_frontiers_to_check.shape[0]
393
+ delta_num = pre_frontiers_num - frontiers_num
394
+
395
+ reward += delta_num / 50
396
+
397
+ return reward
398
+
399
+ def evaluate_exploration_rate(self):
400
+ # rate = np.sum(self.robot_belief == 255) / np.sum(self.ground_truth == 255)
401
+ rate = np.sum(self.coverage_belief == 255) / np.sum(self.ground_truth == 255)
402
+ return rate
403
+
404
+ def evaluate_targets_found_rate(self):
405
+ if len(self.target_positions) == 0:
406
+ return 0
407
+ else:
408
+ rate = self.num_targets_found / len(self.target_positions)
409
+ return rate
410
+
411
+ def evaluate_info_gain(self):
412
+ # print("self.segmentation_mask.shape: ", self.segmentation_mask.shape)
413
+ # coverage_belief = (self.coverage_belief == 255)
414
+ # print("coverage_belief.shape: ", coverage_belief.shape)
415
+ # print("np.unique(coverage_belief): ", np.unique(coverage_belief))
416
+ # print("np.count_nonzero(coverage_belief): ", np.count_nonzero(coverage_belief))
417
+ # print("np.count_zero(coverage_belief): ", coverage_belief.size - np.count_nonzero(coverage_belief))
418
+ # print("self.segmentation_mask[self.coverage_belief == 255].shape: ", self.segmentation_mask[self.coverage_belief == 255].shape)
419
+ if self.test and TARGETS_SET_DIR != "":
420
+ info_gained = np.sum(self.gt_segmentation_mask[self.coverage_belief == 255]) / 100.0
421
+ total_info = np.sum(self.gt_segmentation_mask) / 100.0
422
+ else:
423
+ info_gained = np.sum(self.segmentation_mask[self.coverage_belief == 255]) / 100.0
424
+ total_info = np.sum(self.segmentation_mask) / 100.0
425
+ return info_gained, total_info
426
+
427
+ def calculate_new_free_area(self):
428
+ old_free_area = self.old_robot_belief == 255
429
+ current_free_area = self.robot_belief == 255
430
+
431
+ new_free_area = (current_free_area.astype(np.int) - old_free_area.astype(np.int)) * 255
432
+
433
+ return new_free_area, np.sum(old_free_area)
434
+
435
+ def calculate_dist_path(self, path):
436
+ dist = 0
437
+ start = path[0]
438
+ end = path[-1]
439
+ for index in path:
440
+ if index == end:
441
+ break
442
+ dist += np.linalg.norm(self.node_coords[start] - self.node_coords[index])
443
+ start = index
444
+ return dist
445
+
446
+ def find_frontier(self):
447
+ y_len = self.downsampled_belief.shape[0]
448
+ x_len = self.downsampled_belief.shape[1]
449
+ mapping = self.downsampled_belief.copy()
450
+ belief = self.downsampled_belief.copy()
451
+ # 0-1 unknown area map
452
+ mapping = (mapping == 127) * 1
453
+ mapping = np.lib.pad(mapping, ((1, 1), (1, 1)), 'constant', constant_values=0)
454
+ fro_map = mapping[2:][:, 1:x_len + 1] + mapping[:y_len][:, 1:x_len + 1] + mapping[1:y_len + 1][:, 2:] + \
455
+ mapping[1:y_len + 1][:, :x_len] + mapping[:y_len][:, 2:] + mapping[2:][:, :x_len] + mapping[2:][:,
456
+ 2:] + \
457
+ mapping[:y_len][:, :x_len]
458
+ ind_free = np.where(belief.ravel(order='F') == 255)[0]
459
+ ind_fron_1 = np.where(1 < fro_map.ravel(order='F'))[0]
460
+ ind_fron_2 = np.where(fro_map.ravel(order='F') < 8)[0]
461
+ ind_fron = np.intersect1d(ind_fron_1, ind_fron_2)
462
+ ind_to = np.intersect1d(ind_free, ind_fron)
463
+
464
+ map_x = x_len
465
+ map_y = y_len
466
+ x = np.linspace(0, map_x - 1, map_x)
467
+ y = np.linspace(0, map_y - 1, map_y)
468
+ t1, t2 = np.meshgrid(x, y)
469
+ points = np.vstack([t1.T.ravel(), t2.T.ravel()]).T
470
+
471
+ f = points[ind_to]
472
+ f = f.astype(int)
473
+
474
+ f = f * self.resolution
475
+
476
+ return f
477
+
478
+ def plot_env(self, n, path, step, travel_dist, robots_route, img_path_override=None, sat_path_override=None, msk_name_override=None, sound_id_override=None, colormap_mid_val=None):
479
+
480
+ # # TEMP
481
+ # if TAXABIND_TTA:
482
+ # # Save self.segmentation_info_mask as .npy file in gifs_path
483
+ # side_dim = int(np.sqrt(self.segmentation_info_mask.shape[0]))
484
+ # mask_viz = self.segmentation_info_mask.squeeze().reshape((side_dim, side_dim)).T
485
+ # np.save(os.path.join(path, f"seg_mask_step{step}.npy"), mask_viz)
486
+
487
+ plt.switch_backend('agg')
488
+ # plt.ion()
489
+ plt.cla()
490
+ color_list = ["r", "g", "c", "m", "y", "k"]
491
+
492
+ if TARGETS_SET_DIR == "" and not TAXABIND_TTA:
493
+ fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))
494
+ else:
495
+ fig, ((ax1, ax2, ax3), (ax4, ax5, ax6)) = plt.subplots(2, 3, figsize=(12, 8))
496
+
497
+ ### Fig1: Environment ###
498
+ msk_name = ""
499
+ if TAXABIND_TTA:
500
+ image = mpimg.imread(sat_path_override)
501
+ msk_name = msk_name_override
502
+ # else:
503
+ # plt.imshow(self.robot_belief, cmap='gray')
504
+ # ax1.imshow(self.coverage_belief, cmap='gray')
505
+ # image = mpimg.imread("Maps/real_maps/real/4259_masked_img_0.jpg")
506
+ # msk_name = self.map_list[self.map_index]
507
+ # raw_img_path = self.score_to_img_dict[msk_name]
508
+ # if "flair" in raw_img_path:
509
+ # with rasterio.open(raw_img_path) as src_img:
510
+ # image = src_img.read([1,2,3])
511
+ # image = np.transpose(image, (1, 2, 0))
512
+ # else:
513
+ # image = mpimg.imread(raw_img_path)
514
+
515
+
516
+ ### Fig1: Environment ###
517
+ ax = ax1 # if TAXABIND_TTA else ax1
518
+ ax.imshow(image)
519
+ ax.axis((0, self.ground_truth_size[1], self.ground_truth_size[0], 0))
520
+ ax.set_title("Image")
521
+ # if VIZ_GRAPH_EDGES:
522
+ # for i in range(len(self.graph_generator.x)):
523
+ # ax.plot(self.graph_generator.x[i], self.graph_generator.y[i], 'tan', zorder=1)
524
+ # # ax.scatter(self.node_coords[:, 0], self.node_coords[:, 1], c=self.node_utility, zorder=5)
525
+ # ax.scatter(self.node_coords[:, 0], self.node_coords[:, 1], c=self.segmentation_info_mask, zorder=5)
526
+ # ax.scatter(self.frontiers[:, 0], self.frontiers[:, 1], c='r', s=2, zorder=3)
527
+ for i, route in enumerate(robots_route):
528
+ robot_marker_color = color_list[i % len(color_list)]
529
+ xPoints = route[0]
530
+ yPoints = route[1]
531
+ ax.plot(xPoints, yPoints, c=robot_marker_color, linewidth=2)
532
+ # ax.plot(xPoints[-1], yPoints[-1], 'mo', markersize=8, zorder=10)
533
+ ax.plot(xPoints[-1], yPoints[-1], markersize=12, zorder=99, marker="^", ls="-", c=robot_marker_color, mec="black")
534
+ ax.plot(xPoints[0], yPoints[0], 'co', c=robot_marker_color, markersize=8, zorder=5)
535
+
536
+ # Sensor range
537
+ rng_x = 0.5 * (self.ground_truth.shape[1] / NUM_COORDS_WIDTH)
538
+ rng_y = 0.5 * (self.ground_truth.shape[0] / NUM_COORDS_HEIGHT)
539
+ max_x = min(xPoints[-1] + int(math.ceil(rng_x)), self.ground_truth.shape[1])
540
+ min_x = max(xPoints[-1] - int(math.ceil(rng_x)), 0)
541
+ max_y = min(yPoints[-1] + int(math.ceil(rng_y)), self.ground_truth.shape[0])
542
+ min_y = max(yPoints[-1] - int(math.ceil(rng_y)), 0)
543
+ ax.plot((min_x, min_x), (min_y, max_y), c=robot_marker_color, linewidth=1)
544
+ ax.plot((min_x, max_x), (max_y, max_y), c=robot_marker_color, linewidth=1)
545
+ ax.plot((max_x, max_x), (max_y, min_y), c=robot_marker_color, linewidth=1)
546
+ ax.plot((max_x, min_x), (min_y, min_y), c=robot_marker_color, linewidth=1)
547
+
548
+ ### Fig2: Graph ###
549
+ ax = ax4 if TAXABIND_TTA else ax1
550
+ # ax.imshow(image)
551
+ ax.imshow(self.coverage_belief, cmap='gray')
552
+ ax.axis((0, self.ground_truth_size[1], self.ground_truth_size[0], 0))
553
+ ax.set_title("Information Graph")
554
+ if VIZ_GRAPH_EDGES:
555
+ for i in range(len(self.graph_generator.x)):
556
+ ax.plot(self.graph_generator.x[i], self.graph_generator.y[i], 'tan', zorder=1)
557
+ # ax.scatter(self.node_coords[:, 0], self.node_coords[:, 1], c=self.node_utility, zorder=5)
558
+ # ax.scatter(self.node_coords[:, 0], self.node_coords[:, 1], c=self.segmentation_info_mask, zorder=5)
559
+ # filtered_seg_info_mask = [info[0] if self.guidepost[i] == 0.0 else 0.0 for i, info in enumerate(self.segmentation_info_mask)]
560
+ ax.scatter(self.node_coords[:, 0], self.node_coords[:, 1], c=self.filtered_seg_info_mask, zorder=5, s=8)
561
+ # ax.scatter(self.frontiers[:, 0], self.frontiers[:, 1], c='r', s=2, zorder=3)
562
+
563
+ for i, route in enumerate(robots_route):
564
+ robot_marker_color = color_list[i % len(color_list)]
565
+ xPoints = route[0]
566
+ yPoints = route[1]
567
+ ax.plot(xPoints, yPoints, c=robot_marker_color, linewidth=2)
568
+ # ax.plot(xPoints[-1], yPoints[-1], 'mo', markersize=8, zorder=10)
569
+ ax.plot(xPoints[-1], yPoints[-1], markersize=12, zorder=99, marker="^", ls="-", c=robot_marker_color, mec="black")
570
+ ax.plot(xPoints[0], yPoints[0], 'co', c=robot_marker_color, markersize=8, zorder=5)
571
+
572
+ # Sensor range
573
+ rng_x = 0.5 * (self.ground_truth.shape[1] / NUM_COORDS_WIDTH)
574
+ rng_y = 0.5 * (self.ground_truth.shape[0] / NUM_COORDS_HEIGHT)
575
+ max_x = min(xPoints[-1] + int(math.ceil(rng_x)), self.ground_truth.shape[1])
576
+ min_x = max(xPoints[-1] - int(math.ceil(rng_x)), 0)
577
+ max_y = min(yPoints[-1] + int(math.ceil(rng_y)), self.ground_truth.shape[0])
578
+ min_y = max(yPoints[-1] - int(math.ceil(rng_y)), 0)
579
+ ax.plot((min_x, min_x), (min_y, max_y), c=robot_marker_color, linewidth=1)
580
+ ax.plot((min_x, max_x), (max_y, max_y), c=robot_marker_color, linewidth=1)
581
+ ax.plot((max_x, max_x), (max_y, min_y), c=robot_marker_color, linewidth=1)
582
+ ax.plot((max_x, min_x), (min_y, min_y), c=robot_marker_color, linewidth=1)
583
+
584
+ # Plot target positions
585
+ for target in self.target_positions:
586
+ if self.coverage_belief[target[1], target[0]] == 255:
587
+ # ax.plot(target[0], target[1], 'go', markersize=8, zorder=99)
588
+ ax.plot(target[0], target[1], color='g', marker='x', linestyle='-', markersize=12, markeredgewidth=4, zorder=99)
589
+ else:
590
+ # ax.plot(target[0], target[1], 'ro', markersize=8, zorder=99)
591
+ ax.plot(target[0], target[1], color='r', marker='x', linestyle='-', markersize=12, markeredgewidth=4, zorder=99)
592
+
593
+ # ax.pause(0.1)
594
+
595
+ ### Fig3: Segmentation Mask ###
596
+ ax = ax5 if TAXABIND_TTA else ax2
597
+ if TAXABIND_TTA and USE_CLIP_PREDS:
598
+ side_dim = int(np.sqrt(self.segmentation_info_mask.shape[0]))
599
+ mask_viz = self.segmentation_info_mask.squeeze().reshape((side_dim, side_dim)).T
600
+ scale_y = math.ceil(self.ground_truth_size[1] / side_dim)
601
+ scale_x = math.ceil(self.ground_truth_size[0] / side_dim)
602
+ upscaled_mask_viz = np.kron(mask_viz, np.ones((scale_y, scale_x))) # Integer scaling only
603
+ upscaled_mask_viz = upscaled_mask_viz[:self.ground_truth_size[1], :self.ground_truth_size[0]]
604
+ im = ax.imshow(upscaled_mask_viz, cmap="viridis")
605
+ ax.axis("off")
606
+ else:
607
+ im = ax.imshow(self.segmentation_mask.mean(axis=-1), cmap='viridis', vmin=0, vmax=100) # cmap='gray'
608
+ ax.axis((0, self.ground_truth_size[1], self.ground_truth_size[0], 0))
609
+ ax.set_title(f"Predicted Mask (Normalized)")
610
+ for i, route in enumerate(robots_route):
611
+ robot_marker_color = color_list[i % len(color_list)]
612
+ xPoints = route[0]
613
+ yPoints = route[1]
614
+ ax.plot(xPoints, yPoints, c=robot_marker_color, linewidth=2)
615
+ # ax.plot(xPoints[-1], yPoints[-1], 'mo', markersize=8, zorder=10)
616
+ ax.plot(xPoints[-1], yPoints[-1], markersize=12, zorder=99, marker="^", ls="-", c=robot_marker_color, mec="black")
617
+ ax.plot(xPoints[0], yPoints[0], 'co', c=robot_marker_color, markersize=8, zorder=5)
618
+
619
+ # Sensor range
620
+ rng_x = 0.5 * (self.ground_truth.shape[1] / NUM_COORDS_WIDTH)
621
+ rng_y = 0.5 * (self.ground_truth.shape[0] / NUM_COORDS_HEIGHT)
622
+ max_x = min(xPoints[-1] + int(math.ceil(rng_x)), self.ground_truth.shape[1])
623
+ min_x = max(xPoints[-1] - int(math.ceil(rng_x)), 0)
624
+ max_y = min(yPoints[-1] + int(math.ceil(rng_y)), self.ground_truth.shape[0])
625
+ min_y = max(yPoints[-1] - int(math.ceil(rng_y)), 0)
626
+ ax.plot((min_x, min_x), (min_y, max_y), c=robot_marker_color, linewidth=1)
627
+ ax.plot((min_x, max_x), (max_y, max_y), c=robot_marker_color, linewidth=1)
628
+ ax.plot((max_x, max_x), (max_y, min_y), c=robot_marker_color, linewidth=1)
629
+ ax.plot((max_x, min_x), (min_y, min_y), c=robot_marker_color, linewidth=1)
630
+
631
+ # Add a colorbar
632
+ cbar = fig.colorbar(im, ax=ax, fraction=0.046, pad=0.04)
633
+ cbar.set_label("Normalized Probs")
634
+
635
+ # ax.pause(0.1)
636
+
637
+ ### Fig4: Segmentation Mask ###
638
+ if TAXABIND_TTA and USE_CLIP_PREDS:
639
+ ax = ax6
640
+ side_dim = int(np.sqrt(self.segmentation_info_mask_unnormalized.shape[0]))
641
+ mask_viz = self.segmentation_info_mask_unnormalized.squeeze().reshape((side_dim, side_dim)).T
642
+ scale_y = math.ceil(self.ground_truth_size[1] / side_dim)
643
+ scale_x = math.ceil(self.ground_truth_size[0] / side_dim)
644
+ upscaled_mask_viz = np.kron(mask_viz, np.ones((scale_y, scale_x))) # Integer scaling only
645
+ upscaled_mask_viz = upscaled_mask_viz[:self.ground_truth_size[1], :self.ground_truth_size[0]]
646
+
647
+ max_val = 0.15 # TO CHANGE
648
+ mid_val = colormap_mid_val if colormap_mid_val is not None else 0.05
649
+ # mid_val = np.max(self.segmentation_info_mask_unnormalized)
650
+ norm = CustomNorm(vmin=0.0, vmax=max_val, mid=mid_val, lower_portion=0.8)
651
+ im = ax.imshow(upscaled_mask_viz, cmap="viridis", norm=norm) # norm=LogNorm(vmin=0.01, vmax=0.1))
652
+ # norm = PowerNorm(gamma=0.25, vmin=0.01, vmax=0.2)
653
+ # norm=LogNorm(vmin=0.01, vmax=0.2)
654
+ im = ax.imshow(upscaled_mask_viz, cmap="viridis", norm=norm) # norm=LogNorm(vmin=0.01, vmax=0.1))
655
+ ax.axis("off")
656
+ # else:
657
+ # im = ax.imshow(self.segmentation_mask.mean(axis=-1), cmap='viridis', vmin=0, vmax=100) # cmap='gray'
658
+ # ax.axis((0, self.ground_truth_size[1], self.ground_truth_size[0], 0))
659
+ ax.set_title(f"Predicted Mask (Unnormalized)")
660
+ for i, route in enumerate(robots_route):
661
+ robot_marker_color = color_list[i % len(color_list)]
662
+ xPoints = route[0]
663
+ yPoints = route[1]
664
+ ax.plot(xPoints, yPoints, c=robot_marker_color, linewidth=2)
665
+ # ax.plot(xPoints[-1], yPoints[-1], 'mo', markersize=8, zorder=10)
666
+ ax.plot(xPoints[-1], yPoints[-1], markersize=12, zorder=99, marker="^", ls="-", c=robot_marker_color, mec="black")
667
+ ax.plot(xPoints[0], yPoints[0], 'co', c=robot_marker_color, markersize=8, zorder=5)
668
+
669
+ # Sensor range
670
+ rng_x = 0.5 * (self.ground_truth.shape[1] / NUM_COORDS_WIDTH)
671
+ rng_y = 0.5 * (self.ground_truth.shape[0] / NUM_COORDS_HEIGHT)
672
+ max_x = min(xPoints[-1] + int(math.ceil(rng_x)), self.ground_truth.shape[1])
673
+ min_x = max(xPoints[-1] - int(math.ceil(rng_x)), 0)
674
+ max_y = min(yPoints[-1] + int(math.ceil(rng_y)), self.ground_truth.shape[0])
675
+ min_y = max(yPoints[-1] - int(math.ceil(rng_y)), 0)
676
+ ax.plot((min_x, min_x), (min_y, max_y), c=robot_marker_color, linewidth=1)
677
+ ax.plot((min_x, max_x), (max_y, max_y), c=robot_marker_color, linewidth=1)
678
+ ax.plot((max_x, max_x), (max_y, min_y), c=robot_marker_color, linewidth=1)
679
+ ax.plot((max_x, min_x), (min_y, min_y), c=robot_marker_color, linewidth=1)
680
+
681
+ # Add a colorbar
682
+ cbar = fig.colorbar(im, ax=ax, fraction=0.046, pad=0.04)
683
+ if TAXABIND_TTA and USE_CLIP_PREDS:
684
+ cbar.set_ticks([0.0, mid_val, max_val])
685
+ cbar.set_label("Probs (Scaled by expectation)")
686
+
687
+
688
+ ### Fig5: Ground Truth Mask ###
689
+ if TARGETS_SET_DIR != "":
690
+ ax = ax2
691
+ im = ax.imshow(self.gt_segmentation_mask.mean(axis=-1), cmap='viridis', vmin=0, vmax=100) # cmap='gray'
692
+ ax.axis((0, self.ground_truth_size[1], self.ground_truth_size[0], 0))
693
+ ax.set_title(f"Ground Truth Mask")
694
+ for i, route in enumerate(robots_route):
695
+ robot_marker_color = color_list[i % len(color_list)]
696
+ xPoints = route[0]
697
+ yPoints = route[1]
698
+ ax.plot(xPoints, yPoints, c=robot_marker_color, linewidth=2)
699
+ # ax.plot(xPoints[-1], yPoints[-1], 'mo', markersize=8, zorder=10)
700
+ ax.plot(xPoints[-1], yPoints[-1], markersize=12, zorder=99, marker="^", ls="-", c=robot_marker_color, mec="black")
701
+ ax.plot(xPoints[0], yPoints[0], 'co', c=robot_marker_color, markersize=8, zorder=5)
702
+
703
+ # Sensor range
704
+ rng_x = 0.5 * (self.ground_truth.shape[1] / NUM_COORDS_WIDTH)
705
+ rng_y = 0.5 * (self.ground_truth.shape[0] / NUM_COORDS_HEIGHT)
706
+ max_x = min(xPoints[-1] + int(math.ceil(rng_x)), self.ground_truth.shape[1])
707
+ min_x = max(xPoints[-1] - int(math.ceil(rng_x)), 0)
708
+ max_y = min(yPoints[-1] + int(math.ceil(rng_y)), self.ground_truth.shape[0])
709
+ min_y = max(yPoints[-1] - int(math.ceil(rng_y)), 0)
710
+ ax.plot((min_x, min_x), (min_y, max_y), c=robot_marker_color, linewidth=1)
711
+ ax.plot((min_x, max_x), (max_y, max_y), c=robot_marker_color, linewidth=1)
712
+ ax.plot((max_x, max_x), (max_y, min_y), c=robot_marker_color, linewidth=1)
713
+ ax.plot((max_x, min_x), (min_y, min_y), c=robot_marker_color, linewidth=1)
714
+
715
+ # Add a colorbar
716
+ cbar = fig.colorbar(im, ax=ax, fraction=0.046, pad=0.04)
717
+ cbar.set_label("Normalized Mask Value")
718
+
719
+ # ax4.pause(0.1)
720
+
721
+
722
+ ### Fig6: Segmentation Mask (GT) ###
723
+ if TAXABIND_TTA:
724
+ ax = ax3
725
+ image = mpimg.imread(img_path_override)
726
+ ax.imshow(image)
727
+ ax.set_title("Ground Image")
728
+ ax.axis("off")
729
+
730
+
731
+ sound_id = sound_id_override if sound_id_override is not None else "-1"
732
+ plt.suptitle('Targets Found: {}/{} Coverage ratio: {:.4g} Travel Dist: {:.4g} Info Gain: {:.4g}% \n ({}) \n (Sound ID: {})'.format(self.num_targets_found, \
733
+ len(self.target_positions), self.explored_rate, travel_dist, (100*self.info_gain/self.total_info), msk_name,
734
+ sound_id))
735
+ plt.tight_layout()
736
+ plt.savefig('{}/{}_{}_samples.png'.format(path, n, step), dpi=100)
737
+ # plt.show()
738
+ frame = '{}/{}_{}_samples.png'.format(path, n, step)
739
+ self.frame_files.append(frame)
740
+ plt.close()
741
+
742
+
743
+ ####################
744
+
745
+ class CustomNorm(Normalize):
746
+ """
747
+ A custom normalization that allocates a larger fraction of the colormap
748
+ to the lower data range [vmin, mid] than to [mid, vmax].
749
+
750
+ Parameters
751
+ ----------
752
+ vmin : float
753
+ Minimum data value
754
+ vmax : float
755
+ Maximum data value
756
+ mid : float
757
+ Midpoint in data where we switch from 'lower' to 'upper' mapping
758
+ lower_portion : float
759
+ Fraction of the colormap to allocate for [vmin, mid].
760
+ For example, 0.8 => 80% of colors for [vmin, mid], 20% for [mid, vmax].
761
+ clip : bool
762
+ Whether to clip data outside [vmin, vmax].
763
+ """
764
+
765
+ def __init__(self, vmin=None, vmax=None, mid=0.05, lower_portion=0.8, clip=False):
766
+ self.mid = mid
767
+ self.lower_portion = lower_portion
768
+ super().__init__(vmin, vmax, clip)
769
+
770
+ def __call__(self, value, clip=None):
771
+ """Forward transform: data -> [0..1] color space."""
772
+ vmin, vmax, mid = self.vmin, self.vmax, self.mid
773
+ lp = self.lower_portion
774
+
775
+ value = np.asarray(value, dtype=np.float64)
776
+
777
+ # Piecewise linear mapping:
778
+ # [vmin..mid] => [0..lp]
779
+ # [mid..vmax] => [lp..1]
780
+ normed = np.where(
781
+ value <= mid,
782
+ lp * (value - vmin) / (mid - vmin),
783
+ lp + (value - mid) / (vmax - mid) * (1 - lp)
784
+ )
785
+ return np.clip(normed, 0, 1)
786
+
787
+ def inverse(self, value):
788
+ """
789
+ Inverse transform: [0..1] color space -> data space.
790
+ Matplotlib's colorbar calls this to place ticks correctly.
791
+ """
792
+ vmin, vmax, mid = self.vmin, self.vmax, self.mid
793
+ lp = self.lower_portion
794
+
795
+ value = np.asarray(value, dtype=np.float64)
796
+
797
+ # For color space [0..lp], invert to [vmin..mid]
798
+ # For color space [lp..1], invert to [mid..vmax]
799
+ below = (value <= lp)
800
+ above = ~below
801
+
802
+ # Allocate array for results
803
+ data = np.zeros_like(value, dtype=np.float64)
804
+
805
+ # Invert lower segment
806
+ data[below] = vmin + (value[below] / lp) * (mid - vmin)
807
+
808
+ # Invert upper segment
809
+ data[above] = mid + ((value[above] - lp) / (1 - lp)) * (vmax - mid)
810
+
811
+ return data
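A minimal usage sketch for CustomNorm above (not part of the commit; the data, vmin/vmax/mid values, and colormap below are placeholders chosen for illustration):

import numpy as np
import matplotlib.pyplot as plt

data = np.random.rand(24, 24) * 0.2                      # mostly small probabilities
norm = CustomNorm(vmin=0.0, vmax=0.2, mid=0.05, lower_portion=0.8)

fig, ax = plt.subplots()
im = ax.imshow(data, cmap='viridis', norm=norm)          # 80% of the colormap covers [0, 0.05]
cbar = fig.colorbar(im, ax=ax, fraction=0.046, pad=0.04)
cbar.set_ticks([0.0, 0.05, 0.2])                         # inverse() places these ticks correctly
plt.show()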
graph.py ADDED
@@ -0,0 +1,179 @@
1
+ #######################################################################
2
+ # Name: graph.py
3
+ #
4
+ # - Adapted from https://gist.github.com/betandr/541a1f6466b6855471de5ca30b74cb31
5
+ # - Simple graph class to perform distance calculations (e.g. A-Star, Dijkstra)
6
+ #######################################################################
7
+
8
+ from decimal import Decimal
9
+
10
+
11
+ class Edge:
12
+ def __init__(self, to_node, length):
13
+ self.to_node = to_node
14
+ self.length = length
15
+
16
+
17
+ class Graph:
18
+ def __init__(self):
19
+ self.nodes = set()
20
+ self.edges = dict()
21
+
22
+ def add_node(self, node):
23
+ self.nodes.add(node)
24
+
25
+ def add_edge(self, from_node, to_node, length):
26
+ edge = Edge(to_node, length)
27
+ # edge = to_node
28
+ if from_node in self.edges:
29
+ from_node_edges = self.edges[from_node]
30
+ else:
31
+ self.edges[from_node] = dict()
32
+ from_node_edges = self.edges[from_node]
33
+ # if edge not in from_node_edges:
34
+ # from_node_edges.append(edge)
35
+ from_node_edges[to_node] = edge
36
+
37
+ def clear_edge(self, from_node):
38
+ if from_node in self.edges:
39
+ self.edges[from_node] = dict()
40
+ # else:
41
+ # print("no edge node or no this node")
42
+
43
+ def min_dist(q, dist):
44
+ """
45
+ Returns the node with the smallest distance in q.
46
+ Implemented to keep the main algorithm clean.
47
+ """
48
+ min_node = None
49
+ for node in q:
50
+ if min_node is None:
51
+ min_node = node
52
+ elif dist[node] < dist[min_node]:
53
+ min_node = node
54
+
55
+ return min_node
56
+
57
+
58
+ INFINITY = float('Infinity')
59
+
60
+
61
+ def dijkstra(graph, source):
62
+ q = set()
63
+ dist = {}
64
+ prev = {}
65
+
66
+ for v in graph.nodes: # initialization
67
+ dist[v] = INFINITY # unknown distance from source to v
68
+ prev[v] = INFINITY # previous node in optimal path from source
69
+ q.add(v) # all nodes initially in q (unvisited nodes)
70
+
71
+ # distance from source to source
72
+ dist[source] = 0
73
+
74
+ while q:
75
+ # node with the least distance selected first
76
+ u = min_dist(q, dist)
77
+
78
+ q.remove(u)
79
+
80
+ try:
81
+ if u in graph.edges:
82
+ for _, v in graph.edges[u].items():
83
+ alt = dist[u] + v.length
84
+ if alt < dist[v.to_node]:
85
+ # a shorter path to v has been found
86
+ dist[v.to_node] = alt
87
+ prev[v.to_node] = u
88
+ except:
89
+ pass
90
+
91
+ return dist, prev
92
+
93
+
94
+ def to_array(prev, from_node):
95
+ """Creates an ordered list of labels as a route."""
96
+ previous_node = prev[from_node]
97
+ route = [from_node]
98
+
99
+ while previous_node != INFINITY:
100
+ route.append(previous_node)
101
+ temp = previous_node
102
+ previous_node = prev[temp]
103
+
104
+ route.reverse()
105
+ return route
106
+
107
+
108
+ def h(index, destination, node_coords):
109
+ current = node_coords[index]
110
+ end = node_coords[destination]
111
+ h = abs(end[0] - current[0]) + abs(end[1] - current[1])
112
+ # h = ((end[0]-current[0])**2 + (end[1] - current[1])**2)**(1/2)
113
+ return h
114
+
115
+
116
+ def a_star(start, destination, node_coords, graph):
117
+ if start == destination:
118
+ return [], 0
119
+ if str(destination) in graph.edges[str(start)].keys():
120
+ cost = graph.edges[str(start)][str(destination)].length
121
+ return [start, destination], cost
122
+ open_list = {start}
123
+ closed_list = set([])
124
+
125
+ g = {start: 0}
126
+ parents = {start: start}
127
+
128
+ while len(open_list) > 0:
129
+ n = None
130
+ h_n = 1e5
131
+ # print('open list', open_list)
132
+ for v in open_list:
133
+ h_v = h(v, destination, node_coords)
134
+ if n is not None:
135
+ h_n = h(n, destination, node_coords)
136
+ if n is None or g[v] + h_v < g[n] + h_n:
137
+ n = v
138
+
139
+ if n is None:
140
+ print('Path does not exist!')
141
+ return None, 1e5
142
+
143
+ if n == destination:
144
+ reconst_path = []
145
+ while parents[n] != n:
146
+ reconst_path.append(n)
147
+ n = parents[n]
148
+ reconst_path.append(start)
149
+ reconst_path.reverse()
150
+ # print('Path found: {}'.format(reconst_path))
151
+ # print(g[destination])
152
+ return reconst_path, g[destination]
153
+
154
+ for edge in graph.edges[str(n)].values():
155
+ m = int(edge.to_node)
156
+ cost = edge.length
157
+ # print(m, cost)
158
+ if m not in open_list and m not in closed_list:
159
+ open_list.add(m)
160
+ parents[m] = n
161
+ g[m] = g[n] + cost
162
+
163
+ else:
164
+ if g[m] > g[n] + cost:
165
+ g[m] = g[n] + cost
166
+ parents[m] = n
167
+
168
+ if m in closed_list:
169
+ closed_list.remove(m)
170
+ open_list.add(m)
171
+
172
+ open_list.remove(n)
173
+ closed_list.add(n)
174
+
175
+ print('Path does not exist!')
176
+ return None, 1e5
177
+
178
+
179
+
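Hypothetical usage sketch for the Graph / dijkstra / a_star utilities above (node ids, coordinates, and edge lengths are made up for illustration; in this repo graph_generator.py builds these structures from the robot belief):

import numpy as np

node_coords = np.array([[0, 0], [1, 0], [1, 1], [2, 1]])
g = Graph()
for i in range(len(node_coords)):
    g.add_node(str(i))
for a, b in [(0, 1), (1, 2), (2, 3), (0, 2)]:
    d = float(np.linalg.norm(node_coords[a] - node_coords[b]))
    g.add_edge(str(a), str(b), d)        # edges are keyed by string node ids
    g.add_edge(str(b), str(a), d)

dist, prev = dijkstra(g, '0')            # shortest distances from node '0'
route, cost = a_star(0, 3, node_coords, g)
print(route, cost)                       # expected: [0, 2, 3] with cost ~2.41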
graph_generator.py ADDED
@@ -0,0 +1,330 @@
1
+ #######################################################################
2
+ # Name: graph_generator.py
3
+ #
4
+ # - Wrapper for graph.py
5
+ # - Sends the formatted inputs into graph.py to get useful info
6
+ #######################################################################
7
+
8
+ import sys
9
+ if sys.modules['TRAINING']:
10
+ from parameter import *
11
+ else:
12
+ from test_parameter import *
13
+
14
+ import numpy as np
15
+ from sklearn.neighbors import NearestNeighbors
16
+ import shapely.geometry
17
+
18
+ from node import Node
19
+ from graph import Graph, a_star
20
+
21
+
22
+ class Graph_generator:
23
+ def __init__(self, map_size, k_size, sensor_range, plot=False):
24
+ self.k_size = k_size
25
+ self.graph = Graph()
26
+ self.node_coords = None
27
+ self.plot = plot
28
+ self.x = []
29
+ self.y = []
30
+ self.map_x = map_size[1]
31
+ self.map_y = map_size[0]
32
+ self.uniform_points, self.grid_coords = self.generate_uniform_points()
33
+
34
+ self.sensor_range = sensor_range
35
+
36
+ self.route_node = []
37
+ self.nodes_list = []
38
+ self.node_utility = None
39
+ self.guidepost = None
40
+
41
+ def edge_clear_all_nodes(self):
42
+ self.graph = Graph()
43
+ self.x = []
44
+ self.y = []
45
+
46
+ def edge_clear(self, coords):
47
+ node_index = str(self.find_index_from_coords(self.node_coords, coords))
48
+ self.graph.clear_edge(node_index)
49
+
50
+ def generate_graph(self, robot_belief, frontiers):
51
+ self.edge_clear_all_nodes()
52
+ free_area = self.free_area(robot_belief)
53
+
54
+ free_area_to_check = free_area[:, 0] + free_area[:, 1] * 1j
55
+ uniform_points_to_check = self.uniform_points[:, 0] + self.uniform_points[:, 1] * 1j
56
+ _, _, candidate_indices = np.intersect1d(free_area_to_check, uniform_points_to_check, return_indices=True)
57
+ node_coords = self.uniform_points[candidate_indices]
58
+
59
+ # node_coords = np.concatenate((robot_location.reshape(1, 2), node_coords))
60
+ self.node_coords = self.unique_coords(node_coords).reshape(-1, 2)
61
+ # self.find_k_neighbor_all_nodes(self.node_coords, robot_belief)
62
+ self.find_nearest_neighbor_all_nodes(self.node_coords, robot_belief)
63
+
64
+ self.node_utility = []
65
+ for coords in self.node_coords:
66
+ node = Node(coords, frontiers, robot_belief)
67
+ self.nodes_list.append(node)
68
+ utility = node.utility
69
+ self.node_utility.append(utility)
70
+ self.node_utility = np.array(self.node_utility)
71
+
72
+ self.guidepost = np.zeros((self.node_coords.shape[0], 1))
73
+ x = self.node_coords[:,0] + self.node_coords[:,1]*1j
74
+ for node in self.route_node:
75
+ index = np.argwhere(x.reshape(-1) == node[0]+node[1]*1j)[0]
76
+ self.guidepost[index] = 1
77
+
78
+ return self.node_coords, self.graph.edges, self.node_utility, self.guidepost
79
+
80
+ def update_graph(self, robot_belief, old_robot_belief, frontiers, old_frontiers):
81
+ new_free_area = self.free_area((robot_belief - old_robot_belief > 0) * 255)
82
+ free_area_to_check = new_free_area[:, 0] + new_free_area[:, 1] * 1j
83
+ uniform_points_to_check = self.uniform_points[:, 0] + self.uniform_points[:, 1] * 1j
84
+ _, _, candidate_indices = np.intersect1d(free_area_to_check, uniform_points_to_check, return_indices=True)
85
+ new_node_coords = self.uniform_points[candidate_indices]
86
+ self.node_coords = np.concatenate((self.node_coords, new_node_coords))
87
+
88
+ old_node_to_update = []
89
+ for coords in new_node_coords:
90
+ neighbor_indices = self.find_k_neighbor(coords, self.node_coords, robot_belief)
91
+ old_node_to_update += neighbor_indices
92
+ old_node_to_update = set(old_node_to_update)
93
+ for index in old_node_to_update:
94
+ coords = self.node_coords[index]
95
+ self.edge_clear(coords)
96
+ self.find_k_neighbor(coords, self.node_coords, robot_belief)
97
+
98
+ #self.edge_clear_all_nodes()
99
+ #self.find_k_neighbor_all_nodes(self.node_coords, robot_belief)
100
+
101
+ old_frontiers_to_check = old_frontiers[:, 0] + old_frontiers[:, 1] * 1j
102
+ new_frontiers_to_check = frontiers[:, 0] + frontiers[:, 1] * 1j
103
+ observed_frontiers_index = np.where(
104
+ np.isin(old_frontiers_to_check, new_frontiers_to_check, assume_unique=True) == False)
105
+ new_frontiers_index = np.where(
106
+ np.isin(new_frontiers_to_check, old_frontiers_to_check, assume_unique=True) == False)
107
+ observed_frontiers = old_frontiers[observed_frontiers_index]
108
+ new_frontiers = frontiers[new_frontiers_index]
109
+ for node in self.nodes_list:
110
+ if node.zero_utility_node is True:
111
+ pass
112
+ else:
113
+ node.update_observable_frontiers(observed_frontiers, new_frontiers, robot_belief)
114
+
115
+ for new_coords in new_node_coords:
116
+ node = Node(new_coords, frontiers, robot_belief)
117
+ self.nodes_list.append(node)
118
+
119
+ self.node_utility = []
120
+ for i, coords in enumerate(self.node_coords):
121
+ utility = self.nodes_list[i].utility
122
+ self.node_utility.append(utility)
123
+ self.node_utility = np.array(self.node_utility)
124
+
125
+ self.guidepost = np.zeros((self.node_coords.shape[0], 1))
126
+ x = self.node_coords[:, 0] + self.node_coords[:, 1] * 1j
127
+ for node in self.route_node:
128
+ index = np.argwhere(x.reshape(-1) == node[0] + node[1] * 1j)
129
+ self.guidepost[index] = 1
130
+
131
+ return self.node_coords, self.graph.edges, self.node_utility, self.guidepost
132
+
133
+ def generate_uniform_points(self):
134
+ # x = np.linspace(0, self.map_x - 1, NUM_COORDS_WIDTH).round().astype(int)
135
+ # y = np.linspace(0, self.map_y - 1, NUM_COORDS_HEIGHT).round().astype(int)
136
+ padding_x = 0.5 * (self.map_x / NUM_COORDS_WIDTH)
137
+ padding_y = 0.5 * (self.map_y / NUM_COORDS_HEIGHT)
138
+ x = np.linspace(padding_x, self.map_x - padding_x - 1, NUM_COORDS_WIDTH).round().astype(int)
139
+ y = np.linspace(padding_y, self.map_y - padding_y - 1, NUM_COORDS_HEIGHT).round().astype(int)
140
+
141
+ t1, t2 = np.meshgrid(x, y)
142
+ points = np.vstack([t1.T.ravel(), t2.T.ravel()]).T
143
+ matrix = np.stack((t1, t2), axis=-1)
144
+ return points, matrix
145
+
146
+ def free_area(self, robot_belief):
147
+ index = np.where(robot_belief == 255)
148
+ free = np.asarray([index[1], index[0]]).T
149
+ return free
150
+
151
+ def unique_coords(self, coords):
152
+ x = coords[:, 0] + coords[:, 1] * 1j
153
+ indices = np.unique(x, return_index=True)[1]
154
+ coords = np.array([coords[idx] for idx in sorted(indices)])
155
+ return coords
156
+
157
+ def find_k_neighbor(self, coords, node_coords, robot_belief):
158
+ dist_list = np.linalg.norm((coords-node_coords), axis=-1)
159
+ sorted_index = np.argsort(dist_list)
160
+ k = 0
161
+ neighbor_index_list = []
162
+ while k < self.k_size and k < node_coords.shape[0]:
163
+ neighbor_index = sorted_index[k]
164
+ neighbor_index_list.append(neighbor_index)
165
+ dist = dist_list[neighbor_index]  # dist_list is indexed by node id, not by rank
166
+ start = coords
167
+ end = node_coords[neighbor_index]
168
+ if not self.check_collision(start, end, robot_belief):
169
+ a = str(self.find_index_from_coords(node_coords, start))
170
+ b = str(neighbor_index)
171
+ self.graph.add_node(a)
172
+ self.graph.add_edge(a, b, dist)
173
+
174
+ if self.plot:
175
+ self.x.append([start[0], end[0]])
176
+ self.y.append([start[1], end[1]])
177
+ k += 1
178
+ return neighbor_index_list
179
+
180
+ def find_k_neighbor_all_nodes(self, node_coords, robot_belief):
181
+ X = node_coords
182
+ if len(node_coords) >= self.k_size:
183
+ knn = NearestNeighbors(n_neighbors=self.k_size)
184
+ else:
185
+ knn = NearestNeighbors(n_neighbors=len(node_coords))
186
+ knn.fit(X)
187
+ distances, indices = knn.kneighbors(X)
188
+
189
+ for i, p in enumerate(X):
190
+ for j, neighbour in enumerate(X[indices[i][:]]):
191
+ start = p
192
+ end = neighbour
193
+ if not self.check_collision(start, end, robot_belief):
194
+ a = str(self.find_index_from_coords(node_coords, p))
195
+ b = str(self.find_index_from_coords(node_coords, neighbour))
196
+ self.graph.add_node(a)
197
+ self.graph.add_edge(a, b, distances[i, j])
198
+
199
+ if self.plot:
200
+ self.x.append([p[0], neighbour[0]])
201
+ self.y.append([p[1], neighbour[1]])
202
+
203
+ def find_nearest_neighbor_all_nodes(self, node_coords, robot_belief):
204
+ for i, p in enumerate(node_coords):
205
+ filtered_coords = self.get_neighbors_grid_coords(p)
206
+
207
+ for j, neighbour in enumerate(filtered_coords):
208
+ start = p
209
+ end = neighbour
210
+ if not self.check_collision(start, end, robot_belief):
211
+ a = str(self.find_index_from_coords(node_coords, p))
212
+ b = str(self.find_index_from_coords(node_coords, neighbour))
213
+ self.graph.add_node(a)
214
+ self.graph.add_edge(a, b, np.linalg.norm(start-end))
215
+
216
+ if self.plot:
217
+ self.x.append([p[0], neighbour[0]])
218
+ self.y.append([p[1], neighbour[1]])
219
+
220
+
221
+
222
+ def find_index_from_coords(self, node_coords, p):
223
+ return np.where(np.linalg.norm(node_coords - p, axis=1) < 1e-5)[0][0]
224
+
225
+ def find_closest_index_from_coords(self, node_coords, p):
226
+ return np.argmin(np.linalg.norm(node_coords - p, axis=1))
227
+
228
+ def find_index_from_grid_coords_2d(self, p):
229
+
230
+ # Calculate the distance between the target coord and each point in grid_coords
231
+ diffs = np.linalg.norm(self.grid_coords - p, axis=2) # Compute along the last axis (x, y)
232
+ indices = np.where(diffs < 1e-5)
233
+
234
+ # Return the 2D index as a tuple (row, col)
235
+ if indices[0].size > 0:
236
+ return indices[0][0], indices[1][0]
237
+ else:
238
+ raise ValueError(f"Coordinate {p} not found in self.grid_coords.")
239
+
240
+ def find_closest_index_from_grid_coords_2d(self, p):
241
+
242
+ # Calculate the distance between the target coord and each point in grid_coords
243
+ distances = np.linalg.norm(self.grid_coords - p, axis=2) # Compute along the last axis (x, y)
244
+ flat_index = np.argmin(distances)
245
+ return np.unravel_index(flat_index, distances.shape)
246
+
247
+
248
+ def check_collision(self, start, end, robot_belief):
249
+ collision = False
250
+ line = shapely.geometry.LineString([start, end])
251
+
252
+ sortx = np.sort([start[0], end[0]])
253
+ sorty = np.sort([start[1], end[1]])
254
+
255
+ # print(robot_belief.shape)
256
+ robot_belief = robot_belief[sorty[0]:sorty[1]+1, sortx[0]:sortx[1]+1]
257
+
258
+ occupied_area_index = np.where(robot_belief == 1)
259
+ occupied_area_coords = np.asarray([occupied_area_index[1]+sortx[0], occupied_area_index[0]+sorty[0]]).T
260
+ unexplored_area_index = np.where(robot_belief == 127)
261
+ unexplored_area_coords = np.asarray([unexplored_area_index[1]+sortx[0], unexplored_area_index[0]+sorty[0]]).T
262
+ unfree_area_coords = np.concatenate((occupied_area_coords, unexplored_area_coords))
263
+
264
+ # obstacles = []
265
+ for i in range(unfree_area_coords.shape[0]):
266
+ coords = ([(unfree_area_coords[i][0], unfree_area_coords[i][1]),
267
+ (unfree_area_coords[i][0] + 1, unfree_area_coords[i][1]),
268
+ (unfree_area_coords[i][0], unfree_area_coords[i][1] + 1),
269
+ (unfree_area_coords[i][0] + 1, unfree_area_coords[i][1] + 1)])
270
+ obstacle = shapely.geometry.Polygon(coords)
271
+ #obstacles.append(obstacle)
272
+ # if obstacles != []:
273
+ #all_obstacles = shapely.geometry.MultiPolygon(obstacles)
274
+ # print(obstacle.is_valid)
275
+ collision = line.intersects(obstacle)
276
+ if collision:
277
+ break
278
+
279
+ return collision
280
+
281
+ def find_shortest_path(self, current, destination, node_coords):
282
+ start_node = str(self.find_index_from_coords(node_coords, current))
283
+ end_node = str(self.find_index_from_coords(node_coords, destination))
284
+ route, dist = a_star(int(start_node), int(end_node), self.node_coords, self.graph)
285
+ if start_node != end_node:
286
+ assert route != []
287
+ # t3 = time.time()
288
+ # print(t2-t1, t3-t2)
289
+ route = list(map(str, route))
290
+ return dist, route
291
+
292
+
293
+ # def get_neighbors_grid_coords(self, coord):
294
+ # # Return the 4 closest neighbors (N, S, E, W) of a given coordinate
295
+
296
+ # nearest_coord = self.node_coords[self.find_closest_index_from_coords(self.node_coords, coord)]
297
+ # rows, cols = self.grid_coords.shape[:2]
298
+ # neighbors = []
299
+ # i, j = self.find_index_from_grid_coords_2d(nearest_coord)
300
+
301
+ # # Define NSEW neighbor offsets
302
+ # directions = [(-1, 0), (1, 0), (0, -1), (0, 1)] # N, S, W, E
303
+
304
+ # for di, dj in directions:
305
+ # ni, nj = i + di, j + dj
306
+ # if 0 <= ni < rows and 0 <= nj < cols:
307
+ # neighbors.append(tuple(self.grid_coords[ni, nj]))
308
+
309
+ # return neighbors
310
+
311
+
312
+ def get_neighbors_grid_coords(self, coord):
313
+ # Return the 8 closest neighbors of a given coordinate
314
+
315
+ nearest_coord = self.node_coords[self.find_closest_index_from_coords(self.node_coords, coord)]
316
+ rows, cols = self.grid_coords.shape[:2]
317
+ neighbors = []
318
+ i, j = self.find_index_from_grid_coords_2d(nearest_coord)
319
+
320
+ # Create a range of indices for rows and columns
321
+ row_range = np.clip([i - 1, i, i + 1], 0, rows - 1)
322
+ col_range = np.clip([j - 1, j, j + 1], 0, cols - 1)
323
+
324
+ # Iterate over the valid indices
325
+ for ni in row_range:
326
+ for nj in col_range:
327
+ if (ni, nj) != (i, j): # Skip the center point
328
+ neighbors.append(tuple(self.grid_coords[ni, nj]))
329
+
330
+ return neighbors
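Standalone sketch of two building blocks used above: the padded uniform candidate grid and the complex-number trick for matching grid points against free cells (the map size and the 24x24 grid resolution are assumptions standing in for the values in parameter.py):

import numpy as np

map_h, map_w = 240, 240
num_w, num_h = 24, 24
pad_x, pad_y = 0.5 * map_w / num_w, 0.5 * map_h / num_h
x = np.linspace(pad_x, map_w - pad_x - 1, num_w).round().astype(int)
y = np.linspace(pad_y, map_h - pad_y - 1, num_h).round().astype(int)
t1, t2 = np.meshgrid(x, y)
uniform_points = np.vstack([t1.T.ravel(), t2.T.ravel()]).T

robot_belief = np.full((map_h, map_w), 127)      # 127 = unexplored
robot_belief[:120, :] = 255                      # 255 = known free space
free_idx = np.where(robot_belief == 255)
free_area = np.asarray([free_idx[1], free_idx[0]]).T

free_c = free_area[:, 0] + free_area[:, 1] * 1j
points_c = uniform_points[:, 0] + uniform_points[:, 1] * 1j
_, _, candidate_indices = np.intersect1d(free_c, points_c, return_indices=True)
node_coords = uniform_points[candidate_indices]  # grid nodes that lie in free space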
inference/model/STAGE1_vlm_search_24x24_040425_no_tgt_rewards_iNAT_DS_16k.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:44e642df9aaa2847ba44dd4707985c67ef712f5264272ef7993aeb7805c80f5a
3
+ size 52167246
model.py ADDED
@@ -0,0 +1,319 @@
1
+ #######################################################################
2
+ # Name: model.py
3
+ #
4
+ # - Attention-based encoders & decoders
5
+ # - Policy Net: Input = Augmented Graph, Output = Node to go to
6
+ # - Critic Net: Input = Augmented Graph + Action, Output = Q_Value
7
+ #######################################################################
8
+
9
+ import torch
10
+ import torch.nn as nn
11
+ import math
12
+
13
+
14
+ class SingleHeadAttention(nn.Module):
15
+ def __init__(self, embedding_dim):
16
+ super(SingleHeadAttention, self).__init__()
17
+ self.input_dim = embedding_dim
18
+ self.embedding_dim = embedding_dim
19
+ self.value_dim = embedding_dim
20
+ self.key_dim = self.value_dim
21
+ self.tanh_clipping = 10
22
+ self.norm_factor = 1 / math.sqrt(self.key_dim)
23
+
24
+ self.w_query = nn.Parameter(torch.Tensor(self.input_dim, self.key_dim))
25
+ self.w_key = nn.Parameter(torch.Tensor(self.input_dim, self.key_dim))
26
+
27
+ self.init_parameters()
28
+
29
+ def init_parameters(self):
30
+ for param in self.parameters():
31
+ stdv = 1. / math.sqrt(param.size(-1))
32
+ param.data.uniform_(-stdv, stdv)
33
+
34
+ def forward(self, q, k, mask=None):
35
+
36
+ n_batch, n_key, n_dim = k.size()
37
+ n_query = q.size(1)
38
+
39
+ k_flat = k.reshape(-1, n_dim)
40
+ q_flat = q.reshape(-1, n_dim)
41
+
42
+ shape_k = (n_batch, n_key, -1)
43
+ shape_q = (n_batch, n_query, -1)
44
+
45
+ Q = torch.matmul(q_flat, self.w_query).view(shape_q)
46
+ K = torch.matmul(k_flat, self.w_key).view(shape_k)
47
+
48
+ U = self.norm_factor * torch.matmul(Q, K.transpose(1, 2))
49
+ U = self.tanh_clipping * torch.tanh(U)
50
+
51
+ if mask is not None:
52
+ U = U.masked_fill(mask == 1, -1e8)
53
+ attention = torch.log_softmax(U, dim=-1) # n_batch*n_query*n_key
54
+
55
+ return attention
56
+
57
+
58
+ class MultiHeadAttention(nn.Module):
59
+ def __init__(self, embedding_dim, n_heads=8):
60
+ super(MultiHeadAttention, self).__init__()
61
+ self.n_heads = n_heads
62
+ self.input_dim = embedding_dim
63
+ self.embedding_dim = embedding_dim
64
+ self.value_dim = self.embedding_dim // self.n_heads
65
+ self.key_dim = self.value_dim
66
+ self.norm_factor = 1 / math.sqrt(self.key_dim)
67
+
68
+ self.w_query = nn.Parameter(torch.Tensor(self.n_heads, self.input_dim, self.key_dim))
69
+ self.w_key = nn.Parameter(torch.Tensor(self.n_heads, self.input_dim, self.key_dim))
70
+ self.w_value = nn.Parameter(torch.Tensor(self.n_heads, self.input_dim, self.value_dim))
71
+ self.w_out = nn.Parameter(torch.Tensor(self.n_heads, self.value_dim, self.embedding_dim))
72
+
73
+ self.init_parameters()
74
+
75
+ def init_parameters(self):
76
+ for param in self.parameters():
77
+ stdv = 1. / math.sqrt(param.size(-1))
78
+ param.data.uniform_(-stdv, stdv)
79
+
80
+ def forward(self, q, k=None, v=None, key_padding_mask=None, attn_mask=None):
81
+ if k is None:
82
+ k = q
83
+ if v is None:
84
+ v = q
85
+
86
+ n_batch, n_key, n_dim = k.size()
87
+ n_query = q.size(1)
88
+ n_value = v.size(1)
89
+
90
+ k_flat = k.contiguous().view(-1, n_dim)
91
+ v_flat = v.contiguous().view(-1, n_dim)
92
+ q_flat = q.contiguous().view(-1, n_dim)
93
+ shape_v = (self.n_heads, n_batch, n_value, -1)
94
+ shape_k = (self.n_heads, n_batch, n_key, -1)
95
+ shape_q = (self.n_heads, n_batch, n_query, -1)
96
+
97
+ Q = torch.matmul(q_flat, self.w_query).view(shape_q) # n_heads*batch_size*n_query*key_dim
98
+ K = torch.matmul(k_flat, self.w_key).view(shape_k) # n_heads*batch_size*targets_size*key_dim
99
+ V = torch.matmul(v_flat, self.w_value).view(shape_v) # n_heads*batch_size*targets_size*value_dim
100
+
101
+ U = self.norm_factor * torch.matmul(Q, K.transpose(2, 3)) # n_heads*batch_size*n_query*targets_size
102
+
103
+ if attn_mask is not None:
104
+ attn_mask = attn_mask.view(1, n_batch, n_query, n_key).expand_as(U)
105
+
106
+ if key_padding_mask is not None:
107
+ key_padding_mask = key_padding_mask.repeat(1, n_query, 1)
108
+ key_padding_mask = key_padding_mask.view(1, n_batch, n_query, n_key).expand_as(U) # copy for n_heads times
109
+
110
+ if attn_mask is not None and key_padding_mask is not None:
111
+ mask = (attn_mask + key_padding_mask)
112
+ elif attn_mask is not None:
113
+ mask = attn_mask
114
+ elif key_padding_mask is not None:
115
+ mask = key_padding_mask
116
+ else:
117
+ mask = None
118
+
119
+ if mask is not None:
120
+ U = U.masked_fill(mask > 0, -1e8)
121
+
122
+ attention = torch.softmax(U, dim=-1) # n_heads*batch_size*n_query*targets_size
123
+
124
+ heads = torch.matmul(attention, V) # n_heads*batch_size*n_query*value_dim
125
+
126
+ # out = heads.permute(1, 2, 0, 3).reshape(n_batch, n_query, n_dim)
127
+ out = torch.mm(
128
+ heads.permute(1, 2, 0, 3).reshape(-1, self.n_heads * self.value_dim),
129
+ # batch_size*n_query*n_heads*value_dim
130
+ self.w_out.view(-1, self.embedding_dim)
131
+ # n_heads*value_dim*embedding_dim
132
+ ).view(-1, n_query, self.embedding_dim)
133
+
134
+
135
+ return out, attention # batch_size*n_query*embedding_dim
136
+
137
+
138
+ class Normalization(nn.Module):
139
+ def __init__(self, embedding_dim):
140
+ super(Normalization, self).__init__()
141
+ self.normalizer = nn.LayerNorm(embedding_dim)
142
+
143
+ def forward(self, input):
144
+ return self.normalizer(input.view(-1, input.size(-1))).view(*input.size())
145
+
146
+
147
+ class EncoderLayer(nn.Module):
148
+ def __init__(self, embedding_dim, n_head):
149
+ super(EncoderLayer, self).__init__()
150
+ self.multiHeadAttention = MultiHeadAttention(embedding_dim, n_head)
151
+ self.normalization1 = Normalization(embedding_dim)
152
+ self.feedForward = nn.Sequential(nn.Linear(embedding_dim, 512), nn.ReLU(inplace=True),
153
+ nn.Linear(512, embedding_dim))
154
+ self.normalization2 = Normalization(embedding_dim)
155
+
156
+ def forward(self, src, key_padding_mask=None, attn_mask=None):
157
+ h0 = src
158
+ h = self.normalization1(src)
159
+ h, _ = self.multiHeadAttention(q=h, key_padding_mask=key_padding_mask, attn_mask=attn_mask)
160
+ h = h + h0
161
+ h1 = h
162
+ h = self.normalization2(h)
163
+ h = self.feedForward(h)
164
+ h2 = h + h1
165
+ return h2
166
+
167
+
168
+ class DecoderLayer(nn.Module):
169
+ def __init__(self, embedding_dim, n_head):
170
+ super(DecoderLayer, self).__init__()
171
+ self.multiHeadAttention = MultiHeadAttention(embedding_dim, n_head)
172
+ self.normalization1 = Normalization(embedding_dim)
173
+ self.feedForward = nn.Sequential(nn.Linear(embedding_dim, 512),
174
+ nn.ReLU(inplace=True),
175
+ nn.Linear(512, embedding_dim))
176
+ self.normalization2 = Normalization(embedding_dim)
177
+
178
+ def forward(self, tgt, memory, key_padding_mask=None, attn_mask=None):
179
+ h0 = tgt
180
+ tgt = self.normalization1(tgt)
181
+ memory = self.normalization1(memory)
182
+ h, w = self.multiHeadAttention(q=tgt, k=memory, v=memory, key_padding_mask=key_padding_mask, attn_mask=attn_mask)
183
+ h = h + h0
184
+ h1 = h
185
+ h = self.normalization2(h)
186
+ h = self.feedForward(h)
187
+ h2 = h + h1
188
+ return h2, w
189
+
190
+
191
+ class Encoder(nn.Module):
192
+ def __init__(self, embedding_dim=128, n_head=8, n_layer=1):
193
+ super(Encoder, self).__init__()
194
+ self.layers = nn.ModuleList(EncoderLayer(embedding_dim, n_head) for i in range(n_layer))
195
+
196
+ def forward(self, src, key_padding_mask=None, attn_mask=None):
197
+ for layer in self.layers:
198
+ src = layer(src, key_padding_mask=key_padding_mask, attn_mask=attn_mask)
199
+ return src
200
+
201
+
202
+ class Decoder(nn.Module):
203
+ def __init__(self, embedding_dim=128, n_head=8, n_layer=1):
204
+ super(Decoder, self).__init__()
205
+ self.layers = nn.ModuleList([DecoderLayer(embedding_dim, n_head) for i in range(n_layer)])
206
+
207
+ def forward(self, tgt, memory, key_padding_mask=None, attn_mask=None):
208
+ for layer in self.layers:
209
+ tgt, w = layer(tgt, memory, key_padding_mask=key_padding_mask, attn_mask=attn_mask)
210
+ return tgt, w
211
+
212
+
213
+ class PolicyNet(nn.Module):
214
+ def __init__(self, input_dim, embedding_dim):
215
+ super(PolicyNet, self).__init__()
216
+ self.initial_embedding = nn.Linear(input_dim, embedding_dim) # layer for non-end position
217
+
218
+ self.current_embedding = nn.Linear(embedding_dim * 2, embedding_dim)
219
+
220
+ self.encoder = Encoder(embedding_dim=embedding_dim, n_head=8, n_layer=6)
221
+ self.decoder = Decoder(embedding_dim=embedding_dim, n_head=8, n_layer=1)
222
+ self.pointer = SingleHeadAttention(embedding_dim)
223
+
224
+ def encode_graph(self, node_inputs, node_padding_mask, edge_mask):
225
+ node_feature = self.initial_embedding(node_inputs)
226
+ enhanced_node_feature = self.encoder(src=node_feature, key_padding_mask=node_padding_mask, attn_mask=edge_mask)
227
+
228
+ return enhanced_node_feature
229
+
230
+ def output_policy(self, enhanced_node_feature, edge_inputs, current_index, edge_padding_mask, node_padding_mask):
231
+ k_size = edge_inputs.size()[2]
232
+ current_edge = torch.gather(edge_inputs, 1, current_index.repeat(1, 1, k_size))
233
+ current_edge = current_edge.permute(0, 2, 1)
234
+ embedding_dim = enhanced_node_feature.size()[2]
235
+
236
+ neigboring_feature = torch.gather(enhanced_node_feature, 1, current_edge.repeat(1, 1, embedding_dim))
237
+
238
+ current_node_feature = torch.gather(enhanced_node_feature, 1, current_index.repeat(1, 1, embedding_dim))
239
+
240
+ if edge_padding_mask is not None:
241
+ current_mask = torch.gather(edge_padding_mask, 1, current_index.repeat(1,1,k_size)).to(enhanced_node_feature.device)
242
+ # print(current_mask)
243
+ else:
244
+ current_mask = None
245
+ current_mask[:,:,0] = 1 # don't stay at current position
246
+
247
+ # ADDED: If nowhere to go, then STAY at current position
248
+ # #assert 0 in current_mask # Will cause sim to crash
249
+ if not 0 in current_mask:
250
+ current_mask[:,:,0] = 0
251
+
252
+ enhanced_current_node_feature, _ = self.decoder(current_node_feature, enhanced_node_feature, node_padding_mask)
253
+ enhanced_current_node_feature = self.current_embedding(torch.cat((enhanced_current_node_feature, current_node_feature), dim=-1))
254
+ logp = self.pointer(enhanced_current_node_feature, neigboring_feature, current_mask)
255
+ logp = logp.squeeze(1) # batch_size*k_size
256
+
257
+ return logp
258
+
259
+ def forward(self, node_inputs, edge_inputs, current_index, node_padding_mask=None, edge_padding_mask=None, edge_mask=None):
260
+ enhanced_node_feature = self.encode_graph(node_inputs, node_padding_mask, edge_mask)
261
+ logp = self.output_policy(enhanced_node_feature, edge_inputs, current_index, edge_padding_mask, node_padding_mask)
262
+ return logp
263
+
264
+
265
+ class QNet(nn.Module):
266
+ def __init__(self, input_dim, embedding_dim):
267
+ super(QNet, self).__init__()
268
+ self.initial_embedding = nn.Linear(input_dim, embedding_dim) # layer for non-end position
269
+ self.action_embedding = nn.Linear(embedding_dim*3, embedding_dim)
270
+
271
+ self.encoder = Encoder(embedding_dim=embedding_dim, n_head=8, n_layer=6)
272
+ self.decoder = Decoder(embedding_dim=embedding_dim, n_head=8, n_layer=1)
273
+
274
+ self.q_values_layer = nn.Linear(embedding_dim, 1)
275
+
276
+ def encode_graph(self, node_inputs, node_padding_mask, edge_mask):
277
+ embedding_feature = self.initial_embedding(node_inputs)
278
+ embedding_feature = self.encoder(src=embedding_feature, key_padding_mask=node_padding_mask, attn_mask=edge_mask)
279
+
280
+ return embedding_feature
281
+
282
+ def output_q_values(self, enhanced_node_feature, edge_inputs, current_index, edge_padding_mask, node_padding_mask):
283
+ k_size = edge_inputs.size()[2]
284
+ current_edge = torch.gather(edge_inputs, 1, current_index.repeat(1, 1, k_size))
285
+ current_edge = current_edge.permute(0, 2, 1)
286
+ embedding_dim = enhanced_node_feature.size()[2]
287
+
288
+ neigboring_feature = torch.gather(enhanced_node_feature, 1, current_edge.repeat(1, 1, embedding_dim))
289
+
290
+ current_node_feature = torch.gather(enhanced_node_feature, 1, current_index.repeat(1, 1, embedding_dim))
291
+
292
+ enhanced_current_node_feature, attention_weights = self.decoder(current_node_feature, enhanced_node_feature, node_padding_mask)
293
+ action_features = torch.cat((enhanced_current_node_feature.repeat(1, k_size, 1), current_node_feature.repeat(1, k_size, 1), neigboring_feature), dim=-1)
294
+ action_features = self.action_embedding(action_features)
295
+ q_values = self.q_values_layer(action_features)
296
+
297
+ if edge_padding_mask is not None:
298
+ current_mask = torch.gather(edge_padding_mask, 1, current_index.repeat(1, 1, k_size)).to(
299
+ enhanced_node_feature.device)
300
+ else:
301
+ current_mask = None
302
+ current_mask[:, :, 0] = 1 # don't stay at current position
303
+
304
+ # assert 0 in current_mask # Will cause sim to crash
305
+ if not 0 in current_mask:
306
+ current_mask[:,:,0] = 0
307
+
308
+ current_mask = current_mask.permute(0, 2, 1)
309
+ zero = torch.zeros_like(q_values).to(q_values.device)
310
+ q_values = torch.where(current_mask == 1, zero, q_values)
311
+
312
+ return q_values, attention_weights
313
+
314
+ def forward(self, node_inputs, edge_inputs, current_index, node_padding_mask=None, edge_padding_mask=None,
315
+ edge_mask=None):
316
+ enhanced_node_feature = self.encode_graph(node_inputs, node_padding_mask, edge_mask)
317
+ q_values, attention_weights = self.output_q_values(enhanced_node_feature, edge_inputs, current_index, edge_padding_mask, node_padding_mask)
318
+ return q_values, attention_weights
319
+
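Shape-check sketch for PolicyNet above (batch size, node count, k_size, and feature dimension are arbitrary assumptions; in this repo they come from parameter.py and the graph generator):

import torch

num_nodes, k_size, input_dim, embedding_dim = 36, 8, 4, 128
policy = PolicyNet(input_dim, embedding_dim)

node_inputs = torch.rand(1, num_nodes, input_dim)                        # per-node features
edge_inputs = torch.randint(0, num_nodes, (1, num_nodes, k_size))        # neighbor indices per node
current_index = torch.zeros(1, 1, 1, dtype=torch.long)                   # robot currently at node 0
edge_padding_mask = torch.zeros(1, num_nodes, k_size, dtype=torch.long)  # 1 = padded / invalid neighbor

logp = policy(node_inputs, edge_inputs, current_index,
              node_padding_mask=None, edge_padding_mask=edge_padding_mask, edge_mask=None)
print(logp.shape)  # torch.Size([1, k_size]): log-probabilities over the current node's neighbors

QNet accepts the same inputs and returns per-neighbor Q-values together with the decoder attention weights.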
node.py ADDED
@@ -0,0 +1,102 @@
1
+ #######################################################################
2
+ # Name: node.py
3
+ #
4
+ # - Contains info per node on graph (edge)
5
+ # - Contains: Position, Utility, Visitation History
6
+ #######################################################################
7
+
8
+ import sys
9
+ if sys.modules['TRAINING']:
10
+ from parameter import *
11
+ else:
12
+ from test_parameter import *
13
+
14
+ import numpy as np
15
+ import shapely.geometry
16
+
17
+
18
+ class Node():
19
+ def __init__(self, coords, frontiers, robot_belief):
20
+ self.coords = coords
21
+ self.observable_frontiers = []
22
+ self.sensor_range = SENSOR_RANGE
23
+ self.initialize_observable_frontiers(frontiers, robot_belief)
24
+ self.utility = self.get_node_utility()
25
+ if self.utility == 0:
26
+ self.zero_utility_node = True
27
+ else:
28
+ self.zero_utility_node = False
29
+
30
+ def initialize_observable_frontiers(self, frontiers, robot_belief):
31
+ dist_list = np.linalg.norm(frontiers - self.coords, axis=-1)
32
+ frontiers_in_range = frontiers[dist_list < self.sensor_range - 10]
33
+ for point in frontiers_in_range:
34
+ collision = self.check_collision(self.coords, point, robot_belief)
35
+ if not collision:
36
+ self.observable_frontiers.append(point)
37
+
38
+ def get_node_utility(self):
39
+ return len(self.observable_frontiers)
40
+
41
+ def update_observable_frontiers(self, observed_frontiers, new_frontiers, robot_belief):
42
+ if observed_frontiers != []:
43
+ observed_index = []
44
+ for i, point in enumerate(self.observable_frontiers):
45
+ if point[0] + point[1] * 1j in observed_frontiers[:, 0] + observed_frontiers[:, 1] * 1j:
46
+ observed_index.append(i)
47
+ for index in reversed(observed_index):
48
+ self.observable_frontiers.pop(index)
49
+ #
50
+ if new_frontiers != []:
51
+ dist_list = np.linalg.norm(new_frontiers - self.coords, axis=-1)
52
+ new_frontiers_in_range = new_frontiers[dist_list < self.sensor_range - 15]
53
+ for point in new_frontiers_in_range:
54
+ collision = self.check_collision(self.coords, point, robot_belief)
55
+ if not collision:
56
+ self.observable_frontiers.append(point)
57
+
58
+ self.utility = self.get_node_utility()
59
+ if self.utility == 0:
60
+ self.zero_utility_node = True
61
+ else:
62
+ self.zero_utility_node = False
63
+
64
+ def set_visited(self):
65
+ self.observable_frontiers = []
66
+ self.utility = 0
67
+ self.zero_utility_node = True
68
+
69
+ def check_collision(self, start, end, robot_belief):
70
+ collision = False
71
+ line = shapely.geometry.LineString([start, end])
72
+
73
+ sortx = np.sort([start[0], end[0]])
74
+ sorty = np.sort([start[1], end[1]])
75
+
76
+ # print(robot_belief.shape)
77
+ robot_belief = robot_belief[sorty[0]:sorty[1] + 1, sortx[0]:sortx[1] + 1]
78
+
79
+ occupied_area_index = np.where(robot_belief == 1)
80
+ occupied_area_coords = np.asarray(
81
+ [occupied_area_index[1] + sortx[0], occupied_area_index[0] + sorty[0]]).T
82
+ unexplored_area_index = np.where(robot_belief == 127)
83
+ unexplored_area_coords = np.asarray(
84
+ [unexplored_area_index[1] + sortx[0], unexplored_area_index[0] + sorty[0]]).T
85
+ unfree_area_coords = np.concatenate((occupied_area_coords, unexplored_area_coords))
86
+
87
+ # obstacles = []
88
+ for i in range(unfree_area_coords.shape[0]):
89
+ coords = ([(unfree_area_coords[i][0], unfree_area_coords[i][1]),
90
+ (unfree_area_coords[i][0] + 1, unfree_area_coords[i][1]),
91
+ (unfree_area_coords[i][0], unfree_area_coords[i][1] + 1),
92
+ (unfree_area_coords[i][0] + 1, unfree_area_coords[i][1] + 1)])
93
+ obstacle = shapely.geometry.Polygon(coords)
94
+ # obstacles.append(obstacle)
95
+ # if obstacles != []:
96
+ # all_obstacles = shapely.geometry.MultiPolygon(obstacles)
97
+ # print(obstacle.is_valid)
98
+ collision = line.intersects(obstacle)
99
+ if collision:
100
+ break
101
+
102
+ return collision
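Illustrative sketch of the utility computation above: a node's utility is the number of frontier cells it can observe. Collision checking is omitted here, and SENSOR_RANGE is an assumed value (the real one comes from parameter.py / test_parameter.py):

import numpy as np

SENSOR_RANGE = 80
coords = np.array([50, 50])
frontiers = np.array([[60, 55], [110, 50], [52, 48], [300, 300]])

dist = np.linalg.norm(frontiers - coords, axis=-1)
observable = frontiers[dist < SENSOR_RANGE - 10]   # same margin as initialize_observable_frontiers
utility = len(observable)
print(utility)                                     # 3 of the 4 frontiers are within range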
robot.py ADDED
@@ -0,0 +1,60 @@
1
+ #######################################################################
2
+ # Name: robot.py
3
+ #
4
+ # - Stores S(t), A(t), R(t), S(t+1)
5
+ #######################################################################
6
+
7
+ from copy import deepcopy
8
+ import torch
9
+
10
+ class Robot:
11
+ def __init__(self, robot_id, position, plot=False):
12
+ self.robot_id = robot_id
13
+ self.plot = plot
14
+ self.travel_dist = 0
15
+ self.robot_position = position
16
+ self.observations = None
17
+ self.trajectory_coords = []
18
+ self.targets_found_on_path = []
19
+
20
+ self.episode_buffer = []
21
+ for i in range(15):
22
+ self.episode_buffer.append([])
23
+
24
+ if self.plot:
25
+ # initialize the route
26
+ self.xPoints = [self.robot_position[0]]
27
+ self.yPoints = [self.robot_position[1]]
28
+
29
+ def save_observations(self, observations):
30
+ node_inputs, edge_inputs, current_index, node_padding_mask, edge_padding_mask, edge_mask = observations
31
+ self.episode_buffer[0] += deepcopy(node_inputs).to('cpu')
32
+ self.episode_buffer[1] += deepcopy(edge_inputs).to('cpu')
33
+ self.episode_buffer[2] += deepcopy(current_index).to('cpu')
34
+ self.episode_buffer[3] += deepcopy(node_padding_mask).to('cpu')
35
+ self.episode_buffer[4] += deepcopy(edge_padding_mask).to('cpu')
36
+ self.episode_buffer[5] += deepcopy(edge_mask).to('cpu')
37
+
38
+ def save_action(self, action_index):
39
+ self.episode_buffer[6] += action_index.unsqueeze(0).unsqueeze(0)
40
+
41
+ def save_reward_done(self, reward, done):
42
+ self.episode_buffer[7] += deepcopy(torch.FloatTensor([[[reward]]])).to('cpu')
43
+ self.episode_buffer[8] += deepcopy(torch.tensor([[[(int(done))]]])).to('cpu')
44
+ if self.plot:
45
+ self.xPoints.append(self.robot_position[0])
46
+ self.yPoints.append(self.robot_position[1])
47
+
48
+ def save_next_observations(self, observations):
49
+ node_inputs, edge_inputs, current_index, node_padding_mask, edge_padding_mask, edge_mask = observations
50
+ self.episode_buffer[9] += deepcopy(node_inputs).to('cpu')
51
+ self.episode_buffer[10] += deepcopy(edge_inputs).to('cpu')
52
+ self.episode_buffer[11] += deepcopy(current_index).to('cpu')
53
+ self.episode_buffer[12] += deepcopy(node_padding_mask).to('cpu')
54
+ self.episode_buffer[13] += deepcopy(edge_padding_mask).to('cpu')
55
+ self.episode_buffer[14] += deepcopy(edge_mask).to('cpu')
56
+
57
+ # NEW: ADDED TO SAVE TRAJECTORY COORDS DURING TTA
58
+ def save_trajectory_coords(self, robot_position_coords, num_target_found):
59
+ self.trajectory_coords.append(robot_position_coords)
60
+ self.targets_found_on_path.append(num_target_found)
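For reference, the episode_buffer layout implied by the save_* methods above (slot names are descriptive only):

#   0-5  : S(t)   - node_inputs, edge_inputs, current_index, node_padding_mask, edge_padding_mask, edge_mask
#   6    : A(t)   - action_index
#   7    : R(t)   - reward
#   8    : done flag
#   9-14 : S(t+1) - the same six observation tensors as slots 0-5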
sensor.py ADDED
@@ -0,0 +1,128 @@
1
+ #######################################################################
2
+ # Name: sensor.py
3
+ #
4
+ # - Computes sensor related checks (e.g. collision, utility etc)
5
+ #######################################################################
6
+
7
+ import sys
8
+ if sys.modules['TRAINING']:
9
+ from parameter import *
10
+ else:
11
+ from test_parameter import *
12
+
13
+ import math
14
+ import numpy as np
15
+ import copy
16
+
17
+ def collision_check(x0, y0, x1, y1, ground_truth, robot_belief):
18
+ x0 = x0.round()
19
+ y0 = y0.round()
20
+ x1 = x1.round()
21
+ y1 = y1.round()
22
+ dx, dy = abs(x1 - x0), abs(y1 - y0)
23
+ x, y = x0, y0
24
+ error = dx - dy
25
+ x_inc = 1 if x1 > x0 else -1
26
+ y_inc = 1 if y1 > y0 else -1
27
+ dx *= 2
28
+ dy *= 2
29
+
30
+ collision_flag = 0
31
+ max_collision = 10
32
+
33
+ while 0 <= x < ground_truth.shape[1] and 0 <= y < ground_truth.shape[0]:
34
+ k = ground_truth.item(y, x)
35
+ if k == 1 and collision_flag < max_collision:
36
+ collision_flag += 1
37
+ if collision_flag >= max_collision:
38
+ break
39
+
40
+ if k !=1 and collision_flag > 0:
41
+ break
42
+
43
+ if x == x1 and y == y1:
44
+ break
45
+
46
+ robot_belief.itemset((y, x), k)
47
+
48
+ if error > 0:
49
+ x += x_inc
50
+ error -= dy
51
+ else:
52
+ y += y_inc
53
+ error += dx
54
+
55
+ return robot_belief
56
+
57
+
58
+ def sensor_work(robot_position, sensor_range, robot_belief, ground_truth, sensor_model=SENSOR_MODEL):
59
+ x0 = robot_position[0]
60
+ y0 = robot_position[1]
61
+ rng_x = 0.5 * (ground_truth.shape[1] / NUM_COORDS_WIDTH)
62
+ rng_y = 0.5 * (ground_truth.shape[0] / NUM_COORDS_HEIGHT)
63
+
64
+ if sensor_model == "rectangular": # TODO: add collision check
65
+ max_x = min(x0 + int(math.ceil(rng_x)), ground_truth.shape[1])
66
+ min_x = max(x0 - int(math.ceil(rng_x)), 0)
67
+ max_y = min(y0 + int(math.ceil(rng_y)), ground_truth.shape[0])
68
+ min_y = max(y0 - int(math.ceil(rng_y)), 0)
69
+ robot_belief[min_y:max_y, min_x:max_x] = ground_truth[min_y:max_y, min_x:max_x]
70
+ else:
71
+ sensor_angle_inc = 0.5 / 180 * np.pi
72
+ sensor_angle = 0
73
+ while sensor_angle < 2 * np.pi:
74
+ x1 = x0 + np.cos(sensor_angle) * sensor_range
75
+ y1 = y0 + np.sin(sensor_angle) * sensor_range
76
+ robot_belief = collision_check(x0, y0, x1, y1, ground_truth, robot_belief)
77
+ sensor_angle += sensor_angle_inc
78
+ return robot_belief
79
+
80
+
81
+ def unexplored_area_check(x0, y0, x1, y1, current_belief):
82
+ x0 = x0.round()
83
+ y0 = y0.round()
84
+ x1 = x1.round()
85
+ y1 = y1.round()
86
+ dx, dy = abs(x1 - x0), abs(y1 - y0)
87
+ x, y = x0, y0
88
+ error = dx - dy
89
+ x_inc = 1 if x1 > x0 else -1
90
+ y_inc = 1 if y1 > y0 else -1
91
+ dx *= 2
92
+ dy *= 2
93
+
94
+ while 0 <= x < current_belief.shape[1] and 0 <= y < current_belief.shape[0]:
95
+ k = current_belief.item(y, x)
96
+ if x == x1 and y == y1:
97
+ break
98
+
99
+ if k == 1:
100
+ break
101
+
102
+ if k == 127:
103
+ current_belief.itemset((y, x), 0)
104
+ break
105
+
106
+ if error > 0:
107
+ x += x_inc
108
+ error -= dy
109
+ else:
110
+ y += y_inc
111
+ error += dx
112
+
113
+ return current_belief
114
+
115
+
116
+ def calculate_utility(waypoint_position, sensor_range, robot_belief):
117
+ sensor_angle_inc = 5 / 180 * np.pi
118
+ sensor_angle = 0
119
+ x0 = waypoint_position[0]
120
+ y0 = waypoint_position[1]
121
+ current_belief = copy.deepcopy(robot_belief)
122
+ while sensor_angle < 2 * np.pi:
123
+ x1 = x0 + np.cos(sensor_angle) * sensor_range
124
+ y1 = y0 + np.sin(sensor_angle) * sensor_range
125
+ current_belief = unexplored_area_check(x0, y0, x1, y1, current_belief)
126
+ sensor_angle += sensor_angle_inc
127
+ utility = np.sum(robot_belief == 127) - np.sum(current_belief == 127)
128
+ return utility
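Standalone sketch of the rectangular sensor update performed in sensor_work above (the map size and the 24x24 grid resolution are assumptions standing in for NUM_COORDS_WIDTH / NUM_COORDS_HEIGHT from parameter.py):

import math
import numpy as np

ground_truth = np.random.choice([1, 255], size=(240, 240))  # 1 = occupied, 255 = free
robot_belief = np.full_like(ground_truth, 127)               # 127 = unexplored
x0, y0 = 120, 80
rng_x = rng_y = 0.5 * (240 / 24)                             # half a grid cell, in pixels

max_x = min(x0 + int(math.ceil(rng_x)), ground_truth.shape[1])
min_x = max(x0 - int(math.ceil(rng_x)), 0)
max_y = min(y0 + int(math.ceil(rng_y)), ground_truth.shape[0])
min_y = max(y0 - int(math.ceil(rng_y)), 0)
robot_belief[min_y:max_y, min_x:max_x] = ground_truth[min_y:max_y, min_x:max_x]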
test_multi_robot_worker.py ADDED
@@ -0,0 +1,727 @@
1
+ #######################################################################
2
+ # Name: test_multi_robot_worker.py
3
+ #
4
+ # - Runs robot in environment for N steps
5
+ # - Collects & Returns S(t), A(t), R(t), S(t+1)
6
+ # - NOTE: Applicable for multiple robots
7
+ #######################################################################
8
+
9
+ from pathlib import Path
10
+ from test_parameter import *
11
+
12
+ from time import time
13
+ import imageio
14
+ import csv
15
+ import os
16
+ import copy
17
+ import numpy as np
18
+ import torch
19
+ import matplotlib.pyplot as plt
20
+ import json
21
+ from env import Env
22
+ from model import PolicyNet
23
+ from robot import Robot
24
+ from Taxabind.Taxabind.SatBind.watershed_segmentation import WatershedBinomial
25
+ from Taxabind.Taxabind.SatBind.kmeans_clustering import CombinedSilhouetteInertiaClusterer
26
+ from Taxabind.Taxabind.SatBind.clip_seg_tta import ClipSegTTA
27
+
28
+ np.seterr(invalid='raise', divide='raise')
29
+
30
+
31
+ class TestWorker:
32
+ def __init__(self, meta_agent_id, n_agent, policy_net, global_step, device='cuda', greedy=False, save_image=False, clip_seg_tta=None):
33
+ self.device = device
34
+ self.greedy = greedy
35
+ self.n_agent = n_agent
36
+ self.metaAgentID = meta_agent_id
37
+ self.global_step = global_step
38
+ self.k_size = K_SIZE
39
+ self.save_image = save_image
40
+ self.tta = TAXABIND_TTA
41
+ self.clip_seg_tta = clip_seg_tta
42
+
43
+ self.env = Env(map_index=self.global_step, n_agent=n_agent, k_size=self.k_size, plot=save_image, test=True)
44
+ self.local_policy_net = policy_net
45
+
46
+ self.robot_list = []
47
+ self.all_robot_positions = []
48
+ for i in range(self.n_agent):
49
+ # robot_position = self.env.node_coords[i]
50
+ robot_position = self.env.start_positions[i]
51
+ robot = Robot(robot_id=i, position=robot_position, plot=save_image)
52
+ self.robot_list.append(robot)
53
+ self.all_robot_positions.append(robot_position)
54
+
55
+ self.perf_metrics = dict()
56
+ self.bad_mask_init = False
57
+
58
+ # # TEMP - EXPORT START POSES FOR BASELINES
59
+ # json_path = "eval_start_positions.json"
60
+ # sat_to_start_pose_dict = {}
61
+ # print("len(self.env.map_list): ", len(self.env.map_list))
62
+ # for i in range(4000):
63
+ # print("i: ", i)
64
+ # map_idx = i % len(self.env.map_list)
65
+ # _, map_start_position = self.env.import_ground_truth(os.path.join(self.env.map_dir, self.env.map_list[map_idx]))
66
+ # self.clip_seg_tta.reset(sample_idx=i)
67
+ # sat_path = self.clip_seg_tta.gt_mask_name
68
+ # sat_to_start_pose_dict[sat_path] = tuple(map(int, map_start_position))
69
+ # # Save to json
70
+ # with open(json_path, 'w') as f:
71
+ # json.dump(sat_to_start_pose_dict, f)
72
+ # print("len(sat_to_start_pose_dict): ", len(sat_to_start_pose_dict))
73
+ # exit()
74
+
75
+ if self.tta:
76
+ # NOTE: Moved to test_driver.py for efficiency (avoid repeated init)
77
+ # self.clip_seg_tta = ClipSegTTA(
78
+ # img_dir=TAXABIND_IMG_DIR,
79
+ # imo_dir=TAXABIND_IMO_DIR,
80
+ # json_path=TAXABIND_INAT_JSON_PATH,
81
+ # sat_to_img_ids_json_path=TAXABIND_SAT_TO_IMG_IDS_JSON_PATH,
82
+ # patch_size=TAXABIND_PATCH_SIZE,
83
+ # sat_checkpoint_path=TAXABIND_SAT_CHECKPOINT_PATH,
84
+ # sample_index = TAXABIND_SAMPLE_INDEX, #global_step
85
+ # blur_kernel = TAXABIND_GAUSSIAN_BLUR_KERNEL,
86
+ # device=self.device,
87
+ # sat_to_img_ids_json_is_train_dict=False # for search ds val
88
+ # # sat_filtered_json_path=TAXABIND_FILTERED_INAT_JSON_PATH,
89
+ # )
90
+ if clip_seg_tta is not None:
91
+ self.clip_seg_tta.reset(sample_idx=self.global_step)
92
+ # print("Resetting for sample index: ", self.global_step)
93
+
94
+ # Override target positions in env
95
+ self.env.target_positions = [(pose[1], pose[0]) for pose in self.clip_seg_tta.target_positions] # Must transpose to (y, x) format
96
+
97
+ # Override GT seg mask for info_gain metric
98
+ if OVERRIDE_GT_MASK_DIR != "":
99
+ self.tta_gt_seg_path = os.path.join(OVERRIDE_GT_MASK_DIR, self.clip_seg_tta.gt_mask_name)
100
+ print("self.clip_seg_tta.gt_mask_name: ", self.clip_seg_tta.gt_mask_name)
101
+ if os.path.exists(self.tta_gt_seg_path):
102
+ self.env.gt_segmentation_mask = self.env.import_segmentation_mask(self.tta_gt_seg_path)
103
+ else:
104
+ print("\n\n!!!!!! WARNING: GT mask not found at path: ", self.tta_gt_seg_path)
105
+
106
+ if not USE_CLIP_PREDS and OVERRIDE_MASK_DIR != "":
107
+ # mask_name = self.clip_seg_tta.gt_mask_name.split('_')[:-TAX_HIERARCHY_TO_CONDENSE]
108
+ # mask_name = '_'.join(mask_name) + ".png"
109
+ score_mask_path = os.path.join(OVERRIDE_MASK_DIR, self.clip_seg_tta.gt_mask_name)
110
+ print("score_mask_path: ", score_mask_path)
111
+ if os.path.exists(score_mask_path):
112
+ self.env.segmentation_mask = self.env.import_segmentation_mask(score_mask_path)
113
+ self.env.begin(self.env.map_start_position)
114
+ else:
115
+ print(f"\n\n{RED}!!!!!! ERROR: Trying to override, but score mask not found at path:{NC} ", score_mask_path)
116
+ self.bad_mask_init = True
117
+
118
+
119
+
120
+ # # # # TEMP: Additional targets
121
+ # self.env.target_positions = [] # Reset
122
+ # print("self.env.target_positions", self.env.target_positions)
123
+ # # self.env.target_positions = [self.env.target_positions[2]]
124
+ # # self.env.target_positions = [self.env.target_positions[0]]
125
+ # # self.env.target_positions.append((251, 297))
126
+ # self.env.target_positions.append((40,40))
127
+ # self.env.target_positions.append((80,40))
128
+ # self.env.target_positions.append((120,40))
129
+ # self.env.target_positions.append((160,40))
130
+ # self.env.target_positions.append((200,40))
131
+
132
+ # Save clustered embeds from sat encoder
133
+ # In theory, we only need to do this once (same satellite map throughout)
134
+ if USE_CLIP_PREDS:
135
+ self.kmeans_clusterer = CombinedSilhouetteInertiaClusterer(
136
+ k_min=1,
137
+ k_max=8,
138
+ k_avg_max=4,
139
+ silhouette_threshold=0.15,
140
+ relative_threshold=0.15,
141
+ random_state=0,
142
+ min_patch_size=5, # smoothing parameter
143
+ n_smooth_iter=2, # smoothing parameter
144
+ ignore_label=-1,
145
+ plot=self.save_image,
146
+ gifs_dir = gifs_path
147
+ )
148
+ # Fit & predict (this will also plot the clusters before & after smoothing)
149
+ map_shape = (int(np.sqrt(self.clip_seg_tta.patch_embeds.shape[0])), int(np.sqrt(self.clip_seg_tta.patch_embeds.shape[0])))
150
+ self.kmeans_sat_embeds_clusters = self.kmeans_clusterer.fit_predict(
151
+ patch_embeds=self.clip_seg_tta.patch_embeds,
152
+ map_shape=map_shape,
153
+ )
154
+ print("Chosen k:", self.kmeans_clusterer.final_k)
155
+ # print("Smoothed labels shape:", self.kmeans_sat_embeds_clusters.shape)
156
+
157
+ if EXECUTE_TTA:
158
+ print("Will execute TTA...")
159
+
160
+ # Define Poisson TTA params
161
+ self.step_since_tta = 0
162
+ self.steps_to_first_tgt = None
163
+ self.steps_to_mid_tgt = None
164
+ self.steps_to_last_tgt = None
165
+
166
+ self.sim_0perc = None
167
+ self.sim_25perc = None
168
+ self.sim_50perc = None
169
+ self.sim_75perc = None
170
+ self.sim_100perc = None
171
+
172
+
173
+ def run_episode(self, curr_episode):
174
+
175
+ # Return all metrics as None if faulty mask init
176
+ if self.bad_mask_init:
177
+ self.perf_metrics['tax'] = None
178
+ self.perf_metrics['tax_first'] = None
179
+ self.perf_metrics['travel_dist'] = None
180
+ self.perf_metrics['travel_steps'] = None
181
+ self.perf_metrics['steps_to_first_tgt'] = None
182
+ self.perf_metrics['steps_to_mid_tgt'] = None
183
+ self.perf_metrics['steps_to_last_tgt'] = None
184
+ self.perf_metrics['explored_rate'] = None
185
+ self.perf_metrics['targets_found'] = None
186
+ self.perf_metrics['targets_total'] = None
187
+ self.perf_metrics['sim_0perc'] = None
188
+ self.perf_metrics['sim_25perc'] = None
189
+ self.perf_metrics['sim_50perc'] = None
190
+ self.perf_metrics['sim_75perc'] = None
191
+ self.perf_metrics['sim_100perc'] = None
192
+ self.perf_metrics['kmeans_k'] = None
193
+ self.perf_metrics['tgts_gt_score'] = None
194
+ self.perf_metrics['clip_inference_time'] = None
195
+ self.perf_metrics['tta_time'] = None
196
+ self.perf_metrics['info_gain'] = None
197
+ self.perf_metrics['total_info'] = None
198
+ self.perf_metrics['success_rate'] = None
199
+ return
200
+
201
+ eps_start = time()
202
+ done = False
203
+ for robot_id, deciding_robot in enumerate(self.robot_list):
204
+ deciding_robot.observations = self.get_observations(deciding_robot.robot_position)
205
+ if self.tta and USE_CLIP_PREDS:
206
+ self.env.segmentation_info_mask = np.expand_dims(self.clip_seg_tta.heatmap.T.flatten(), axis=1)
207
+ self.env.segmentation_info_mask_unnormalized = np.expand_dims(self.clip_seg_tta.heatmap_unnormalized.T.flatten(), axis=1)
208
+ # print("self.env.segmentation_info_mask.shape", self.env.segmentation_info_mask.shape)
209
+
210
+ ### Run episode for 128 steps ###
211
+ for step in range(NUM_EPS_STEPS):
212
+
213
+ # print("\n\n\n~~~~~~~~~~~~~~~~~~~~~~ Step: ", step, " ~~~~~~~~~~~~~~~~~~~~~~")
214
+
215
+ next_position_list = []
216
+ dist_list = []
217
+ travel_dist_list = []
218
+ dist_array = np.zeros((self.n_agent, 1))
219
+ for robot_id, deciding_robot in enumerate(self.robot_list):
220
+ observations = deciding_robot.observations
221
+ # if self.env.node_coords.shape[0] >= self.k_size:
222
+ # deciding_robot.save_observations(observations)
223
+
224
+ ### Forward pass through policy to get next position ###
225
+ next_position, action_index = self.select_node(observations)
226
+ # if self.env.node_coords.shape[0] >= self.k_size:
227
+ # deciding_robot.save_action(action_index)
228
+
229
+ dist = np.linalg.norm(next_position - deciding_robot.robot_position)
230
+
231
+ ### Log results of action (e.g. distance travelled) ###
232
+ dist_array[robot_id] = dist
233
+ dist_list.append(dist)
234
+ travel_dist_list.append(deciding_robot.travel_dist)
235
+ next_position_list.append(next_position)
236
+ self.all_robot_positions[robot_id] = next_position
237
+
238
+ arriving_sequence = np.argsort(dist_list)
239
+ next_position_list = np.array(next_position_list)
240
+ dist_list = np.array(dist_list)
241
+ travel_dist_list = np.array(travel_dist_list)
242
+ next_position_list = next_position_list[arriving_sequence]
243
+ dist_list = dist_list[arriving_sequence]
244
+ travel_dist_list = travel_dist_list[arriving_sequence]
245
+
246
+ ### Take Action (Deconflict if 2 agents choose the same target position) ###
247
+ next_position_list, dist_list = self.solve_conflict(arriving_sequence, next_position_list, dist_list)
248
+ # dist_travelled = np.linalg.norm(next_position - deciding_robot.robot_position)
249
+ # deciding_robot.travel_dist += dist_travelled
250
+ # deciding_robot.robot_position = next_position
251
+ reward_list, done = self.env.multi_robot_step(next_position_list, dist_list, travel_dist_list)
252
+
253
+ ### Update observations + rewards from action ###
254
+ for reward, robot_id in zip(reward_list, arriving_sequence):
255
+ robot = self.robot_list[robot_id]
256
+ robot.save_trajectory_coords(self.env.find_index_from_coords(robot.robot_position), self.env.num_new_targets_found)
257
+
258
+ # # TTA Update via Poisson Test (with KMeans clustering stats)
259
+ if self.tta and USE_CLIP_PREDS and EXECUTE_TTA:
260
+ self.poisson_tta_update(robot, self.global_step, step)
261
+
262
+ robot.observations = self.get_observations(robot.robot_position)
263
+ # if self.env.node_coords.shape[0] >= self.k_size:
264
+ robot.save_reward_done(reward, done)
265
+ # robot.save_next_observations(robot.observations)
266
+
267
+ # Update metrics
268
+ # NOTE: For 1 robot for now
269
+ self.log_metrics(step=step) # robot.targets_found_on_path)
270
+
271
+
272
+ ### Save a frame to generate gif of robot trajectories ###
273
+ if self.save_image:
274
+ robots_route = []
275
+ for robot in self.robot_list:
276
+ robots_route.append([robot.xPoints, robot.yPoints])
277
+ if not os.path.exists(gifs_path):
278
+ os.makedirs(gifs_path)
279
+ sound_id_override = None if self.clip_seg_tta.sound_ids == [] else self.clip_seg_tta.sound_ids[0]
280
+ if TAXABIND_TTA and USE_CLIP_PREDS:
281
+ self.env.plot_env(
282
+ self.global_step,
283
+ gifs_path,
284
+ step,
285
+ max(travel_dist_list),
286
+ robots_route,
287
+ img_path_override=self.clip_seg_tta.img_paths[0], # Viz 1st
288
+ sat_path_override=self.clip_seg_tta.imo_path,
289
+ msk_name_override=self.clip_seg_tta.species_name,
290
+ sound_id_override=sound_id_override,
291
+ colormap_mid_val=np.max(self.clip_seg_tta.heatmap_unnormalized_initial)
292
+ )
293
+ else:
294
+ self.env.plot_env(
295
+ self.global_step,
296
+ gifs_path,
297
+ step,
298
+ max(travel_dist_list),
299
+ robots_route,
300
+ img_path_override=self.clip_seg_tta.img_paths[0], # Viz 1st
301
+ sat_path_override=self.clip_seg_tta.imo_path,
302
+ msk_name_override=self.clip_seg_tta.species_name,
303
+ sound_id_override=sound_id_override,
304
+ )
305
+
306
+ if done:
307
+ break
308
+
309
+ if self.tta:
310
+ tax = Path(self.clip_seg_tta.gt_mask_name).stem
311
+ self.perf_metrics['tax'] = " ".join(tax.split("_")[1:])
312
+ self.perf_metrics['tax_first'] = tax.split("_")[1]
313
+ else:
314
+ self.perf_metrics['tax'] = None
315
+ self.perf_metrics['tax_first'] = None
316
+ self.perf_metrics['travel_dist'] = max(travel_dist_list)
317
+ self.perf_metrics['travel_steps'] = step + 1
318
+ self.perf_metrics['steps_to_first_tgt'] = self.steps_to_first_tgt
319
+ self.perf_metrics['steps_to_mid_tgt'] = self.steps_to_mid_tgt
320
+ self.perf_metrics['steps_to_last_tgt'] = self.steps_to_last_tgt
321
+ self.perf_metrics['explored_rate'] = self.env.explored_rate
322
+ self.perf_metrics['targets_found'] = self.env.targets_found_rate
323
+ self.perf_metrics['targets_total'] = len(self.env.target_positions)
324
+ self.perf_metrics['sim_0perc'] = self.sim_0perc
325
+ self.perf_metrics['sim_25perc'] = self.sim_25perc
326
+ self.perf_metrics['sim_50perc'] = self.sim_50perc
327
+ self.perf_metrics['sim_75perc'] = self.sim_75perc
328
+ self.perf_metrics['sim_100perc'] = self.sim_100perc
329
+ if USE_CLIP_PREDS:
330
+ self.perf_metrics['kmeans_k'] = self.kmeans_clusterer.final_k
331
+ self.perf_metrics['tgts_gt_score'] = self.clip_seg_tta.tgts_gt_score
332
+ self.perf_metrics['clip_inference_time'] = self.clip_seg_tta.clip_inference_time
333
+ self.perf_metrics['tta_time'] = self.clip_seg_tta.tta_time
334
+ else:
335
+ self.perf_metrics['kmeans_k'] = None
336
+ self.perf_metrics['tgts_gt_score'] = None
337
+ self.perf_metrics['clip_inference_time'] = None
338
+ self.perf_metrics['tta_time'] = None
339
+ if OVERRIDE_GT_MASK_DIR != "" and os.path.exists(self.tta_gt_seg_path):
340
+ self.perf_metrics['info_gain'] = self.env.info_gain
341
+ self.perf_metrics['total_info'] = self.env.total_info
342
+ else:
343
+ self.perf_metrics['info_gain'] = None
344
+ self.perf_metrics['total_info'] = None
345
+ if FORCE_LOGGING_DONE_TGTS_FOUND and self.env.targets_found_rate == 1.0:
346
+ self.perf_metrics['success_rate'] = True
347
+ else:
348
+ self.perf_metrics['success_rate'] = done
349
+
350
+ # save gif
351
+ if self.save_image:
352
+ path = gifs_path
353
+ self.make_gif(path, curr_episode)
354
+
355
+ print(YELLOW, f"[Eps {curr_episode} Completed] Time Taken: {time()-eps_start:.2f}s, Steps: {step+1}", NC)
356
+
357
+ def get_observations(self, robot_position):
358
+ """ Get robot's sensor observation of environment given position """
359
+ current_node_index = self.env.find_index_from_coords(robot_position)
360
+ current_index = torch.tensor([current_node_index]).unsqueeze(0).unsqueeze(0).to(self.device) # (1,1,1)
361
+
362
+ node_coords = copy.deepcopy(self.env.node_coords)
363
+ graph = copy.deepcopy(self.env.graph)
364
+ node_utility = copy.deepcopy(self.env.node_utility)
365
+ guidepost = copy.deepcopy(self.env.guidepost)
366
+ # segmentation_info_mask = copy.deepcopy(self.env.segmentation_info_mask)
367
+ segmentation_info_mask = copy.deepcopy(self.env.filtered_seg_info_mask)
368
+
369
+ # ADDED - SEGMENTATION INFORMATION MASK
370
+ n_nodes = node_coords.shape[0]
371
+
372
+ node_coords = node_coords / 640
373
+ node_utility = node_utility / 50
374
+
375
+ node_utility_inputs = node_utility.reshape((n_nodes, 1))
376
+
377
+ occupied_node = np.zeros((n_nodes, 1))
378
+ for position in self.all_robot_positions:
379
+ index = self.env.find_index_from_coords(position)
380
+ if index == current_index.item():
381
+ occupied_node[index] = -1
382
+ else:
383
+ occupied_node[index] = 1
384
+
385
+ # node_inputs = np.concatenate((node_coords, node_utility_inputs, guidepost, occupied_node), axis=1)
386
+ node_inputs = np.concatenate((node_coords, segmentation_info_mask, guidepost), axis=1)
387
+ # node_inputs = np.concatenate((node_coords, segmentation_info_mask, guidepost), axis=1)
388
+ node_inputs = torch.FloatTensor(node_inputs).unsqueeze(0).to(self.device) # (1, node_padding_size+1, 3)
389
+
390
+ # assert node_coords.shape[0] < self.node_padding_size
391
+ # padding = torch.nn.ZeroPad2d((0, 0, 0, self.node_padding_size - node_coords.shape[0]))
392
+ # node_inputs = padding(node_inputs)
393
+
394
+ # node_padding_mask = torch.zeros((1, 1, node_coords.shape[0]), dtype=torch.int64).to(self.device)
395
+ # node_padding = torch.ones((1, 1, self.node_padding_size - node_coords.shape[0]), dtype=torch.int64).to(
396
+ # self.device)
397
+ # node_padding_mask = torch.cat((node_padding_mask, node_padding), dim=-1)
398
+ node_padding_mask = None
399
+
400
+ graph = list(graph.values())
401
+ edge_inputs = []
402
+ for node in graph:
403
+ node_edges = list(map(int, node))
404
+ edge_inputs.append(node_edges)
405
+
406
+ bias_matrix = self.calculate_edge_mask(edge_inputs)
407
+ edge_mask = torch.from_numpy(bias_matrix).float().unsqueeze(0).to(self.device)
408
+
409
+ # assert len(edge_inputs) < self.node_padding_size
410
+ # padding = torch.nn.ConstantPad2d(
411
+ # (0, self.node_padding_size - len(edge_inputs), 0, self.node_padding_size - len(edge_inputs)), 1)
412
+ # edge_mask = padding(edge_mask)
413
+ # padding2 = torch.nn.ZeroPad2d((0, 0, 0, self.node_padding_size - len(edge_inputs)))
414
+
415
+ for edges in edge_inputs:
416
+ while len(edges) < self.k_size:
417
+ edges.append(0)
418
+
419
+ edge_inputs = torch.tensor(edge_inputs).unsqueeze(0).to(self.device) # (1, node_padding_size+1, k_size)
420
+ # edge_inputs = padding2(edge_inputs)
421
+
422
+ edge_padding_mask = torch.zeros((1, len(edge_inputs), K_SIZE), dtype=torch.int64).to(self.device)
423
+ one = torch.ones_like(edge_padding_mask, dtype=torch.int64).to(self.device)
424
+ edge_padding_mask = torch.where(edge_inputs == 0, one, edge_padding_mask)
425
+
426
+ observations = node_inputs, edge_inputs, current_index, node_padding_mask, edge_padding_mask, edge_mask
427
+ return observations
428
+
429
+
430
+ def select_node(self, observations):
431
+ """ Forward pass through policy to get next position to go to on map """
432
+ node_inputs, edge_inputs, current_index, node_padding_mask, edge_padding_mask, edge_mask = observations
433
+ with torch.no_grad():
434
+ logp_list = self.local_policy_net(node_inputs, edge_inputs, current_index, node_padding_mask, edge_padding_mask, edge_mask)
435
+
436
+ if self.greedy:
437
+ action_index = torch.argmax(logp_list, dim=1).long()
438
+ else:
439
+ action_index = torch.multinomial(logp_list.exp(), 1).long().squeeze(1)
440
+
441
+ next_node_index = edge_inputs[:, current_index.item(), action_index.item()]
442
+
443
+ next_position = self.env.node_coords[next_node_index]
444
+
445
+ return next_position, action_index
446
+
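As a small illustration (toy, assumed 3-action distribution) of the greedy vs. sampled selection used in select_node above:

import torch

logp_list = torch.log(torch.tensor([[0.1, 0.7, 0.2]]))                 # (1, k) log-probabilities over neighbours
greedy_idx = torch.argmax(logp_list, dim=1).long()                     # always tensor([1])
sampled_idx = torch.multinomial(logp_list.exp(), 1).long().squeeze(1)  # usually 1, occasionally 0 or 2
print(greedy_idx.item(), sampled_idx.item())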
447
+ def solve_conflict(self, arriving_sequence, next_position_list, dist_list):
448
+ """ Deconflict if 2 agents choose the same target position """
449
+ for j, [robot_id, next_position] in enumerate(zip(arriving_sequence, next_position_list)):
450
+ moving_robot = self.robot_list[robot_id]
451
+ # if next_position[0] + next_position[1] * 1j in (next_position_list[:, 0] + next_position_list[:, 1] * 1j)[:j]:
452
+ # dist_to_next_position = np.argsort(np.linalg.norm(self.env.node_coords - next_position, axis=1))
453
+ # k = 0
454
+ # while next_position[0] + next_position[1] * 1j in (next_position_list[:, 0] + next_position_list[:, 1] * 1j)[:j]:
455
+ # k += 1
456
+ # next_position = self.env.node_coords[dist_to_next_position[k]]
457
+
458
+ dist = np.linalg.norm(next_position - moving_robot.robot_position)
459
+ next_position_list[j] = next_position
460
+ dist_list[j] = dist
461
+ moving_robot.travel_dist += dist
462
+ moving_robot.robot_position = next_position
463
+
464
+ return next_position_list, dist_list
465
+
466
+ def work(self, currEpisode):
467
+ '''
468
+ Interacts with the environment. The agent gets either gradients or an experience buffer.
469
+ '''
470
+ self.run_episode(currEpisode)
471
+
472
+ def calculate_edge_mask(self, edge_inputs):
473
+ size = len(edge_inputs)
474
+ bias_matrix = np.ones((size, size))
475
+ for i in range(size):
476
+ for j in range(size):
477
+ if j in edge_inputs[i]:
478
+ bias_matrix[i][j] = 0
479
+ return bias_matrix
480
+
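For reference, a toy run of the same masking logic (assumed 3-node graph) showing that neighbours get 0 and non-neighbours get 1 in the attention bias matrix:

import numpy as np

def calculate_edge_mask(edge_inputs):
    # Same logic as the method above: 0 where an edge exists, 1 otherwise.
    size = len(edge_inputs)
    bias_matrix = np.ones((size, size))
    for i in range(size):
        for j in range(size):
            if j in edge_inputs[i]:
                bias_matrix[i][j] = 0
    return bias_matrix

# Node 0 <-> 1 <-> 2, with self-loops:
print(calculate_edge_mask([[0, 1], [1, 0, 2], [2, 1]]))
# [[0. 0. 1.]
#  [0. 0. 0.]
#  [1. 0. 0.]]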
481
+ def make_gif(self, path, n):
482
+ """ Generate a gif given list of images """
483
+ with imageio.get_writer('{}/{}_target_rate_{:.2f}.gif'.format(path, n, self.env.targets_found_rate), mode='I',
484
+ fps=5) as writer:
485
+ for frame in self.env.frame_files:
486
+ image = imageio.imread(frame)
487
+ writer.append_data(image)
488
+ print('gif complete\n')
489
+
490
+ # Remove files
491
+ for filename in self.env.frame_files[:-1]:
492
+ os.remove(filename)
493
+
494
+ # For KMeans clusterer gif during TTA
495
+ if self.tta:
496
+
497
+ # print("self.kmeans_clusterer.kmeans_frame_files", self.kmeans_clusterer.kmeans_frame_files)
498
+ with imageio.get_writer('{}/{}_kmeans_stats.gif'.format(path, n), mode='I',
499
+ fps=5) as writer:
500
+ for frame in self.kmeans_clusterer.kmeans_frame_files:
501
+ image = imageio.imread(frame)
502
+ writer.append_data(image)
503
+ print('Kmeans Clusterer gif complete\n')
504
+
505
+ # Remove files
506
+ for filename in self.kmeans_clusterer.kmeans_frame_files[:-1]:
507
+ os.remove(filename)
508
+
509
+ ################################################################################
510
+ # ADDED
511
+ ################################################################################
512
+
513
+ def log_metrics(self, step):
514
+ # Update tgt found metrics
515
+ if self.steps_to_first_tgt is None and self.env.num_targets_found == 1:
516
+ self.steps_to_first_tgt = step + 1
517
+ if self.steps_to_mid_tgt is None and self.env.num_targets_found == int(len(self.env.target_positions) / 2):
518
+ self.steps_to_mid_tgt = step + 1
519
+ if self.steps_to_last_tgt is None and self.env.num_targets_found == len(self.env.target_positions):
520
+ self.steps_to_last_tgt = step + 1
521
+
522
+ # Update sim metrics
523
+ if OVERRIDE_GT_MASK_DIR != "" and os.path.exists(self.tta_gt_seg_path):
524
+ side_dim = int(np.sqrt(self.env.segmentation_info_mask.shape[0]))
525
+ pred_mask = self.env.segmentation_info_mask.squeeze().reshape((side_dim, side_dim)).T
526
+ gt_mask = self.env.gt_segmentation_info_mask.squeeze().reshape((side_dim, side_dim)).T
527
+ if step == 0:
528
+ self.sim_0perc = self.norm_distance(pred_mask, gt_mask)
529
+ elif step == int(NUM_EPS_STEPS * 0.25):
530
+ self.sim_25perc = self.norm_distance(pred_mask, gt_mask)
531
+ elif step == int(NUM_EPS_STEPS * 0.5):
532
+ self.sim_50perc = self.norm_distance(pred_mask, gt_mask)
533
+ elif step == int(NUM_EPS_STEPS * 0.75):
534
+ self.sim_75perc = self.norm_distance(pred_mask, gt_mask)
535
+ elif step == NUM_EPS_STEPS - 1:
536
+ self.sim_100perc = self.norm_distance(pred_mask, gt_mask)
537
+
538
+ def norm_distance(self, P, Q, norm_type="L2"):
539
+
540
+ # Normalize both grids to [0,1]
541
+ try:
542
+ if P.max() != P.min():
543
+ P_norm = (P - P.min()) / (P.max() - P.min())
544
+ else:
545
+ P_norm = P
546
+ if Q.max() != Q.min():
547
+ Q_norm = (Q - Q.min()) / (Q.max() - Q.min())
548
+ else:
549
+ Q_norm = Q
550
+ except FloatingPointError as e:
551
+ print(f"{RED}Caught floating point error:{NC} {e}")
552
+ print("P min/max:", P.min(), P.max())
553
+ print("Q min/max:", Q.min(), Q.max())
554
+ print("Q: ", Q)
555
+ similarity = None
556
+ return similarity
557
+
558
+ if norm_type == "L1":
559
+ num_cells = P.shape[0] * P.shape[1]
560
+ # L1 distance: sum of absolute differences
561
+ l1_dist = np.sum(np.abs(P_norm - Q_norm))
562
+ # Normalize: maximum L1 distance is num_cells (if every cell differs by 1)
563
+ similarity = 1 - (l1_dist / num_cells)
564
+ elif norm_type == "L2":
565
+ # L2 distance via Root Mean Squared Error (RMSE)
566
+ rmse = np.sqrt(np.mean((P_norm - Q_norm)**2))
567
+ # Since both grids are in [0,1], maximum RMSE is 1.
568
+ similarity = 1 - rmse
569
+ else:
570
+ raise ValueError("norm_type must be either 'L1' or 'L2'")
571
+
572
+ return similarity
573
+
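A worked example of the L2 branch above on tiny, assumed 2x2 grids (both normalised to [0,1] before the RMSE):

import numpy as np

P = np.array([[0.0, 1.0], [2.0, 3.0]])          # un-normalised prediction heatmap
Q = np.array([[0.0, 1.0], [1.0, 0.0]])          # ground-truth heatmap, already in [0, 1]

P_norm = (P - P.min()) / (P.max() - P.min())    # [[0, 1/3], [2/3, 1]]
rmse = np.sqrt(np.mean((P_norm - Q) ** 2))
similarity = 1 - rmse                           # 1.0 would mean identical heatmaps
print(round(similarity, 3))                     # 0.376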
574
+ def transpose_flat_idx(self, idx, H=NUM_COORDS_HEIGHT, W=NUM_COORDS_WIDTH):
575
+ """
576
+ Given a flattened index X in an H x W (row-major) matrix,
577
+ return the new index X' after transposing the matrix.
578
+ """
579
+ row = idx // W
580
+ col = idx % W
581
+ idx_T = col * H + row
582
+ return idx_T
583
+
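A quick worked example of the index transposition above, for an assumed 3 x 4 grid:

H, W = 3, 4
idx = 7                       # row 1, col 3 in the 3x4 row-major grid
row, col = idx // W, idx % W
idx_T = col * H + row         # 3*3 + 1 = 10 in the transposed 4x3 grid
print(row, col, idx_T)        # 1 3 10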
584
+ def poisson_tta_update(self, robot, episode, step):
585
+
586
+ # TODO: Move into TTA loop to save computation
587
+ # Generate Kmeans Clusters Stats
588
+ visited_indices = [self.transpose_flat_idx(idx) for idx in robot.trajectory_coords]
589
+ region_stats_dict = self.kmeans_clusterer.compute_region_statistics(
590
+ self.kmeans_sat_embeds_clusters,
591
+ self.clip_seg_tta.heatmap_unnormalized,
592
+ visited_indices,
593
+ episode_num=episode,
594
+ step_num=step
595
+ )
596
+
597
+ # Prep & execute TTA
598
+ self.step_since_tta += 1
599
+ if robot.targets_found_on_path[-1] or self.step_since_tta % STEPS_PER_TTA == 0: # Allow even if no positive at start
600
+
601
+ # for _ in range(NUM_TTA_STEPS):
602
+
603
+ filt_traj_coords = [self.transpose_flat_idx(idx) for idx in robot.trajectory_coords]
604
+ filt_targets_found_on_path = robot.targets_found_on_path
605
+ num_cells = self.clip_seg_tta.heatmap.shape[0] * self.clip_seg_tta.heatmap.shape[1]
606
+ # per_sample_weight_scale = [min(1.0, (step/num_cells)) for _ in filt_traj_coords]
607
+ # per_sample_weight_scale = [0.5 * min(1.0, (step/100)) for _ in filt_traj_coords]
608
+ # per_sample_weight_scale = [1.0 for _ in filt_traj_coords]
609
+ pos_sample_weight_scale, neg_sample_weight_scale = [], []
610
+ for i, sample_loc in enumerate(filt_traj_coords):
611
+ label = self.kmeans_clusterer.get_label_id(sample_loc)
612
+ num_patches = region_stats_dict[label]['num_patches']
613
+ patches_visited = region_stats_dict[label]['patches_visited']
614
+ expectation = region_stats_dict[label]['expectation']
615
+
616
+ ## BEST so far: exponent like focal loss to wait for more samples before confidently decreasing
617
+ pos_weight = 4.0 # 2.0
618
+ # pos_weight = 1.0 + 4.0 * min(1.0, (patches_visited/(num_patches))**GAMMA_EXPONENT) # (1,5)
619
+ # pos_weight = 1.0 + 4.0 * min(1.0, (patches_visited/(3*expectation))**GAMMA_EXPONENT)
620
+ # neg_weight = min(1.0, (patches_visited/(3*expectation))**GAMMA_EXPONENT)
621
+ neg_weight = min(1.0, (patches_visited/(3*num_patches))**GAMMA_EXPONENT) # (0,1)
622
+ pos_sample_weight_scale.append(pos_weight)
623
+ neg_sample_weight_scale.append(neg_weight)
624
+
625
+ ## Prelim thoughts (BAD - quickly reduces low-probability regions even with few samples)
626
+ # neg_weight = min(1.0, patches_visited/(3*expectation))
627
+ # local_probs = self.kmeans_clusterer.get_probs(sample_loc, self.clip_seg_tta.heatmap)
628
+ # neg_weight = min(local_probs/2, patches_visited/(3*expectation)) # 2*expectation (if don't want TTA scheduler - 3x TTA)
629
+ # neg_weight = min(1.0, patches_visited/num_patches) # 2*expectation (if don't want TTA scheduler - 3x TTA)
630
+
631
+ ## Hacky, but works better (does not decrease low-probability regions too fast)
632
+ # if label == 0:
633
+ # neg_weight = min(0.5, patches_visited/(3*expectation))
634
+ # else:
635
+ # neg_weight = min(0.05, patches_visited/(3*expectation))
636
+ # squared
637
+
638
+ # Adaptive LR (as samples increase, increase LR to fit more datapoints - else won't update)
639
+ adaptive_lr = MIN_LR + (MAX_LR - MIN_LR) * (step / num_cells)
640
+ # print("!!! adaptive_lr", adaptive_lr)
641
+ # adaptive_lr = 2e-6
642
+
643
+ # NOTE: Not as good as adaptive LR (because it is discrete)
644
+ # # Num TTA steps scheduler
645
+ # min_tta_steps = 3
646
+ # max_tta_steps = 10
647
+ # num_tta_steps = int((max_tta_steps - min_tta_steps) * (step / num_cells) + min_tta_steps)
648
+ # print("!!! num_tta_steps", num_tta_steps)
649
+
650
+ # TTA Update
651
+ self.clip_seg_tta.execute_tta(
652
+ filt_traj_coords,
653
+ filt_targets_found_on_path,
654
+ tta_steps=NUM_TTA_STEPS,
655
+ lr=adaptive_lr,
656
+ pos_sample_weight=pos_sample_weight_scale,
657
+ neg_sample_weight=neg_sample_weight_scale,
658
+ modality=MODALITY,
659
+ query_variety=QUERY_VARIETY,
660
+ target_found_idxs=self.env.target_found_idxs,
661
+ reset_weights=RESET_WEIGHTS
662
+ )
663
+ self.env.segmentation_info_mask = np.expand_dims(self.clip_seg_tta.heatmap.T.flatten(), axis=1)
664
+ self.env.segmentation_info_mask_unnormalized = np.expand_dims(self.clip_seg_tta.heatmap_unnormalized.T.flatten(), axis=1)
665
+ self.step_since_tta = 0
666
+
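To make the weighting and learning-rate schedules above concrete, a small worked example with assumed counts (GAMMA_EXPONENT=2, MIN_LR=1e-6, MAX_LR=1e-5, and a 24x24 grid of 576 cells):

GAMMA_EXPONENT, MIN_LR, MAX_LR, num_cells = 2, 1e-6, 1e-5, 576

# Negative-sample weight grows slowly with visits to a cluster (focal-loss-like exponent).
num_patches = 100
for patches_visited in (10, 50, 100):
    neg_weight = min(1.0, (patches_visited / (3 * num_patches)) ** GAMMA_EXPONENT)
    print(patches_visited, round(neg_weight, 4))    # 0.0011, 0.0278, 0.1111

# Adaptive LR ramps linearly with episode progress.
step = 144                                          # a quarter of the way through 576 cells
adaptive_lr = MIN_LR + (MAX_LR - MIN_LR) * (step / num_cells)
print(adaptive_lr)                                  # 3.25e-06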
667
+ ################################################################################
668
+
669
+
670
+ # Main entry point
671
+ if __name__ == "__main__":
672
+
673
+ # CHANGE ME!
674
+ currEpisode = 0
675
+
676
+ # Prepare the model
677
+ # device = torch.device('cpu') #if USE_GPU_TRAINING else torch.device('cpu')
678
+ device = torch.device('cuda') if USE_GPU else torch.device('cpu')
679
+ policy_net = PolicyNet(INPUT_DIM, EMBEDDING_DIM).to(device)
680
+ # script_dir = os.path.dirname(os.path.abspath(__file__))
681
+ script_dir = Path(__file__).resolve().parent
682
+ print("real_script_dir: ", script_dir)
683
+ # checkpoint = torch.load(f'{script_dir}/modules/vlm_search/{model_path}/{MODEL_NAME}')
684
+ checkpoint = torch.load(f'{model_path}/{MODEL_NAME}')
685
+ policy_net.load_state_dict(checkpoint['policy_model'])
686
+ print('Model loaded!')
687
+ # print(next(policy_net.parameters()).device)
688
+
689
+ # Init Taxabind here (only need to init once)
690
+ if TAXABIND_TTA:
691
+ # self.clip_seg_tta = None
692
+ clip_seg_tta = ClipSegTTA(
693
+ img_dir=TAXABIND_IMG_DIR,
694
+ imo_dir=TAXABIND_IMO_DIR,
695
+ json_path=TAXABIND_INAT_JSON_PATH,
696
+ sat_to_img_ids_json_path=TAXABIND_SAT_TO_IMG_IDS_JSON_PATH,
697
+ patch_size=TAXABIND_PATCH_SIZE,
698
+ sat_checkpoint_path=TAXABIND_SAT_CHECKPOINT_PATH,
699
+ sample_index = 0, # Set using 'reset' in worker
700
+ blur_kernel = TAXABIND_GAUSSIAN_BLUR_KERNEL,
701
+ device=device,
702
+ sat_to_img_ids_json_is_train_dict=False, # for search ds val
703
+ tax_to_filter_val=QUERY_TAX,
704
+ load_model=USE_CLIP_PREDS,
705
+ initial_modality=INITIAL_MODALITY,
706
+ sound_data_path = TAXABIND_SOUND_DATA_PATH,
707
+ sound_checkpoint_path=TAXABIND_SOUND_CHECKPOINT_PATH,
708
+ # sat_filtered_json_path=TAXABIND_FILTERED_INAT_JSON_PATH,
709
+ )
710
+ print("ClipSegTTA Loaded!")
711
+ else:
712
+ clip_seg_tta = None
713
+
714
+ # Define TestWorker
715
+ planner = TestWorker(
716
+ meta_agent_id=0,
717
+ n_agent=1,
718
+ policy_net=policy_net,
719
+ global_step=3,
720
+ device='cuda',
721
+ greedy=True,
722
+ save_image=SAVE_GIFS,
723
+ clip_seg_tta=clip_seg_tta
724
+ )
725
+ planner.run_episode(currEpisode)
726
+
727
+ print("planner.perf_metrics: ", planner.perf_metrics)
test_parameter.py ADDED
@@ -0,0 +1,183 @@
1
+ ############################################################################################
2
+ # Name: test_parameter.py
3
+ #
4
+ # NOTE: Change all your hyper-params here!
5
+ # Simple How-To Guide:
6
+ # 1. CLIP TTA: USE_CLIP_PREDS = True, EXECUTE_TTA = True
7
+ # 2. CLIP (No TTA): USE_CLIP_PREDS = True, EXECUTE_TTA = False
8
+ # 3. Custom masks (e.g. LLMSeg): USE_CLIP_PREDS = False, EXECUTE_TTA = False
9
+ ############################################################################################
10
+
11
+ import os
12
+ import sys
13
+ sys.modules['TRAINING'] = False # False = Inference Testing
14
+
15
+ ###############################################################
16
+ OPT_VARS = {}
17
+ def getenv(var_name, default=None, cast_type=str):
18
+ try:
19
+ value = os.environ.get(var_name, None)
20
+ if value is None:
21
+ result = default
22
+ elif cast_type == bool:
23
+ result = value.lower() in ("true", "1", "yes")
24
+ else:
25
+ result = cast_type(value)
26
+ except (ValueError, TypeError):
27
+ result = default
28
+
29
+ OPT_VARS[var_name] = result # Log the result
30
+ return result
31
+ ###############################################################
32
+
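For instance (hypothetical values), parameters can be overridden from a launch script by exporting environment variables before this module is imported:

import os

os.environ["POLICY"] = "RL"
os.environ["SAVE_GIFS"] = "false"        # bools accept "true"/"1"/"yes", case-insensitive
os.environ["NUM_EPS_STEPS"] = "256"
os.environ["QUERY_TAX"] = "Animalia Chordata Mammalia Rodentia"

# import test_parameter                  # getenv() then picks these up and logs them in OPT_VARS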
33
+ POLICY = getenv("POLICY", default="RL", cast_type=str)
34
+ # TAX_HIERARCHY_TO_CONDENSE = 3 # Remove N layers of the taxonomy hierarchy from the back
35
+
36
+ NUM_TEST = 800 # Overridden if TAXABIND_TTA is True and performing search ds val
37
+ NUM_RUN = 1
38
+ SAVE_GIFS = getenv("SAVE_GIFS", default=True, cast_type=bool) # do you want to save GIFs
39
+ SAVE_TRAJECTORY = False # do you want to save per-step metrics
40
+ SAVE_LENGTH = False # do you want to save per-episode metrics
41
+ VIZ_GRAPH_EDGES = False # do you want to visualize the graph edges
42
+ # MODEL_NAME = "pure_coverage_no_pose_obs_230325_stage1.pth" # checkpoint.pth
43
+ # MODEL_NAME = "STAGE2_20k_vlm_search_24x24_290225_NO_TARGET_REWARDS_600steps.pth" # checkpoint.pth
44
+ # MODEL_NAME = "vlm_search_24x24_230225_NO_TARGET_REWARDS_600steps.pth" # checkpoint.pth
45
+ # MODEL_NAME = "vlm_search_20x20_200125_256steps_CORRECT_REWARDS.pth" # checkpoint.pth
46
+ MODEL_NAME = "STAGE1_vlm_search_24x24_040425_no_tgt_rewards_iNAT_DS_16k.pth"
47
+
48
+ NUM_EPS_STEPS = getenv("NUM_EPS_STEPS", default=384, cast_type=int)
49
+ TERMINATE_ON_TGTS_FOUND = False # Whether to terminate episode when all targets found
50
+ FORCE_LOGGING_DONE_TGTS_FOUND = True # Whether to force csv logging when all targets found
51
+ FIX_START_POSITION = getenv("FIX_START_POSITION", default=True, cast_type=bool) # Whether to fix the starting position of the robots (middle index)
52
+
53
+ ## Whether to override initial score mask from CLIP
54
+ USE_CLIP_PREDS = getenv("USE_CLIP_PREDS", default=True, cast_type=bool) # If false, use custom masks from OVERRIDE_MASK_DIR
55
+ OVERRIDE_MASK_DIR = getenv("OVERRIDE_MASK_DIR", default="/mnt/hdd/inat2021_ds/target_search_ds/NEW_3-20_Targets/FINAL_COUNTS_COMBINED_top4000_pos_top55000_neg/Baselines_Masks/LLaVA_GroundedSAM/v1_010425/LLaVA_GroundedSAM_out_mask_val_in/out_mask_val_in", cast_type=str)
56
+ # OVERRIDE_MASK_DIR = "/mnt/hdd/inat2021_ds/target_search_ds/NEW_3-20_Targets/COUNTS_COMBINED_top2000_pos_top22000_neg/score_maps/v1_200325/pred/Animalia_Chordata_Mammalia_Rodentia"
57
+ # OVERRIDE_MASK_DIR = "/mnt/hdd/inat2021_ds/target_search_ds/NEW_3-20_Targets/COUNTS_COMBINED_top2000_pos_top22000_neg/score_maps/v1_200325/pred/Animalia_Chordata_Mammalia_Artiodactyla"
58
+ # OVERRIDE_MASK_DIR = "/mnt/hdd/inat2021_ds/target_search_ds/NEW_3-20_Targets/COUNTS_COMBINED_top2000_pos_top22000_neg/score_maps/v1_200325/pred/Animalia_Arthropoda_Arachnida_Araneae"
59
+ # OVERRIDE_MASK_DIR = "/mnt/hdd/inat2021_ds/target_search_ds/NEW_3-20_Targets/COUNTS_COMBINED_top2000_pos_top22000_neg/score_maps/v1_200325/pred/Plantae_Tracheophyta_Magnoliopsida_Caryophyllales"
60
+ # OVERRIDE_MASK_DIR = "/mnt/hdd/inat2021_ds/target_search_ds/NEW_3-20_Targets/FINAL_COUNTS_COMBINED_top4000_pos_top55000_neg/Baselines_Masks/LLaVA_GroundedSAM/v1_010425/LLaVA_GroundedSAM_out_mask_val_in/out_mask_val_in"
61
+ # OVERRIDE_MASK_DIR = "/mnt/hdd/inat2021_ds/target_search_ds/NEW_3-20_Targets/FINAL_COUNTS_COMBINED_top4000_pos_top55000_neg/Baselines_Masks/LLaVA_GroundedSAM/v1_010425/LLaVA_GroundedSAM_out_mask_val_out/out_mask_val_out"
62
+
63
+ # Used to calculate the info_gain metric
64
+ OVERRIDE_GT_MASK_DIR = getenv("OVERRIDE_GT_MASK_DIR", default="", cast_type=str)
65
+ # OVERRIDE_GT_MASK_DIR = "/mnt/hdd/inat2021_ds/target_search_ds/NEW_3-20_Targets/COUNTS_COMBINED_top2000_pos_top22000_neg/score_maps/v1_200325/gt/val_in_4gsnet_score_map"
66
+ # OVERRIDE_GT_MASK_DIR = "/mnt/hdd/inat2021_ds/target_search_ds/NEW_3-20_Targets/COUNTS_COMBINED_top2000_pos_top22000_neg/score_maps/v1_200325/gt/val_out_4gsnet_score_map"
67
+
68
+ #######################################################################
69
+ # iNAT TTA
70
+ #######################################################################
71
+
72
+ # Query Params
73
+ QUERY_TAX = getenv("QUERY_TAX", default="", cast_type=str) # "" = Test all tax
74
+ # QUERY_TAX = "Animalia Chordata Mammalia Rodentia" # search_val_in
75
+ # QUERY_TAX = "Animalia Chordata Mammalia Artiodactyla" # search_val_in
76
+ # QUERY_TAX = "Animalia Arthropoda Arachnida Araneae" # search_val_out
77
+ # QUERY_TAX = "Plantae Tracheophyta Magnoliopsida Caryophyllales" # search_val_out
78
+
79
+ # TTA PARAMS
80
+ EXECUTE_TTA = getenv("EXECUTE_TTA", default=True, cast_type=bool) # Whether to execute TTA mask updates
81
+ STEPS_PER_TTA = 20 # no. steps before each TTA series
82
+ NUM_TTA_STEPS = 1 # no. of TTA steps during each series
83
+ INITIAL_MODALITY = getenv("INITIAL_MODALITY", default="image", cast_type=str) # "image", "text", "combined"
84
+ MODALITY = getenv("MODALITY", default="image", cast_type=str) # "image", "text", "combined"
85
+ QUERY_VARIETY = getenv("QUERY_VARIETY", default=False, cast_type=bool) # whether to use a variety of queries during TTA
86
+ RESET_WEIGHTS = True
87
+ MIN_LR = 1e-6
88
+ MAX_LR = 1e-5 # 1e-5
89
+ GAMMA_EXPONENT = 2 # 2
90
+
91
+ # Paths related to taxabind (TRAIN w/ TARGETS)
92
+ TAXABIND_TTA = True # Whether to init TTA classes - FOR NOW: Always True
93
+ TAXABIND_IMG_DIR = '/mnt/hdd/inat2021_ds/inat21'
94
+ TAXABIND_IMO_DIR = '/mnt/hdd/inat2021_ds/sat_train_jpg_512px' # sat_test_jpg_256px, sat_test_jpg_512px
95
+ TAXABIND_INAT_JSON_PATH = '/mnt/hdd/inat2021_ds/inat21_train.json' # no filter needed
96
+ TAXABIND_FILTERED_INAT_JSON_PATH = '/mnt/hdd/inat2021_ds/2_OTHERS/inat21_filtered_pixel_clip_train.json' # no filter needed
97
+ # TAXABIND_SAT_TO_IMG_IDS_JSON_PATH = '/mnt/hdd/inat2021_ds/filtered_mapping_sat_to_img_ids_val.json'
98
+ TAXABIND_SAT_TO_IMG_IDS_JSON_PATH = getenv("TAXABIND_SAT_TO_IMG_IDS_JSON_PATH", default="/mnt/hdd/inat2021_ds/target_search_ds/NEW_3-20_Targets/FINAL_COUNTS_COMBINED_top4000_pos_top55000_neg/search_val_in.json", cast_type=str)
99
+ # TAXABIND_SAT_TO_IMG_IDS_JSON_PATH = "/mnt/hdd/inat2021_ds/target_search_ds/NEW_3-20_Targets/COUNTS_COMBINED_top2000_pos_top22000_neg/search_val_in.json"
100
+ # TAXABIND_SAT_TO_IMG_IDS_JSON_PATH = "/mnt/hdd/inat2021_ds/target_search_ds/NEW_3-20_Targets/COUNTS_COMBINED_top2000_pos_top22000_neg/search_val_out.json"
101
+ TAXABIND_PATCH_SIZE=14
102
+ TAXABIND_SAT_CHECKPOINT_PATH="/home/user/Taxabind/TaxaBind/SatBind/checkpoints/BACKUP/512px_search_ds_filtered/pixel_clip_512px_search_ds_100625_CLIP-L-336_FINAL_SPLIT_LARGE_BUGFIX_CLIP_TRAIN_CORRECT_VAL_IN_TAX_FILTER_TGT_ONLY/satbind-epoch=02-val_loss=2.50_BACKUP.ckpt" # "/home/user/Taxabind/TaxaBind/SatBind/checkpoints/BACKUP/512px_search_ds_filtered/pixel_clip_512px_search_ds_070425_CLIP-L-336_FINAL_SPLIT_LARGE/satbind-epoch=02-val_loss=2.48-BACKUP.ckpt"
103
+ TAXABIND_GAUSSIAN_BLUR_KERNEL = (5,5)
104
+ TAXABIND_SAMPLE_INDEX = 8 # DEBUG (Starting point) 5, 6, 8
105
+
106
+ # Sound
107
+ TAXABIND_SOUND_DATA_PATH = '/mnt/hdd/inat2021_ds/2_OTHERS/sound_test'
108
+ TAXABIND_SOUND_CHECKPOINT_PATH = "/home/user/Taxabind/TaxaBind/SoundBind/checkpoints/BUGFIX_CLIP_TRAIN_CORRECT_without_out_domain_taxs_v4_220625/soundbind-epoch=19-val_loss=3.92_BACKUP.ckpt"
109
+
110
+ # # Paths related to taxabind (TRAIN w/ TARGETS)
111
+ # TAXABIND_TTA = True
112
+ # TAXABIND_IMG_DIR = '/mnt/hdd/inat2021_ds/inat21'
113
+ # TAXABIND_IMO_DIR = '/mnt/hdd/inat2021_ds/sat_train_jpg_512px' # sat_test_jpg_256px, sat_test_jpg_512px
114
+ # TAXABIND_INAT_JSON_PATH = '/mnt/hdd/inat2021_ds/inat21_train.json' # no filter needed
115
+ # TAXABIND_FILTERED_INAT_JSON_PATH = '/mnt/hdd/inat2021_ds/2_OTHERS/inat21_filtered_pixel_clip_train.json' # no filter needed
116
+ # # TAXABIND_SAT_TO_IMG_IDS_JSON_PATH = '/mnt/hdd/inat2021_ds/filtered_mapping_sat_to_img_ids_val.json'
117
+ # TAXABIND_SAT_TO_IMG_IDS_JSON_PATH = "/mnt/hdd/inat2021_ds/target_search_ds/OLD/taxon_sat_target_search_100x_per_10-20counts.json"
118
+ # TAXABIND_PATCH_SIZE=14
119
+ # TAXABIND_SAT_CHECKPOINT_PATH="/home/user/Taxabind/TaxaBind/SatBind/checkpoints/BACKUP/512px/pixel_clip_512px_190225_CLIP-L-336/satbind-epoch=02-val_loss=2.40.ckpt" # /home/user/Taxabind/TaxaBind/SatBind/checkpoints/BACKUP/512px/pixel_clip_512px_160225_NO_DATASET_SHUFFLE/satbind-epoch=02-val_loss=2.26-BACKUP.ckpt
120
+ # TAXABIND_SAMPLE_INDEX = 99 # (Starting point) 99,141
121
+
122
+ # # Paths related to taxabind (VAL)
123
+ # TAXABIND_TTA = True
124
+ # TAXABIND_IMG_DIR = '/mnt/hdd/inat2021_ds/inat21'
125
+ # TAXABIND_IMO_DIR = '/mnt/hdd/inat2021_ds/sat_test_jpg_512px' # sat_test_jpg_256px, sat_test_jpg_512px
126
+ # TAXABIND_INAT_JSON_PATH = '/mnt/hdd/inat2021_ds/inat21_val.json' # no filter needed
127
+ # TAXABIND_FILTERED_INAT_JSON_PATH = '/mnt/hdd/inat2021_ds/inat21_filtered_pixel_clip_val.json' # no filter needed
128
+ # # TAXABIND_SAT_TO_IMG_IDS_JSON_PATH = '/mnt/hdd/inat2021_ds/filtered_mapping_sat_to_img_ids_val.json'
129
+ # TAXABIND_SAT_TO_IMG_IDS_JSON_PATH = "/mnt/hdd/inat2021_ds/target_search_ds/taxon_sat_target_search_100x_per_10-20counts.json"
130
+ # TAXABIND_PATCH_SIZE=14
131
+ # TAXABIND_SAT_CHECKPOINT_PATH="/home/user/Taxabind/TaxaBind/SatBind/checkpoints/BACKUP/512px/pixel_clip_512px_190225_CLIP-L-336/satbind-epoch=02-val_loss=2.40.ckpt" # /home/user/Taxabind/TaxaBind/SatBind/checkpoints/BACKUP/512px/pixel_clip_512px_160225_NO_DATASET_SHUFFLE/satbind-epoch=02-val_loss=2.26-BACKUP.ckpt
132
+ # TAXABIND_SAMPLE_INDEX = 45 # TEMP
133
+
134
+ #######################################################################
135
+ # Pretraining
136
+ #######################################################################
137
+
138
+ # TODO: Get rid of the LISA stuff...
139
+ # If LISA trained clss
140
+ GRIDMAP_SET_DIR = "Maps/flair_real_maps/envs_val_trained_clss"
141
+ MASK_SET_DIR = "Maps/flair_real_maps_lisa_pred/finetuned_LISA_v3_original_losses/flair_lisa_soft_masks_trained_clss_v3" # original_LISA, finetuned_LISA_v3_original_losses
142
+ TARGETS_SET_DIR = "Maps/flair_real_maps/masks_val_trained_clss" # If empty, then targets assumed to be on MASK_SET_DIR
143
+ RAW_IMG_PATH_DICT = "Maps/flair_real_maps/flair-ds-paths-filtered-with-scores-val-trained-clss.csv" # flair-ds-paths-filtered-with-scores-train.csv, flair-ds-paths-filtered-with-scores-val-trained-clss.csv, flair-ds-paths-filtered-with-scores-val-out-clss.csv
144
+
145
+ # # If LISA out clss
146
+ # GRIDMAP_SET_DIR = "Maps/flair_real_maps/envs_val_out_clss"
147
+ # MASK_SET_DIR = "Maps/flair_real_maps_lisa_pred/finetuned_LISA_v3_original_losses/flair_lisa_soft_masks_out_clss_v3" # original_LISA, finetuned_LISA_v3_original_losses
148
+ # TARGETS_SET_DIR = "Maps/flair_real_maps/masks_val_out_clss" # If empty, then targets assumed to be on MASK_SET_DIR
149
+ # RAW_IMG_PATH_DICT = "Maps/flair_real_maps/flair-ds-paths-filtered-with-scores-val-out-clss.csv" # flair-ds-paths-filtered-with-scores-train.csv, flair-ds-paths-filtered-with-scores-val-trained-clss.csv, flair-ds-paths-filtered-with-scores-val-out-clss.csv
150
+
151
+ #######################################################################
152
+
153
+ NUM_ROBOTS = 1
154
+ NUM_COORDS_WIDTH=24 # How many node coords across width?
155
+ NUM_COORDS_HEIGHT=24 # How many node coords across height?
156
+ HIGH_INFO_REWARD_RATIO = 0.75 # Ratio of rewards for moving to uncertain area (high info vs low info)
157
+
158
+ SENSOR_RANGE=80 # Only applicable to 'circle' sensor model
159
+ SENSOR_MODEL="rectangular" # "rectangular", "circle" (NOTE: no collision check for rectangular)
160
+
161
+ INPUT_DIM = 4
162
+ EMBEDDING_DIM = 128
163
+ K_SIZE = 8 # 8
164
+
165
+ USE_GPU = True # do you want to use GPUS?
166
+ NUM_GPU = getenv("NUM_GPU", default=2, cast_type=int) # the number of GPUs
167
+ NUM_META_AGENT = getenv("NUM_META_AGENT", default=4, cast_type=int) # the number of processes
168
+ FOLDER_NAME = 'inference'
169
+ model_path = f'{FOLDER_NAME}/model'
170
+ gifs_path = f'{FOLDER_NAME}/test_results/gifs'
171
+ trajectory_path = f'{FOLDER_NAME}/test_results/trajectory'
172
+ length_path = f'{FOLDER_NAME}/test_results/length'
173
+ log_path = f'{FOLDER_NAME}/test_results/log'
174
+ CSV_EXPT_NAME = getenv("CSV_EXPT_NAME", default="data", cast_type=str)
175
+ # trajectory_path = f'results/trajectory'
176
+ # length_path = f'results/length'
177
+
178
+ # COLORS (for printing)
179
+ RED='\033[1;31m'
180
+ GREEN='\033[1;32m'
181
+ YELLOW='\033[1;93m'
182
+ NC_BOLD='\033[1m' # Bold, No Color
183
+ NC='\033[0m' # No Color