Update model to latest code (#6)

- Update return types (101e5336b8ef0f7d11a9377bde632bc50c9a8b91)
- Remove unused transform (aa4749c4e268dd893a359c201ea8b7a401a3eebd)
- Sync model code with repo code (0dc766bbae9eb38e1bdaf2b97a1615b54386abf1)
- Remove unused code (f3261e5b29172da8b5c983f869b3c894dd65d516)
- Update README (e8c0181afd4a9db194a21a9bbd3b49e0197b58f3)

Files changed (6) hide show

README.md +11 -12
config.json +1 -0
configuration_cased.py +8 -3
modeling_cased.py +99 -199
retrieval_cased.py +278 -0
transforms_cased.py +22 -82

README.md CHANGED Viewed

@@ -1,12 +1,13 @@
 ---
 pipeline_tag: image-classification
 tags:
-- vision
 inference: false
 widget:
-- src: https://huggingface.co/datasets/mishig/sample_images/resolve/main/cat-dog-music.png
-  example_title: Cat & Dog
 ---
 # Category Search from External Databases (CaSED)
 Disclaimer: The model card is taken and modified from the official repository, which can be found [here](https://github.com/altndrr/vic). The paper can be found [here](https://arxiv.org/abs/2306.00917).
@@ -34,11 +35,11 @@ processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")
 # get the model outputs
 images = processor(images=[image], return_tensors="pt", padding=True)
-outputs = model(images, alpha=0.5)
 labels, scores = outputs["vocabularies"][0], outputs["scores"][0]
 # print the top 5 most likely labels for the image
-values, indices = scores.topk(5)
 print("\nTop predictions:\n")
 for value, index in zip(values, indices):
     print(f"{labels[index]:>16s}: {100 * value.item():.2f}%")
@@ -47,18 +48,16 @@ for value, index in zip(values, indices):
 The model depends on some libraries you have to install manually before execution:
 ```bash
-pip install torch faiss-cpu flair inflect nltk transformers
 ```
 ## Citation
 ```latex
-@misc{conti2023vocabularyfree,
-      title={Vocabulary-free Image Classification},
       author={Alessandro Conti and Enrico Fini and Massimiliano Mancini and Paolo Rota and Yiming Wang and Elisa Ricci},
       year={2023},
-      eprint={2306.00917},
-      archivePrefix={arXiv},
-      primaryClass={cs.CV}
 }
-```

 ---
 pipeline_tag: image-classification
 tags:
+  - vision
 inference: false
 widget:
+  - src: https://huggingface.co/datasets/mishig/sample_images/resolve/main/cat-dog-music.png
+    example_title: Cat & Dog
 ---
 # Category Search from External Databases (CaSED)
 Disclaimer: The model card is taken and modified from the official repository, which can be found [here](https://github.com/altndrr/vic). The paper can be found [here](https://arxiv.org/abs/2306.00917).
 # get the model outputs
 images = processor(images=[image], return_tensors="pt", padding=True)
+outputs = model(images, alpha=0.7)
 labels, scores = outputs["vocabularies"][0], outputs["scores"][0]
 # print the labels for the image, from the most to the least likely
+values, indices = scores.sort(dim=-1, descending=True)
 print("\nTop predictions:\n")
 for value, index in zip(values, indices):
     print(f"{labels[index]:>16s}: {100 * value.item():.2f}%")
 The model depends on some libraries you have to install manually before execution:
 ```bash
+pip install torch faiss-cpu flair inflect nltk pyarrow transformers
 ```
 ## Citation
 ```latex
+@article{conti2023vocabularyfree,
+      title={Vocabulary-free Image Classification},
       author={Alessandro Conti and Enrico Fini and Massimiliano Mancini and Paolo Rota and Yiming Wang and Elisa Ricci},
       year={2023},
+      journal={NeurIPS},
 }
+```

config.json CHANGED Viewed

@@ -7,6 +7,7 @@
     "AutoConfig": "configuration_cased.CaSEDConfig",
     "AutoModel": "modeling_cased.CaSEDModel"
   },
   "index_name": "cc12m",
   "model_type": "cased",
   "retrieval_num_results": 10,

     "AutoConfig": "configuration_cased.CaSEDConfig",
     "AutoModel": "modeling_cased.CaSEDModel"
   },
+  "cache_dir": "~/.cache/cased",
   "index_name": "cc12m",
   "model_type": "cased",
   "retrieval_num_results": 10,

configuration_cased.py CHANGED Viewed

@@ -1,3 +1,5 @@
 from transformers.modeling_utils import PretrainedConfig
@@ -5,9 +7,10 @@ class CaSEDConfig(PretrainedConfig):
     """Configuration class for CaSED.
     Args:
-        index_name (str, optional): Name of the index. Defaults to "cc12m".
-        alpha (float, optional): Weight of the vision loss. Defaults to 0.5.
-        retrieval_num_results (int, optional): Number of results to return. Defaults to 10.
     """
     model_type = "cased"
@@ -18,9 +21,11 @@ class CaSEDConfig(PretrainedConfig):
         index_name: str = "cc12m",
         alpha: float = 0.5,
         retrieval_num_results: int = 10,
         **kwargs,
     ):
         super().__init__(**kwargs)
         self.index_name = index_name
         self.alpha = alpha
         self.retrieval_num_results = retrieval_num_results

+import os
 from transformers.modeling_utils import PretrainedConfig
     """Configuration class for CaSED.
     Args:
+        index_name (str): Name of the index. Defaults to "cc12m".
+        alpha (float): Weight of the image predictions when they are averaged with the
+            text predictions. Defaults to 0.5.
+        retrieval_num_results (int): Number of results to return. Defaults to 10.
+        cache_dir (str): Path to cache directory. Defaults to "~/.cache/cased".
     """
     model_type = "cased"
         index_name: str = "cc12m",
         alpha: float = 0.5,
         retrieval_num_results: int = 10,
+        cache_dir: str = os.path.expanduser("~/.cache/cased"),
         **kwargs,
     ):
         super().__init__(**kwargs)
         self.index_name = index_name
         self.alpha = alpha
         self.retrieval_num_results = retrieval_num_results
+        self.cache_dir = cache_dir

modeling_cased.py CHANGED Viewed

@@ -1,66 +1,21 @@
 import os
-import tarfile
-from pathlib import Path
-from typing import Optional
-import faiss
 import numpy as np
-import pyarrow as pa
-import requests
 import torch
-from tqdm import tqdm
 from transformers import CLIPModel, CLIPProcessor
 from transformers.modeling_utils import PreTrainedModel
 from .configuration_cased import CaSEDConfig
 from .transforms_cased import default_vocabulary_transforms
-DATABASES = {
-    "cc12m": {
-        "url": "https://storage-cased.alessandroconti.me/cc12m.tar.gz",
-        "cache_subdir": "./cc12m/vit-l-14/",
-    },
-}
-class MetadataProvider:
-    """Metadata provider.
-    It uses arrow files to store metadata and retrieve it efficiently.
-    Code reference:
-        - https://github.dev/rom1504/clip-retrieval
-    """
-    def __init__(self, arrow_folder: Path):
-        arrow_files = [str(a) for a in sorted(arrow_folder.glob("**/*")) if a.is_file()]
-        self.table = pa.concat_tables(
-            [
-                pa.ipc.RecordBatchFileReader(pa.memory_map(arrow_file, "r")).read_all()
-                for arrow_file in arrow_files
-            ]
-        )
-    def get(self, ids: np.ndarray, cols: Optional[list] = None):
-        """Get arrow metadata from ids.
-        Args:
-            ids (np.ndarray): Ids to retrieve.
-            cols (Optional[list], optional): Columns to retrieve. Defaults to None.
-        """
-        if cols is None:
-            cols = self.table.schema.names
-        else:
-            cols = list(set(self.table.schema.names) & set(cols))
-        t = pa.concat_tables([self.table[i:j] for i, j in zip(ids, ids + 1)])
-        return t.select(cols).to_pandas().to_dict("records")
 class CaSEDModel(PreTrainedModel):
     """Transformers module for Category Search from External Databases (CaSED).
     Reference:
-        - Conti et al. Vocabulary-free Image Classification. arXiv 2023.
     Args:
         config (CaSEDConfig): Configuration class for CaSED.
@@ -80,125 +35,60 @@ class CaSEDModel(PreTrainedModel):
         self.logit_scale = model.logit_scale.exp()
         self.processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")
-        # load transforms
-        self.vocabulary_transforms = default_vocabulary_transforms()
         # set hparams
         self.hparams = {}
         self.hparams["alpha"] = config.alpha
         self.hparams["index_name"] = config.index_name
         self.hparams["retrieval_num_results"] = config.retrieval_num_results
-        # set cache dir
-        self.hparams["cache_dir"] = Path(os.path.expanduser("~/.cache/cased"))
         os.makedirs(self.hparams["cache_dir"], exist_ok=True)
-        # download databases
-        self.prepare_data()
-        # load faiss indices and metadata providers
-        self.resources = {}
-        for name, items in DATABASES.items():
-            database_path = self.hparams["cache_dir"] / "databases" / items["cache_subdir"]
-            text_index_fp = database_path / "text.index"
-            metadata_fp = database_path / "metadata/"
-            text_index = faiss.read_index(
-                str(text_index_fp), faiss.IO_FLAG_MMAP | faiss.IO_FLAG_READ_ONLY
-            )
-            metadata_provider = MetadataProvider(metadata_fp)
-            self.resources[name] = {
-                "device": self.device,
-                "model": "ViT-L-14",
-                "text_index": text_index,
-                "metadata_provider": metadata_provider,
-            }
-    def prepare_data(self):
-        """Download data if needed."""
-        databases_path = Path(self.hparams["cache_dir"]) / "databases"
-        for name, items in DATABASES.items():
-            url = items["url"]
-            database_path = Path(databases_path, name)
-            if database_path.exists():
-                continue
-            # download data
-            target_path = Path(databases_path, name + ".tar.gz")
-            os.makedirs(target_path.parent, exist_ok=True)
-            with requests.get(url, stream=True) as r:
-                r.raise_for_status()
-                total_bytes_size = int(r.headers.get('content-length', 0))
-                chunk_size = 8192
-                p_bar = tqdm(
-                    desc="Downloading cc12m index",
-                    total=total_bytes_size,
-                    unit='iB',
-                    unit_scale=True,
-                )
-                with open(target_path, 'wb') as f:
-                    for chunk in r.iter_content(chunk_size=chunk_size):
-                        f.write(chunk)
-                        p_bar.update(len(chunk))
-                p_bar.close()
-            # extract data
-            tar = tarfile.open(target_path, "r:gz")
-            tar.extractall(target_path.parent)
-            tar.close()
-            target_path.unlink()
-    @torch.no_grad()
-    def query_index(self, sample_z: torch.Tensor) -> torch.Tensor:
-        """Query the external database index.
         Args:
-            sample_z (torch.Tensor): Sample to query the index.
         """
-        # get the index
-        resources = self.resources[self.hparams["index_name"]]
-        text_index = resources["text_index"]
-        metadata_provider = resources["metadata_provider"]
-        # query the index
-        sample_z = sample_z.squeeze(0)
-        sample_z = sample_z / sample_z.norm(dim=-1, keepdim=True)
-        query_input = sample_z.cpu().detach().numpy().tolist()
-        query = np.expand_dims(np.array(query_input).astype("float32"), 0)
-        distances, idxs, _ = text_index.search_and_reconstruct(
-            query, self.hparams["retrieval_num_results"]
-        )
-        results = idxs[0]
-        nb_results = np.where(results == -1)[0]
-        nb_results = nb_results[0] if len(nb_results) > 0 else len(results)
-        indices = results[:nb_results]
-        distances = distances[0][:nb_results]
-        if len(distances) == 0:
-            return []
-        # get the metadata
-        results = []
-        metadata = metadata_provider.get(indices[:20], ["caption"])
-        for key, (d, i) in enumerate(zip(distances, indices)):
-            output = {}
-            meta = None if key + 1 > len(metadata) else metadata[key]
-            if meta is not None:
-                output.update(meta)
-            output["id"] = i.item()
-            output["similarity"] = d.item()
-            results.append(output)
-        # get the captions only
-        vocabularies = [result["caption"] for result in results]
         return vocabularies
-    @torch.no_grad()
-    def forward(self, images: dict, alpha: Optional[float] = None) -> torch.Tensor():
         """Forward pass.
         Args:
@@ -206,52 +96,62 @@ class CaSEDModel(PreTrainedModel):
                 - pixel_values (torch.Tensor): Pixel values of the images.
             alpha (Optional[float]): Alpha value for the interpolation.
         """
         # forward the images
         images["pixel_values"] = images["pixel_values"].to(self.device)
         images_z = self.vision_proj(self.vision_encoder(**images)[1])
-        vocabularies, samples_p = [], []
-        for image_z in images_z:
-            image_z = image_z.unsqueeze(0)
-            # generate a single text embedding from the unfiltered vocabulary
-            vocabulary = self.query_index(image_z)
-            text = self.processor(text=vocabulary, return_tensors="pt", padding=True)
-            text["input_ids"] = text["input_ids"][:, :77].to(self.device)
-            text["attention_mask"] = text["attention_mask"][:, :77].to(self.device)
-            text_z = self.language_encoder(**text)[1]
-            text_z = self.language_proj(text_z)
-            text_z = text_z / text_z.norm(dim=-1, keepdim=True)
-            text_z = text_z.mean(dim=0).unsqueeze(0)
-            text_z = text_z / text_z.norm(dim=-1, keepdim=True)
-            # filter the vocabulary, embed it, and get its mean embedding
-            vocabulary = self.vocabulary_transforms(vocabulary) or ["object"]
-            text = self.processor(text=vocabulary, return_tensors="pt", padding=True)
-            text = {k: v.to(self.device) for k, v in text.items()}
-            vocabulary_z = self.language_encoder(**text)[1]
-            vocabulary_z = self.language_proj(vocabulary_z)
-            vocabulary_z = vocabulary_z / vocabulary_z.norm(dim=-1, keepdim=True)
-            # get the image and text predictions
-            image_z = image_z / image_z.norm(dim=-1, keepdim=True)
-            text_z = text_z / text_z.norm(dim=-1, keepdim=True)
-            image_p = (self.logit_scale * image_z @ vocabulary_z.T).softmax(dim=-1)
-            text_p = (self.logit_scale * text_z @ vocabulary_z.T).softmax(dim=-1)
-            # average the image and text predictions
-            alpha = alpha or self.hparams["alpha"]
-            sample_p = alpha * image_p + (1 - alpha) * text_p
-            # save the results
-            samples_p.append(sample_p)
-            vocabularies.append(vocabulary)
-        # get the scores
-        samples_p = torch.stack(samples_p, dim=0)
-        scores = sample_p.cpu()
-        # define the results
-        results = {"vocabularies": vocabularies, "scores": scores}
-        return results

 import os
+from typing import Callable, Optional
 import numpy as np
 import torch
 from transformers import CLIPModel, CLIPProcessor
 from transformers.modeling_utils import PreTrainedModel
 from .configuration_cased import CaSEDConfig
+from .retrieval_cased import RetrievalDatabase, download_retrieval_databases
 from .transforms_cased import default_vocabulary_transforms
 class CaSEDModel(PreTrainedModel):
     """Transformers module for Category Search from External Databases (CaSED).
     Reference:
+        - Conti et al. Vocabulary-free Image Classification. NeurIPS 2023.
     Args:
         config (CaSEDConfig): Configuration class for CaSED.
         self.logit_scale = model.logit_scale.exp()
         self.processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")
         # set hparams
         self.hparams = {}
         self.hparams["alpha"] = config.alpha
         self.hparams["index_name"] = config.index_name
         self.hparams["retrieval_num_results"] = config.retrieval_num_results
+        self.hparams["cache_dir"] = config.cache_dir
+        # create cache dir
         os.makedirs(self.hparams["cache_dir"], exist_ok=True)
+        # download data
+        download_retrieval_databases(cache_dir=self.hparams["cache_dir"])
+        # setup vocabulary
+        self.vocabulary = RetrievalDatabase("cc12m", self.hparams["cache_dir"])
+        self._vocab_transform = default_vocabulary_transforms()
+    @property
+    def vocab_transform(self) -> Callable:
+        """Get image preprocess transform.
+        The getter wraps the transform in a map_reduce function and applies it to a list of images.
+        If interested in the transform itself, use `self._vocab_transform`.
+        """
+        vocab_transform = self._vocab_transform
+        def vocabs_transforms(texts: list[str]) -> list[torch.Tensor]:
+            return [vocab_transform(text) for text in texts]
+        return vocabs_transforms
+    def get_vocabulary(self, images_z: Optional[torch.Tensor] = None) -> list[list[str]]:
+        """Get the vocabulary for a batch of images.
         Args:
+            images_z (torch.Tensor): Batch of image embeddings.
         """
+        num_samples = self.hparams["retrieval_num_results"]
+        assert images_z is not None
+        images_z = images_z / images_z.norm(dim=-1, keepdim=True)
+        images_z = images_z.cpu().detach().numpy().tolist()
+        if isinstance(images_z[0], float):
+            images_z = [images_z]
+        query = np.matrix(images_z).astype("float32")
+        results = self.vocabulary.query(query, modality="text", num_samples=num_samples)
+        vocabularies = [[r["caption"] for r in result] for result in results]
         return vocabularies
+    def forward(self, images: dict, alpha: Optional[float] = None) -> torch.Tensor:
         """Forward pass.
         Args:
                 - pixel_values (torch.Tensor): Pixel values of the images.
             alpha (Optional[float]): Alpha value for the interpolation.
         """
+        alpha = alpha or self.hparams["alpha"]
         # forward the images
         images["pixel_values"] = images["pixel_values"].to(self.device)
         images_z = self.vision_proj(self.vision_encoder(**images)[1])
+        images_z = images_z / images_z.norm(dim=-1, keepdim=True)
+        vocabularies = self.get_vocabulary(images_z=images_z)
+        # encode unfiltered words
+        unfiltered_words = sum(vocabularies, [])
+        texts_z = self.processor(unfiltered_words, return_tensors="pt", padding=True)
+        texts_z["input_ids"] = texts_z["input_ids"][:, :77].to(self.device)
+        texts_z["attention_mask"] = texts_z["attention_mask"][:, :77].to(self.device)
+        texts_z = self.language_encoder(**texts_z)[1]
+        texts_z = self.language_proj(texts_z)
+        texts_z = texts_z / texts_z.norm(dim=-1, keepdim=True)
+        # generate a text embedding for each image from their unfiltered words
+        unfiltered_words_per_image = [len(vocab) for vocab in vocabularies]
+        texts_z = torch.split(texts_z, unfiltered_words_per_image)
+        texts_z = torch.stack([text_z.mean(dim=0) for text_z in texts_z])
+        texts_z = texts_z / texts_z.norm(dim=-1, keepdim=True)
+        # filter the words and embed them
+        vocabularies = self.vocab_transform(vocabularies)
+        vocabularies = [vocab or ["object"] for vocab in vocabularies]
+        words = sum(vocabularies, [])
+        words_z = self.processor(words, return_tensors="pt", padding=True)
+        words_z = {k: v.to(self.device) for k, v in words_z.items()}
+        words_z = self.language_encoder(**words_z)[1]
+        words_z = self.language_proj(words_z)
+        words_z = words_z / words_z.norm(dim=-1, keepdim=True)
+        # create a one-hot relation mask between images and words
+        words_per_image = [len(vocab) for vocab in vocabularies]
+        col_indices = torch.arange(sum(words_per_image))
+        row_indices = torch.arange(len(images_z)).repeat_interleave(torch.tensor(words_per_image))
+        mask = torch.zeros(len(images_z), sum(words_per_image), device=self.device)
+        mask[row_indices, col_indices] = 1
+        # get the image and text similarities
+        images_z = images_z / images_z.norm(dim=-1, keepdim=True)
+        texts_z = texts_z / texts_z.norm(dim=-1, keepdim=True)
+        words_z = words_z / words_z.norm(dim=-1, keepdim=True)
+        images_sim = self.logit_scale * images_z @ words_z.T
+        texts_sim = self.logit_scale * texts_z @ words_z.T
+        # mask unrelated words
+        images_sim = torch.masked_fill(images_sim, mask == 0, float("-inf"))
+        texts_sim = torch.masked_fill(texts_sim, mask == 0, float("-inf"))
+        # get the image and text predictions
+        images_p = images_sim.softmax(dim=-1)
+        texts_p = texts_sim.softmax(dim=-1)
+        # average the image and text predictions
+        samples_p = alpha * images_p + (1 - alpha) * texts_p
+        return {"scores": samples_p, "words": words, "vocabularies": vocabularies}

retrieval_cased.py ADDED Viewed

	@@ -0,0 +1,278 @@

+import tarfile
+from collections import defaultdict
+from pathlib import Path
+import faiss
+import numpy as np
+import pyarrow as pa
+import requests
+from tqdm import tqdm
+__all__ = ["RetrievalDatabase", "download_retrieval_databases"]
+RETRIEVAL_DATABASES_URLS = {
+    "cc12m": {
+        "url": "https://storage-cased.alessandroconti.me/cc12m.tar.gz",
+        "cache_subdir": "./cc12m/vit-l-14/",
+    },
+}
def download_retrieval_databases(cache_dir: str = "~/.cache/cased"):
    """Download and extract the retrieval databases that are not in the cache yet.

    A database is considered available as soon as `<cache_dir>/databases/<name>` exists
    (only the directory is checked, not its content); nothing is downloaded for it then.

    Args:
        cache_dir (str): Path to cache directory. Defaults to "~/.cache/cased". The path is
            used as given (`~` is not expanded here).

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    databases_path = Path(cache_dir, "databases")

    for name, items in RETRIEVAL_DATABASES_URLS.items():
        if Path(databases_path, name).exists():
            continue

        target_path = Path(databases_path, name + ".tar.gz")
        target_path.parent.mkdir(parents=True, exist_ok=True)

        try:
            # download data
            with requests.get(items["url"], stream=True) as r:
                r.raise_for_status()
                total_bytes_size = int(r.headers.get("content-length", 0))
                with open(target_path, "wb") as f, tqdm(
                    desc=f"Downloading {name} index",
                    total=total_bytes_size,
                    unit="iB",
                    unit_scale=True,
                ) as p_bar:
                    for chunk in r.iter_content(chunk_size=8192):
                        f.write(chunk)
                        p_bar.update(len(chunk))

            # extract data
            with tarfile.open(target_path, "r:gz") as tar:
                if hasattr(tarfile, "data_filter"):
                    # the archive comes from the network: refuse members that would be
                    # written outside of the destination directory
                    tar.extractall(target_path.parent, filter="data")
                else:
                    # NOTE(review): Python versions without extraction filters extract the
                    # archive unchecked
                    tar.extractall(target_path.parent)
        finally:
            # never leave the (possibly partial) archive behind
            target_path.unlink(missing_ok=True)
class RetrievalDatabaseMetadataProvider:
    """Metadata provider for the retrieval database.

    Every file found (recursively) under the metadata directory is memory-mapped and read
    with an Arrow `RecordBatchFileReader`; the resulting tables are concatenated, in sorted
    path order, into a single table.

    Args:
        metadata_dir (str): Path to the metadata directory.
    """

    def __init__(self, metadata_dir: str):
        metadatas = [str(a) for a in sorted(Path(metadata_dir).glob("**/*")) if a.is_file()]
        self.table = pa.concat_tables(
            [
                pa.ipc.RecordBatchFileReader(pa.memory_map(metadata, "r")).read_all()
                for metadata in metadatas
            ]
        )

    def get(self, ids) -> list:
        """Get the metadata for the given ids.

        Args:
            ids (list): List of ids, used as row positions in the concatenated table.

        Returns:
            list[dict]: One record (column name -> value) per id, in the order of `ids`.
        """
        # nothing to look up, and `pa.concat_tables` cannot be called without tables
        if len(ids) == 0:
            return []

        columns = self.table.schema.names
        rows = pa.concat_tables([self.table[i : i + 1] for i in ids])

        return rows.select(columns).to_pandas().to_dict("records")
class RetrievalDatabase:
    """Retrieval database.

    Loads, from `<cache_dir>/databases/<cache_subdir>`, the `image.index` and `text.index`
    faiss indices (each one only if its file exists) and the `metadata` folder.

    Args:
        database_name (str): Name of the database.
        cache_dir (str): Path to cache directory. Defaults to "~/.cache/cased". The path is
            used as given (`~` is not expanded here).

    Raises:
        ValueError: If `database_name` is not a known database.
    """

    def __init__(self, database_name: str, cache_dir: str = "~/.cache/cased"):
        # explicit check instead of `assert`, which is stripped when running with `-O`
        if database_name not in RETRIEVAL_DATABASES_URLS:
            raise ValueError(
                f"Database name should be one of "
                f"{list(RETRIEVAL_DATABASES_URLS.keys())}, got {database_name}."
            )

        database_dir = Path(cache_dir) / "databases"
        database_dir = database_dir / RETRIEVAL_DATABASES_URLS[database_name]["cache_subdir"]
        self._database_dir = database_dir

        self._image_index = self._read_index(database_dir / "image.index")
        self._text_index = self._read_index(database_dir / "text.index")

        metadata_dir = str(database_dir / "metadata")
        self._metadata_provider = RetrievalDatabaseMetadataProvider(metadata_dir)

    @staticmethod
    def _read_index(index_fp: Path):
        """Read a faiss index as a read-only memory map, or return None if it is missing.

        Args:
            index_fp (Path): Path to the index file.
        """
        if not index_fp.exists():
            return None

        return faiss.read_index(str(index_fp), faiss.IO_FLAG_MMAP | faiss.IO_FLAG_READ_ONLY)

    def _map_to_metadata(self, indices: list, distances: list, embs: list, num_images: int):
        """Map the indices to metadata.

        Args:
            indices (list): List of indices.
            distances (list): List of distances.
            embs (list): List of results embeddings.
            num_images (int): Maximum number of indices to fetch the metadata of.

        Returns:
            list[dict]: One dict per result with its metadata (when available) and the
                `id`, `similarity`, and `sample_z` keys.
        """
        # no results: skip the metadata lookup altogether
        if len(indices) == 0:
            return []

        metas = self._metadata_provider.get(indices[:num_images])

        results = []
        for key, (d, i, emb) in enumerate(zip(distances, indices, embs)):
            output = {}
            # results beyond the fetched metadata only carry id, similarity and embedding
            if key < len(metas):
                output.update(self._meta_to_dict(metas[key]))
            output["id"] = i.item()
            output["similarity"] = d.item()
            output["sample_z"] = emb.tolist()
            results.append(output)

        return results

    def _meta_to_dict(self, metadata):
        """Convert metadata to dict.

        Bytes are decoded to strings and numpy values are converted with `.item()`; any
        other value is kept as is.

        Args:
            metadata (dict): Metadata.
        """
        output = {}
        for k, v in metadata.items():
            if isinstance(v, bytes):
                v = v.decode()
            elif type(v).__module__ == np.__name__:
                v = v.item()
            output[k] = v

        return output

    def _get_connected_components(self, neighbors):
        """Find connected components in a graph.

        Args:
            neighbors (dict): Dictionary of neighbors.

        Returns:
            list[list]: One list of nodes per component. The first node of a component is
                the key from which the component was discovered.
        """
        seen = set()
        components = []

        # iterate over a snapshot of the keys and look neighbours up with `.get`, so that a
        # `defaultdict` is never grown while it is being iterated
        for start in list(neighbors):
            if start in seen:
                continue

            component = []
            frontier = {start}
            while frontier:
                node = frontier.pop()
                seen.add(node)
                frontier |= set(neighbors.get(node, ())) - seen
                component.append(node)
            components.append(component)

        return components

    def _deduplicate_embeddings(self, embeddings, threshold=0.94):
        """Deduplicate embeddings.

        Embeddings whose inner product is within the range-search threshold are grouped
        together; all the members of a group but its first one are reported.

        Args:
            embeddings (np.matrix): Embeddings to deduplicate.
            threshold (float): Threshold to use for deduplication. Default is 0.94.

        Returns:
            set[int]: Row positions of the embeddings considered duplicates.
        """
        index = faiss.IndexFlatIP(embeddings.shape[1])
        index.add(embeddings)
        # the matches of row `i` are `matches[limits[i]:limits[i + 1]]`
        limits, _, matches = index.range_search(embeddings, threshold)

        same_mapping = defaultdict(list)
        for i in range(embeddings.shape[0]):
            for j in matches[limits[i] : limits[i + 1]]:
                same_mapping[int(i)].append(int(j))

        groups = self._get_connected_components(same_mapping)

        return {e for group in groups for e in group[1:]}

    def query(
        self, query: np.ndarray, modality: str = "text", num_samples: int = 10
    ) -> list[list[dict]]:
        """Query the database.

        Args:
            query (np.ndarray): Queries to search, one per row (`np.matrix` is accepted too).
            modality (str): Modality to search. One of `image` or `text`. Defaults to `text`.
            num_samples (int): Number of samples to return. Default is 10.

        Returns:
            list[list[dict]]: For each query, its deduplicated results (possibly fewer than
                `num_samples`) as returned by `_map_to_metadata`.

        Raises:
            RuntimeError: If the index of the requested modality was not found on disk.
        """
        index = self._image_index if modality == "image" else self._text_index
        if index is None:
            name = "image" if modality == "image" else "text"
            raise RuntimeError(
                f"The {name} index of the database was not found in {self._database_dir}."
            )

        distances, indices, embeddings = index.search_and_reconstruct(query, num_samples)

        total_results = []
        for row_indices, row_distances, row_embeddings in zip(indices, distances, embeddings):
            # an index of -1 marks a missing result: keep what comes before the first one
            missing = np.where(row_indices == -1)[0]
            num_res = missing[0] if len(missing) > 0 else len(row_indices)
            if num_res == 0:
                total_results.append([])
                continue

            result_indices = row_indices[:num_res]
            result_distances = row_distances[:num_res]
            result_embeddings = row_embeddings[:num_res]

            # normalise embeddings (zero vectors are left untouched)
            l2 = np.atleast_1d(np.linalg.norm(result_embeddings, 2, -1))
            l2[l2 == 0] = 1
            result_embeddings = result_embeddings / np.expand_dims(l2, -1)

            # deduplicate embeddings
            local_indices_to_remove = self._deduplicate_embeddings(result_embeddings)
            indices_to_remove = {result_indices[k] for k in local_indices_to_remove}

            curr_indices = []
            curr_distances = []
            curr_embeddings = []
            for ind, dis, emb in zip(result_indices, result_distances, result_embeddings):
                if ind in indices_to_remove:
                    continue
                # also drops later results that share the id of a kept one
                indices_to_remove.add(ind)
                curr_indices.append(ind)
                curr_distances.append(dis)
                curr_embeddings.append(emb)

            total_results.append(
                self._map_to_metadata(curr_indices, curr_distances, curr_embeddings, num_samples)
            )

        return total_results

transforms_cased.py CHANGED Viewed

@@ -1,6 +1,6 @@
 import re
 from abc import ABC, abstractmethod
-from typing import Any, Union
 import inflect
 import nltk
@@ -17,7 +17,6 @@ __all__ = [
     "DropWords",
     "FilterPOS",
     "FrequencyMinWordCount",
-    "FrequencyTopK",
     "ReplaceSeparators",
     "ToLowercase",
     "ToSingular",
@@ -28,7 +27,7 @@ class BaseTextTransform(ABC):
     """Base class for string transforms."""
     @abstractmethod
-    def __call__(self, text: str):
         raise NotImplementedError
     def __repr__(self) -> str:
@@ -38,7 +37,7 @@ class BaseTextTransform(ABC):
 class DropFileExtensions(BaseTextTransform):
     """Remove file extensions from the input text."""
-    def __call__(self, text: str):
         """
         Args:
             text (str): Text to remove file extensions from.
@@ -51,7 +50,7 @@ class DropFileExtensions(BaseTextTransform):
 class DropNonAlpha(BaseTextTransform):
     """Remove non-alpha words from the input text."""
-    def __call__(self, text: str):
         """
         Args:
             text (str): Text to remove non-alpha words from.
@@ -72,7 +71,7 @@ class DropShortWords(BaseTextTransform):
         super().__init__()
         self.min_length = min_length
-    def __call__(self, text: str):
         """
         Args:
             text (str): Text to remove short words from.
@@ -92,7 +91,7 @@ class DropSpecialCharacters(BaseTextTransform):
     hyphen, period, apostrophe, or ampersand.
     """
-    def __call__(self, text: str):
         """
         Args:
             text (str): Text to remove special characters from.
@@ -108,7 +107,7 @@ class DropTokens(BaseTextTransform):
     Tokens are defined as strings enclosed in angle brackets, e.g. <token>.
     """
-    def __call__(self, text: str):
         """
         Args:
             text (str): Text to remove tokens from.
@@ -121,7 +120,7 @@ class DropTokens(BaseTextTransform):
 class DropURLs(BaseTextTransform):
     """Remove URLs from the input text."""
-    def __call__(self, text: str):
         """
         Args:
             text (str): Text to remove URLs from.
@@ -142,7 +141,7 @@ class DropWords(BaseTextTransform):
         self.words = words
         self.pattern = r"\b(?:{})\b".format("|".join(words))
-    def __call__(self, text: str):
         """
         Args:
             text (str): Text to remove words from.
@@ -161,14 +160,12 @@ class FilterPOS(BaseTextTransform):
     Args:
         tags (list): List of POS tags to remove.
         engine (str): POS tagger to use. Must be one of "nltk" or "flair". Defaults to "nltk".
-        keep_compound_nouns (bool): Whether to keep composed words. Defaults to True.
     """
-    def __init__(self, tags: list, engine: str = "nltk", keep_compound_nouns: bool = True) -> None:
         super().__init__()
         self.tags = tags
         self.engine = engine
-        self.keep_compound_nouns = keep_compound_nouns
         if engine == "nltk":
             nltk.download("averaged_perceptron_tagger", quiet=True)
@@ -177,7 +174,7 @@ class FilterPOS(BaseTextTransform):
         elif engine == "flair":
             self.tagger = SequenceTagger.load("flair/pos-english-fast").predict
-    def __call__(self, text: str):
         """
         Args:
             text (str): Text to remove words with specific POS tags from.
@@ -190,30 +187,6 @@ class FilterPOS(BaseTextTransform):
             self.tagger(sentence)
             text = " ".join([token.text for token in sentence.tokens if token.tag in self.tags])
-        if self.keep_compound_nouns:
-            compound_nouns = []
-            if self.engine == "nltk":
-                for i in range(len(word_tags) - 1):
-                    if word_tags[i][1] == "NN" and word_tags[i + 1][1] == "NN":
-                        # if they are the same word, skip
-                        if word_tags[i][0] == word_tags[i + 1][0]:
-                            continue
-                        compound_noun = word_tags[i][0] + "_" + word_tags[i + 1][0]
-                        compound_nouns.append(compound_noun)
-            elif self.engine == "flair":
-                for i in range(len(sentence.tokens) - 1):
-                    if sentence.tokens[i].tag == "NN" and sentence.tokens[i + 1].tag == "NN":
-                        # if they are the same word, skip
-                        if sentence.tokens[i].text == sentence.tokens[i + 1].text:
-                            continue
-                        compound_noun = sentence.tokens[i].text + "_" + sentence.tokens[i + 1].text
-                        compound_nouns.append(compound_noun)
-            text = " ".join([text, " ".join(compound_nouns)])
         return text
     def __repr__(self) -> str:
@@ -234,7 +207,7 @@ class FrequencyMinWordCount(BaseTextTransform):
         super().__init__()
         self.min_count = min_count
-    def __call__(self, text: str):
         """
         Args:
             text (str): Text to remove infrequent words from.
@@ -257,47 +230,10 @@ class FrequencyMinWordCount(BaseTextTransform):
         return f"{self.__class__.__name__}(min_count={self.min_count})"
-class FrequencyTopK(BaseTextTransform):
-    """Keep only the top k most frequent words in the input text.
-    In case of a tie, all words with the same count as the last word are kept.
-    Args:
-        top_k (int): Number of top words to keep.
-    """
-    def __init__(self, top_k: int) -> None:
-        super().__init__()
-        self.top_k = top_k
-    def __call__(self, text: str):
-        """
-        Args:
-            text (str): Text to remove infrequent words from.
-        """
-        if self.top_k < 1:
-            return text
-        words = text.split()
-        word_counts = {word: words.count(word) for word in words}
-        top_words = sorted(word_counts, key=word_counts.get, reverse=True)
-        # in case of a tie, keep all words with the same count
-        top_words = top_words[: self.top_k]
-        top_words = [word for word in top_words if word_counts[word] == word_counts[top_words[-1]]]
-        text = " ".join([word for word in words if word in top_words])
-        return text
-    def __repr__(self) -> str:
-        return f"{self.__class__.__name__}(top_k={self.top_k})"
 class ReplaceSeparators(BaseTextTransform):
     """Replace underscores and dashes with spaces."""
-    def __call__(self, text: str):
         """
         Args:
             text (str): Text to replace separators in.
@@ -313,7 +249,7 @@ class ReplaceSeparators(BaseTextTransform):
 class RemoveDuplicates(BaseTextTransform):
     """Remove duplicate words from the input text."""
-    def __call__(self, text: str):
         """
         Args:
             text (str): Text to remove duplicate words from.
@@ -337,7 +273,11 @@ class TextCompose:
     def __init__(self, transforms: list[BaseTextTransform]) -> None:
         self.transforms = transforms
-    def __call__(self, text: Union[str, list[str]]) -> Any:
         if isinstance(text, list):
             text = " ".join(text)
@@ -357,7 +297,7 @@ class TextCompose:
 class ToLowercase(BaseTextTransform):
     """Convert text to lowercase."""
-    def __call__(self, text: str):
         """
         Args:
             text (str): Text to convert to lowercase.
@@ -374,7 +314,7 @@ class ToSingular(BaseTextTransform):
         super().__init__()
         self.transform = inflect.engine().singular_noun
-    def __call__(self, text: str):
         """
         Args:
             text (str): Text to convert to singular form.
@@ -430,7 +370,7 @@ def default_vocabulary_transforms() -> TextCompose:
     transforms.append(ToSingular())
     transforms.append(DropWords(words=words_to_drop))
     transforms.append(FrequencyMinWordCount(min_count=2))
-    transforms.append(FilterPOS(tags=pos_tags, engine="flair", keep_compound_nouns=False))
     transforms.append(RemoveDuplicates())
     transforms = TextCompose(transforms)

 import re
 from abc import ABC, abstractmethod
+from typing import Union
 import inflect
 import nltk
     "DropWords",
     "FilterPOS",
     "FrequencyMinWordCount",
     "ReplaceSeparators",
     "ToLowercase",
     "ToSingular",
     """Base class for string transforms."""
     @abstractmethod
+    def __call__(self, text: str) -> str:
         raise NotImplementedError
     def __repr__(self) -> str:
 class DropFileExtensions(BaseTextTransform):
     """Remove file extensions from the input text."""
+    def __call__(self, text: str) -> str:
         """
         Args:
             text (str): Text to remove file extensions from.
 class DropNonAlpha(BaseTextTransform):
     """Remove non-alpha words from the input text."""
+    def __call__(self, text: str) -> str:
         """
         Args:
             text (str): Text to remove non-alpha words from.
         super().__init__()
         self.min_length = min_length
+    def __call__(self, text: str) -> str:
         """
         Args:
             text (str): Text to remove short words from.
     hyphen, period, apostrophe, or ampersand.
     """
+    def __call__(self, text: str) -> str:
         """
         Args:
             text (str): Text to remove special characters from.
     Tokens are defined as strings enclosed in angle brackets, e.g. <token>.
     """
+    def __call__(self, text: str) -> str:
         """
         Args:
             text (str): Text to remove tokens from.
 class DropURLs(BaseTextTransform):
     """Remove URLs from the input text."""
+    def __call__(self, text: str) -> str:
         """
         Args:
             text (str): Text to remove URLs from.
         self.words = words
         self.pattern = r"\b(?:{})\b".format("|".join(words))
+    def __call__(self, text: str) -> str:
         """
         Args:
             text (str): Text to remove words from.
     Args:
         tags (list): List of POS tags to remove.
         engine (str): POS tagger to use. Must be one of "nltk" or "flair". Defaults to "nltk".
     """
+    def __init__(self, tags: list, engine: str = "nltk") -> None:
         super().__init__()
         self.tags = tags
         self.engine = engine
         if engine == "nltk":
             nltk.download("averaged_perceptron_tagger", quiet=True)
         elif engine == "flair":
             self.tagger = SequenceTagger.load("flair/pos-english-fast").predict
+    def __call__(self, text: str) -> str:
         """
         Args:
             text (str): Text to remove words with specific POS tags from.
             self.tagger(sentence)
             text = " ".join([token.text for token in sentence.tokens if token.tag in self.tags])
         return text
     def __repr__(self) -> str:
         super().__init__()
         self.min_count = min_count
+    def __call__(self, text: str) -> str:
         """
         Args:
             text (str): Text to remove infrequent words from.
         return f"{self.__class__.__name__}(min_count={self.min_count})"
 class ReplaceSeparators(BaseTextTransform):
     """Replace underscores and dashes with spaces."""
+    def __call__(self, text: str) -> str:
         """
         Args:
             text (str): Text to replace separators in.
 class RemoveDuplicates(BaseTextTransform):
     """Remove duplicate words from the input text."""
+    def __call__(self, text: str) -> str:
         """
         Args:
             text (str): Text to remove duplicate words from.
     def __init__(self, transforms: list[BaseTextTransform]) -> None:
         self.transforms = transforms
+    def __call__(self, text: Union[str, list[str]]) -> list[str]:
+        """
+        Args:
+            text (Union[str, list[str]]): Text to transform.
+        """
         if isinstance(text, list):
             text = " ".join(text)
 class ToLowercase(BaseTextTransform):
     """Convert text to lowercase."""
+    def __call__(self, text: str) -> str:
         """
         Args:
             text (str): Text to convert to lowercase.
         super().__init__()
         self.transform = inflect.engine().singular_noun
+    def __call__(self, text: str) -> str:
         """
         Args:
             text (str): Text to convert to singular form.
     transforms.append(ToSingular())
     transforms.append(DropWords(words=words_to_drop))
     transforms.append(FrequencyMinWordCount(min_count=2))
+    transforms.append(FilterPOS(tags=pos_tags, engine="flair"))
     transforms.append(RemoveDuplicates())
     transforms = TextCompose(transforms)