Spaces:

geekyrakshit
/

medrag

Runtime error

App Files Files Community

geekyrakshit committed on Oct 21, 2024

Commit

3b25ef5

1 Parent(s): 05b69a5

update: docs for colpali and nv-embed-v2

Browse files

Files changed (2) hide show

medrag_multi_modal/retrieval/colpali_retrieval.py +151 -68
medrag_multi_modal/retrieval/nv_embed_2.py +24 -0

medrag_multi_modal/retrieval/colpali_retrieval.py CHANGED Viewed

@@ -21,55 +21,6 @@ class CalPaliRetriever(weave.Model):
     It can be initialized with a pre-trained model or from a specified W&B artifact. The class
     also provides methods to index new data and to predict/retrieve documents based on a query.
-    !!! example "Indexing Data"
-        First you need to install `Byaldi` library by Answer.ai.
-        ```bash
-        uv pip install Byaldi>=0.0.5
-        ```
-        Next, you can index the data by running the following code:
-        ```python
-        import wandb
-        from medrag_multi_modal.retrieval import CalPaliRetriever
-        wandb.init(project="medrag-multi-modal", entity="ml-colabs", job_type="index")
-        retriever = CalPaliRetriever()
-        retriever.index(
-            data_artifact_name="ml-colabs/medrag-multi-modal/grays-anatomy-images:v1",
-            weave_dataset_name="grays-anatomy-images:v0",
-            index_name="grays-anatomy",
-        )
-        ```
-    !!! example "Retrieving Documents"
-        First you need to install `Byaldi` library by Answer.ai.
-        ```bash
-        uv pip install Byaldi>=0.0.5
-        ```
-        Next, you can retrieve the documents by running the following code:
-        ```python
-        import weave
-        import wandb
-        from medrag_multi_modal.retrieval import CalPaliRetriever
-        weave.init(project_name="ml-colabs/medrag-multi-modal")
-        retriever = CalPaliRetriever.from_wandb_artifact(
-            index_artifact_name="ml-colabs/medrag-multi-modal/grays-anatomy:v0",
-            metadata_dataset_name="grays-anatomy-images:v0",
-            data_artifact_name="ml-colabs/medrag-multi-modal/grays-anatomy-images:v1",
-        )
-        retriever.predict(
-            query="which neurotransmitters convey information between Merkel cells and sensory afferents?",
-            top_k=3,
-        )
-        ```
     Attributes:
         model_name (str): The name of the model to be used for retrieval.
     """
@@ -98,28 +49,55 @@ class CalPaliRetriever(weave.Model):
             if metadata_dataset_name
             else None
         )
-    @classmethod
-    def from_wandb_artifact(
-        cls,
-        index_artifact_name: str,
-        metadata_dataset_name: str,
-        data_artifact_name: str,
-    ):
-        from byaldi import RAGMultiModalModel
-        index_artifact_dir = get_wandb_artifact(index_artifact_name, "colpali-index")
-        data_artifact_dir = get_wandb_artifact(data_artifact_name, "dataset")
-        docs_retrieval_model = RAGMultiModalModel.from_index(
-            index_path=os.path.join(index_artifact_dir, "index")
-        )
-        return cls(
-            docs_retrieval_model=docs_retrieval_model,
-            metadata_dataset_name=metadata_dataset_name,
-            data_artifact_dir=data_artifact_dir,
-        )
-    def index(self, data_artifact_name: str, weave_dataset_name: str, index_name: str):
         data_artifact_dir = get_wandb_artifact(data_artifact_name, "dataset")
         self._docs_retrieval_model.index(
             input_path=data_artifact_dir,
@@ -138,6 +116,76 @@ class CalPaliRetriever(weave.Model):
             )
             artifact.save()
     @weave.op()
     def predict(self, query: str, top_k: int = 3) -> list[dict[str, Any]]:
         """
@@ -147,6 +195,41 @@ class CalPaliRetriever(weave.Model):
         This function uses the document retrieval model to search for the most relevant
         documents based on the provided query. It returns a list of dictionaries, each
         containing the document image, document ID, and the relevance score.
         Args:
             query (str): The search query string.

     It can be initialized with a pre-trained model or from a specified W&B artifact. The class
     also provides methods to index new data and to predict/retrieve documents based on a query.
     Attributes:
         model_name (str): The name of the model to be used for retrieval.
     """
             if metadata_dataset_name
             else None
         )
+    def index(self, data_artifact_name: str, weave_dataset_name: str, index_name: str):
+        """
+        Indexes a dataset of documents and saves the index as a W&B artifact.
+        This method retrieves a dataset of documents from a W&B artifact using the provided
+        data artifact name. It then indexes the documents using the document retrieval model
+        and assigns the specified index name. The index is stored locally without storing the
+        collection with the index and overwrites any existing index with the same name.
+        If a W&B run is active, the method creates a new W&B artifact with the specified
+        index name and type "colpali-index". It adds the local index directory to the artifact
+        and saves it to W&B, including metadata with the provided Weave dataset name.
+        !!! example "Indexing Data"
+            First you need to install the `Byaldi` library by Answer.ai.
+            ```bash
+            uv pip install "Byaldi>=0.0.5"
+            ```
+            Next, you can index the data by running the following code:
+            ```python
+            import wandb
+            from medrag_multi_modal.retrieval import CalPaliRetriever
+            wandb.init(project="medrag-multi-modal", entity="ml-colabs", job_type="index")
+            retriever = CalPaliRetriever()
+            retriever.index(
+                data_artifact_name="ml-colabs/medrag-multi-modal/grays-anatomy-images:v1",
+                weave_dataset_name="grays-anatomy-images:v0",
+                index_name="grays-anatomy",
+            )
+            ```
+        ??? note "Optional Speedup using Flash Attention"
+            If you have a GPU with Flash Attention support, you can enable it for ColPali by simply
+            installing the `flash-attn` package.
+            ```bash
+            uv pip install flash-attn --no-build-isolation
+            ```
+        Args:
+            data_artifact_name (str): The name of the W&B artifact containing the dataset.
+            weave_dataset_name (str): The name of the Weave dataset to include in the artifact metadata.
+            index_name (str): The name to assign to the created index.
+        """
         data_artifact_dir = get_wandb_artifact(data_artifact_name, "dataset")
         self._docs_retrieval_model.index(
             input_path=data_artifact_dir,
             )
             artifact.save()
+    @classmethod
+    def from_wandb_artifact(
+        cls,
+        index_artifact_name: str,
+        metadata_dataset_name: str,
+        data_artifact_name: str,
+    ):
+        """
+        Creates an instance of the class from Weights & Biases (wandb) artifacts.
+        This method retrieves the necessary artifacts from wandb to initialize the
+        `CalPaliRetriever`. It fetches the index artifact directory and the data artifact
+        directory using the provided artifact names. It then loads the document retrieval
+        model from the index path within the index artifact directory. Finally, it returns
+        an instance of the class initialized with the retrieved document retrieval model,
+        metadata dataset name, and data artifact directory.
+        !!! example "Retrieving Documents"
+            First you need to install the `Byaldi` library by Answer.ai.
+            ```bash
+            uv pip install "Byaldi>=0.0.5"
+            ```
+            Next, you can retrieve the documents by running the following code:
+            ```python
+            import weave
+            import wandb
+            from medrag_multi_modal.retrieval import CalPaliRetriever
+            weave.init(project_name="ml-colabs/medrag-multi-modal")
+            retriever = CalPaliRetriever.from_wandb_artifact(
+                index_artifact_name="ml-colabs/medrag-multi-modal/grays-anatomy:v0",
+                metadata_dataset_name="grays-anatomy-images:v0",
+                data_artifact_name="ml-colabs/medrag-multi-modal/grays-anatomy-images:v1",
+            )
+            ```
+        ??? note "Optional Speedup using Flash Attention"
+            If you have a GPU with Flash Attention support, you can enable it for ColPali by simply
+            installing the `flash-attn` package.
+            ```bash
+            uv pip install flash-attn --no-build-isolation
+            ```
+        Args:
+            index_artifact_name (str): The name of the wandb artifact containing the index.
+            metadata_dataset_name (str): The name of the dataset containing metadata.
+            data_artifact_name (str): The name of the wandb artifact containing the data.
+        Returns:
+            An instance of the class initialized with the retrieved document retrieval model,
+            metadata dataset name, and data artifact directory.
+        """
+        from byaldi import RAGMultiModalModel
+        index_artifact_dir = get_wandb_artifact(index_artifact_name, "colpali-index")
+        data_artifact_dir = get_wandb_artifact(data_artifact_name, "dataset")
+        docs_retrieval_model = RAGMultiModalModel.from_index(
+            index_path=os.path.join(index_artifact_dir, "index")
+        )
+        return cls(
+            docs_retrieval_model=docs_retrieval_model,
+            metadata_dataset_name=metadata_dataset_name,
+            data_artifact_dir=data_artifact_dir,
+        )
     @weave.op()
     def predict(self, query: str, top_k: int = 3) -> list[dict[str, Any]]:
         """
         This function uses the document retrieval model to search for the most relevant
         documents based on the provided query. It returns a list of dictionaries, each
         containing the document image, document ID, and the relevance score.
+        !!! example "Retrieving Documents"
+            First you need to install the `Byaldi` library by Answer.ai.
+            ```bash
+            uv pip install "Byaldi>=0.0.5"
+            ```
+            Next, you can retrieve the documents by running the following code:
+            ```python
+            import weave
+            import wandb
+            from medrag_multi_modal.retrieval import CalPaliRetriever
+            weave.init(project_name="ml-colabs/medrag-multi-modal")
+            retriever = CalPaliRetriever.from_wandb_artifact(
+                index_artifact_name="ml-colabs/medrag-multi-modal/grays-anatomy:v0",
+                metadata_dataset_name="grays-anatomy-images:v0",
+                data_artifact_name="ml-colabs/medrag-multi-modal/grays-anatomy-images:v1",
+            )
+            retriever.predict(
+                query="which neurotransmitters convey information between Merkel cells and sensory afferents?",
+                top_k=3,
+            )
+            ```
+        ??? note "Optional Speedup using Flash Attention"
+            If you have a GPU with Flash Attention support, you can enable it for ColPali by simply
+            installing the `flash-attn` package.
+            ```bash
+            uv pip install flash-attn --no-build-isolation
+            ```
         Args:
             query (str): The search query string.

medrag_multi_modal/retrieval/nv_embed_2.py CHANGED Viewed

@@ -83,6 +83,14 @@ class NVEmbed2Retriever(weave.Model):
                 index_name="grays-anatomy-nvembed2",
             )
             ```
         Args:
             chunk_dataset_name (str): The name of the Weave dataset containing the text chunks
@@ -136,6 +144,14 @@ class NVEmbed2Retriever(weave.Model):
                 index_artifact_address="ml-colabs/medrag-multi-modal/grays-anatomy-nvembed2:v0",
             )
             ```
         Args:
             chunk_dataset_name (str): The name of the Weave dataset containing the text chunks.
@@ -242,6 +258,14 @@ class NVEmbed2Retriever(weave.Model):
             )
             retriever.predict(query="What are Ribosomes?")
             ```
         Args:
             query (str): The input query string to search for relevant chunks.

                 index_name="grays-anatomy-nvembed2",
             )
             ```
+        ??? note "Optional Speedup using Flash Attention"
+            If you have a GPU with Flash Attention support, you can enable it for NV-Embed-v2 by simply
+            installing the `flash-attn` package.
+            ```bash
+            uv pip install flash-attn --no-build-isolation
+            ```
         Args:
             chunk_dataset_name (str): The name of the Weave dataset containing the text chunks
                 index_artifact_address="ml-colabs/medrag-multi-modal/grays-anatomy-nvembed2:v0",
             )
             ```
+        ??? note "Optional Speedup using Flash Attention"
+            If you have a GPU with Flash Attention support, you can enable it for NV-Embed-v2 by simply
+            installing the `flash-attn` package.
+            ```bash
+            uv pip install flash-attn --no-build-isolation
+            ```
         Args:
             chunk_dataset_name (str): The name of the Weave dataset containing the text chunks.
             )
             retriever.predict(query="What are Ribosomes?")
             ```
+        ??? note "Optional Speedup using Flash Attention"
+            If you have a GPU with Flash Attention support, you can enable it for NV-Embed-v2 by simply
+            installing the `flash-attn` package.
+            ```bash
+            uv pip install flash-attn --no-build-isolation
+            ```
         Args:
             query (str): The input query string to search for relevant chunks.