amitsinghchandel committed
Commit b69b215 · 1 Parent(s): fe70f68

Update space with new code
README.md CHANGED
@@ -1,12 +1,8 @@
- ---
- title: Multimodal RAG
- emoji: 💬
- colorFrom: yellow
- colorTo: purple
- sdk: gradio
- sdk_version: 5.0.1
- app_file: app.py
- pinned: false
- ---
-
- An example chatbot using [Gradio](https://gradio.app), [`huggingface_hub`](https://huggingface.co/docs/huggingface_hub/v0.22.2/en/index), and the [Hugging Face Inference API](https://huggingface.co/docs/api-inference/index).
+ MultiModal RAG with ColPali and Milvus
+ ===
+
+ Code for the blog post [https://saumitra.me/2024/2024-11-15-colpali-milvus-rag/](https://saumitra.me/2024/2024-11-15-colpali-milvus-rag/) on how to do multimodal RAG with [ColPali](https://arxiv.org/abs/2407.01449), [Milvus](https://milvus.io/), and a visual LLM (Gemini/GPT-4o).
+
+ Demo running at [https://huggingface.co/spaces/saumitras/colpali-milvus](https://huggingface.co/spaces/saumitras/colpali-milvus)
+
+ The application lets users upload a PDF and then run search or Q&A queries against both the text and the visual elements of the document. No text is extracted from the PDF; instead, each page is treated as an image and embedded with ColPali. The embeddings are indexed in Milvus, and a visual LLM (Gemini/GPT-4o) answers the Q&A queries.
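As a rough end-to-end sketch of that flow, using the `Middleware` and `Rag` classes added in this commit (the PDF name and query are placeholders; assumes `GEMINI_API_KEY` is set and the dependencies below are installed):

```python
from FAPS.middleware import Middleware
from FAPS.rag import Rag

user_id = "demo-user"

# Index the first 10 pages of a PDF as page images (no text extraction).
middleware = Middleware(user_id, create_collection=True)
middleware.index(pdf_path="report.pdf", id=user_id, max_pages=10)

# Embed the query with ColPali, retrieve the best-matching page from Milvus,
# then ask the visual LLM about that page image.
query = "What does the revenue chart show?"
score, page_idx = middleware.search([query])[0][0]
img_path = f"pages/{user_id}/page_{page_idx + 1}.png"
print(Rag().get_answer_from_gemini(query, [img_path]))
```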
__init__.py ADDED
File without changes
__pycache__/__init__.cpython-310.pyc ADDED
Binary file (118 Bytes).
__pycache__/colpali_manager.cpython-310.pyc ADDED
Binary file (3.78 kB).
__pycache__/middleware.cpython-310.pyc ADDED
Binary file (2.11 kB).
__pycache__/milvus_manager.cpython-310.pyc ADDED
Binary file (5.27 kB).
__pycache__/pdf_manager.cpython-310.pyc ADDED
Binary file (1.57 kB).
__pycache__/rag.cpython-310.pyc ADDED
Binary file (1.28 kB).
__pycache__/utils.cpython-310.pyc ADDED
Binary file (371 Bytes).
app.py CHANGED
@@ -1,64 +1,135 @@
  import gradio as gr
- from huggingface_hub import InferenceClient
-
- """
- For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
- """
- client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
-
-
- def respond(
-     message,
-     history: list[tuple[str, str]],
-     system_message,
-     max_tokens,
-     temperature,
-     top_p,
- ):
-     messages = [{"role": "system", "content": system_message}]
-
-     for val in history:
-         if val[0]:
-             messages.append({"role": "user", "content": val[0]})
-         if val[1]:
-             messages.append({"role": "assistant", "content": val[1]})
-
-     messages.append({"role": "user", "content": message})
-
-     response = ""
-
-     for message in client.chat_completion(
-         messages,
-         max_tokens=max_tokens,
-         stream=True,
-         temperature=temperature,
-         top_p=top_p,
-     ):
-         token = message.choices[0].delta.content
-
-         response += token
-         yield response
-
-
- """
- For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
- """
- demo = gr.ChatInterface(
-     respond,
-     additional_inputs=[
-         gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
-         gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-         gr.Slider(
-             minimum=0.1,
-             maximum=1.0,
-             value=0.95,
-             step=0.05,
-             label="Top-p (nucleus sampling)",
-         ),
-     ],
- )
+ import tempfile
+ import os
+ import fitz  # PyMuPDF
+ import uuid
+
+ from FAPS.middleware import Middleware
+ from FAPS.rag import Rag
+
+ rag = Rag()
+
+ def generate_uuid(state):
+     # Reuse the session UUID if one is already set; otherwise generate one.
+     if state["user_uuid"] is None:
+         state["user_uuid"] = str(uuid.uuid4())
+
+     return state["user_uuid"]
+
+
+ class PDFSearchApp:
+     def __init__(self):
+         self.indexed_docs = {}
+         self.current_pdf = None
+
+     def upload_and_convert(self, state, file, max_pages):
+         id = generate_uuid(state)
+
+         if file is None:
+             return "No file uploaded"
+
+         print(f"Uploading file: {file.name}, id: {id}")
+
+         try:
+             self.current_pdf = file.name
+
+             middleware = Middleware(id, create_collection=True)
+
+             pages = middleware.index(pdf_path=file.name, id=id, max_pages=max_pages)
+
+             self.indexed_docs[id] = True
+
+             return f"Uploaded and extracted {len(pages)} pages"
+         except Exception as e:
+             return f"Error processing PDF: {str(e)}"
+
+     def search_documents(self, state, query, num_results=1):
+         print(f"Searching for query: {query}")
+         id = generate_uuid(state)
+
+         # .get() avoids a KeyError for sessions that never indexed a document.
+         if not self.indexed_docs.get(id):
+             print("Please index documents first")
+             return "Please index documents first", "--"
+         if not query:
+             print("Please enter a search query")
+             return "Please enter a search query", "--"
+
+         try:
+             middleware = Middleware(id, create_collection=False)
+
+             search_results = middleware.search([query])[0]
+
+             # search() returns (score, doc_id) pairs; doc_id is the 0-based page index.
+             page_num = search_results[0][1] + 1
+
+             print(f"Retrieved page number: {page_num}")
+
+             img_path = f"pages/{id}/page_{page_num}.png"
+
+             print(f"Retrieved image path: {img_path}")
+
+             rag_response = rag.get_answer_from_gemini(query, [img_path])
+
+             return img_path, rag_response
+
+         except Exception as e:
+             return f"Error during search: {str(e)}", "--"
+
+ def create_ui():
+     app = PDFSearchApp()
+
+     with gr.Blocks() as demo:
+         state = gr.State(value={"user_uuid": None})
+
+         gr.Markdown("# ColPali Milvus Multimodal RAG Demo")
+         gr.Markdown("This demo shows how to use [ColPali](https://github.com/illuin-tech/colpali) embeddings with [Milvus](https://milvus.io/) and a Gemini/OpenAI visual LLM for multimodal PDF search and Q&A.")
+
+         with gr.Tab("Upload PDF"):
+             with gr.Column():
+                 file_input = gr.File(label="Upload PDF")
+
+                 max_pages_input = gr.Slider(
+                     minimum=1,
+                     maximum=50,
+                     value=20,
+                     step=10,
+                     label="Max pages to extract and index"
+                 )
+
+                 status = gr.Textbox(label="Indexing Status", interactive=False)
+
+         with gr.Tab("Query"):
+             with gr.Column():
+                 query_input = gr.Textbox(label="Enter query")
+                 # num_results = gr.Slider(
+                 #     minimum=1,
+                 #     maximum=10,
+                 #     value=5,
+                 #     step=1,
+                 #     label="Number of results"
+                 # )
+                 search_btn = gr.Button("Query")
+                 llm_answer = gr.Textbox(label="RAG Response", interactive=False)
+                 images = gr.Image(label="Top page matching query")
+
+         # Event handlers
+         file_input.change(
+             fn=app.upload_and_convert,
+             inputs=[state, file_input, max_pages_input],
+             outputs=[status]
+         )
+
+         search_btn.click(
+             fn=app.search_documents,
+             inputs=[state, query_input],
+             outputs=[images, llm_answer]
+         )
+
+     return demo

  if __name__ == "__main__":
+     demo = create_ui()
      demo.launch()
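A hedged way to exercise the handlers without the UI, to see how the per-session `gr.State` dict drives `generate_uuid()` (the `FakeFile` stand-in and `sample.pdf` are illustrative; assumes the `FAPS` package imports resolve):

```python
from app import PDFSearchApp

class FakeFile:
    # Stand-in for the object gr.File hands to the callback; only .name is used.
    name = "sample.pdf"

state = {"user_uuid": None}  # what gr.State(value={...}) holds for one visitor
app = PDFSearchApp()

print(app.upload_and_convert(state, FakeFile(), max_pages=10))
print(app.search_documents(state, "What is the main conclusion?"))
# Both calls mutate the same state dict, so generate_uuid() returns one stable
# UUID per session and both handlers address the same per-user Milvus database.
```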
colpali_manager.py ADDED
@@ -0,0 +1,79 @@
+ from colpali_engine.models import ColPali
+ from colpali_engine.models.paligemma.colpali.processing_colpali import ColPaliProcessor
+ from colpali_engine.utils.torch_utils import ListDataset
+ from torch.utils.data import DataLoader
+ import torch
+ import numpy as np
+ from typing import List, cast
+
+ from tqdm import tqdm
+ from PIL import Image
+
+
+ class ColpaliManager:
+     def __init__(self, device="cpu", model_name="vidore/colpali-v1.2"):
+         print(f"Initializing ColpaliManager with device {device} and model {model_name}")
+
+         # Load the model once, in float32 on CPU (no device map needed).
+         self.device = torch.device(device)
+         self.model = ColPali.from_pretrained(
+             model_name,
+             torch_dtype=torch.float32,
+             device_map=None,
+         ).to(self.device).eval()
+
+         self.processor = cast(ColPaliProcessor, ColPaliProcessor.from_pretrained(model_name))
+
+     def get_images(self, paths: List[str]) -> List[Image.Image]:
+         return [Image.open(path) for path in paths]
+
+     def process_images(self, image_paths: List[str], batch_size=5) -> List[np.ndarray]:
+         print(f"Processing {len(image_paths)} image_paths")
+         images = self.get_images(image_paths)
+
+         dataloader = DataLoader(
+             dataset=ListDataset[Image.Image](images),
+             batch_size=batch_size,
+             shuffle=False,
+             collate_fn=lambda x: self.processor.process_images(x),
+         )
+
+         # One embedding matrix per page: a row per image patch/token.
+         embeddings = []
+         for batch_doc in tqdm(dataloader):
+             with torch.no_grad():
+                 batch_doc = {k: v.to(self.device) for k, v in batch_doc.items()}
+                 embeddings_batch = self.model(**batch_doc)
+             embeddings.extend(list(torch.unbind(embeddings_batch.to(self.device))))
+
+         return [embedding.float().cpu().numpy() for embedding in embeddings]
+
+     def process_text(self, texts: List[str], batch_size=1) -> List[np.ndarray]:
+         print(f"Processing {len(texts)} texts")
+
+         dataloader = DataLoader(
+             dataset=ListDataset[str](texts),
+             batch_size=batch_size,
+             shuffle=False,
+             collate_fn=lambda x: self.processor.process_queries(x),
+         )
+
+         # One embedding matrix per query: a row per query token.
+         embeddings = []
+         for batch_query in dataloader:
+             with torch.no_grad():
+                 batch_query = {k: v.to(self.device) for k, v in batch_query.items()}
+                 embeddings_batch = self.model(**batch_query)
+             embeddings.extend(list(torch.unbind(embeddings_batch.to(self.device))))
+
+         return [embedding.float().cpu().numpy() for embedding in embeddings]
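`ColpaliManager` returns one matrix per input rather than a single pooled vector: ColPali is a late-interaction (ColBERT-style) model, so every image patch and every query token keeps its own 128-dim embedding. A quick shape check (illustrative path and query; exact row counts depend on the processor):

```python
manager = ColpaliManager(device="cpu", model_name="vidore/colpali-v1.2")

page_embs = manager.process_images(["pages/demo/page_1.png"])
query_embs = manager.process_text(["where is the revenue table?"])

print(page_embs[0].shape)   # roughly (1030, 128): one vector per image patch/token
print(query_embs[0].shape)  # e.g. (22, 128): one vector per query token
```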
middleware.py ADDED
@@ -0,0 +1,56 @@
+ from FAPS.colpali_manager import ColpaliManager
+ from FAPS.milvus_manager import MilvusManager
+ from FAPS.pdf_manager import PdfManager
+ import hashlib
+
+
+ pdf_manager = PdfManager()
+ colpali_manager = ColpaliManager(device="cpu", model_name="vidore/colpali-v1.2")
+
+
+ class Middleware:
+     def __init__(self, id: str, create_collection=True):
+         # One Milvus Lite database file per session, keyed by a short hash of the id.
+         hashed_id = hashlib.md5(id.encode()).hexdigest()[:8]
+         milvus_db_name = f"milvus_{hashed_id}.db"
+         self.milvus_manager = MilvusManager(milvus_db_name, "colpali", create_collection)
+
+     def index(self, pdf_path: str, id: str, max_pages: int, pages: list[int] = None):
+         print(f"Indexing {pdf_path}, id: {id}, max_pages: {max_pages}")
+
+         image_paths = pdf_manager.save_images(id, pdf_path, max_pages)
+
+         print(f"Saved {len(image_paths)} images")
+
+         colbert_vecs = colpali_manager.process_images(image_paths)
+
+         images_data = [{
+             "colbert_vecs": colbert_vecs[i],
+             "filepath": image_paths[i]
+         } for i in range(len(image_paths))]
+
+         print(f"Inserting {len(images_data)} images data to Milvus")
+
+         self.milvus_manager.insert_images_data(images_data)
+
+         print("Indexing completed")
+
+         return image_paths
+
+     def search(self, search_queries: list[str]):
+         print(f"Searching for {len(search_queries)} queries")
+
+         final_res = []
+
+         for query in search_queries:
+             print(f"Searching for query: {query}")
+             query_vec = colpali_manager.process_text([query])[0]
+             search_res = self.milvus_manager.search(query_vec, topk=1)
+             print(f"Search result: {search_res} for query: {query}")
+             final_res.append(search_res)
+
+         return final_res
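Each session therefore gets its own database file, named from a short hash of the session id; passing a local `.db` path as the URI makes `MilvusClient` run embedded (Milvus Lite) rather than against a server. Illustrative values:

```python
import hashlib

user_id = "demo-user"  # in the app this is the session UUID
hashed_id = hashlib.md5(user_id.encode()).hexdigest()[:8]
print(f"milvus_{hashed_id}.db")  # one isolated Milvus Lite file per user
```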
milvus_manager.py ADDED
@@ -0,0 +1,162 @@
+ from pymilvus import MilvusClient, DataType
+ import numpy as np
+ import concurrent.futures
+
+
+ class MilvusManager:
+     def __init__(self, milvus_uri, collection_name, create_collection, dim=128):
+         self.client = MilvusClient(uri=milvus_uri)
+         self.collection_name = collection_name
+         if self.client.has_collection(collection_name=self.collection_name):
+             self.client.load_collection(collection_name)
+         self.dim = dim
+
+         if create_collection:
+             self.create_collection()
+             self.create_index()
+
+     def create_collection(self):
+         if self.client.has_collection(collection_name=self.collection_name):
+             self.client.drop_collection(collection_name=self.collection_name)
+         schema = self.client.create_schema(
+             auto_id=True,
+             enable_dynamic_fields=True,
+         )
+         schema.add_field(field_name="pk", datatype=DataType.INT64, is_primary=True)
+         schema.add_field(
+             field_name="vector", datatype=DataType.FLOAT_VECTOR, dim=self.dim
+         )
+         schema.add_field(field_name="seq_id", datatype=DataType.INT16)
+         schema.add_field(field_name="doc_id", datatype=DataType.INT64)
+         schema.add_field(field_name="doc", datatype=DataType.VARCHAR, max_length=65535)
+
+         self.client.create_collection(
+             collection_name=self.collection_name, schema=schema
+         )
+
+     def create_index(self):
+         self.client.release_collection(collection_name=self.collection_name)
+         self.client.drop_index(
+             collection_name=self.collection_name, index_name="vector"
+         )
+         index_params = self.client.prepare_index_params()
+         index_params.add_index(
+             field_name="vector",
+             index_name="vector_index",
+             index_type="HNSW",
+             metric_type="IP",
+             params={
+                 "M": 16,
+                 "efConstruction": 500,
+             },
+         )
+
+         self.client.create_index(
+             collection_name=self.collection_name, index_params=index_params, sync=True
+         )
+
+     def create_scalar_index(self):
+         self.client.release_collection(collection_name=self.collection_name)
+
+         index_params = self.client.prepare_index_params()
+         index_params.add_index(
+             field_name="doc_id",
+             index_name="int32_index",
+             index_type="INVERTED",
+         )
+
+         self.client.create_index(
+             collection_name=self.collection_name, index_params=index_params, sync=True
+         )
+
+     def search(self, data, topk):
+         # Stage 1: ANN search over individual token vectors to collect candidate pages.
+         search_params = {"metric_type": "IP", "params": {}}
+         results = self.client.search(
+             self.collection_name,
+             data,
+             limit=50,
+             output_fields=["vector", "seq_id", "doc_id"],
+             search_params=search_params,
+         )
+         doc_ids = set()
+         for r_id in range(len(results)):
+             for r in range(len(results[r_id])):
+                 doc_ids.add(results[r_id][r]["entity"]["doc_id"])
+
+         scores = []
+
+         # Stage 2: rerank each candidate page with the full late-interaction (MaxSim) score.
+         def rerank_single_doc(doc_id, data, client, collection_name):
+             doc_colbert_vecs = client.query(
+                 collection_name=collection_name,
+                 filter=f"doc_id in [{doc_id}, {doc_id + 1}]",
+                 output_fields=["seq_id", "vector", "doc"],
+                 limit=1000,
+             )
+             doc_vecs = np.vstack(
+                 [doc_colbert_vecs[i]["vector"] for i in range(len(doc_colbert_vecs))]
+             )
+             score = np.dot(data, doc_vecs.T).max(1).sum()
+             return (score, doc_id)
+
+         with concurrent.futures.ThreadPoolExecutor(max_workers=300) as executor:
+             futures = {
+                 executor.submit(
+                     rerank_single_doc, doc_id, data, self.client, self.collection_name
+                 ): doc_id
+                 for doc_id in doc_ids
+             }
+             for future in concurrent.futures.as_completed(futures):
+                 score, doc_id = future.result()
+                 scores.append((score, doc_id))
+
+         scores.sort(key=lambda x: x[0], reverse=True)
+         return scores[:topk]
+
+     def insert(self, data):
+         colbert_vecs = [vec for vec in data["colbert_vecs"]]
+         seq_length = len(colbert_vecs)
+         doc_ids = [data["doc_id"] for i in range(seq_length)]
+         seq_ids = list(range(seq_length))
+         docs = [""] * seq_length
+         docs[0] = data["filepath"]
+
+         # One row per token vector; the page's filepath is stored on the first row only.
+         self.client.insert(
+             self.collection_name,
+             [
+                 {
+                     "vector": colbert_vecs[i],
+                     "seq_id": seq_ids[i],
+                     "doc_id": doc_ids[i],
+                     "doc": docs[i],
+                 }
+                 for i in range(seq_length)
+             ],
+         )
+
+     def get_images_as_doc(self, images_with_vectors: list):
+         images_data = []
+
+         for i in range(len(images_with_vectors)):
+             data = {
+                 "colbert_vecs": images_with_vectors[i]["colbert_vecs"],
+                 "doc_id": i,
+                 "filepath": images_with_vectors[i]["filepath"],
+             }
+             images_data.append(data)
+
+         return images_data
+
+     def insert_images_data(self, image_data):
+         data = self.get_images_as_doc(image_data)
+
+         for i in range(len(data)):
+             self.insert(data[i])
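Retrieval here is two-stage: a standard ANN search over individual token vectors proposes candidate pages, then `rerank_single_doc()` scores each candidate with the MaxSim formula. A self-contained sketch of that scoring step on random data:

```python
import numpy as np

rng = np.random.default_rng(0)
query_vecs = rng.random((5, 128))    # 5 query-token vectors (illustrative)
page_vecs = rng.random((1030, 128))  # one page's patch vectors

# For each query token, take its best inner product over all page vectors,
# then sum the maxima: a page scores well if it covers every query token.
score = np.dot(query_vecs, page_vecs.T).max(1).sum()
print(score)
```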
packages.txt ADDED
@@ -0,0 +1 @@
+ poppler-utils
pdf_manager.py ADDED
@@ -0,0 +1,42 @@
+ from pdf2image import convert_from_path
+ import os
+ import shutil
+
+ class PdfManager:
+     def __init__(self):
+         pass
+
+     def clear_and_recreate_dir(self, output_folder):
+         print(f"Clearing output folder {output_folder}")
+
+         if os.path.exists(output_folder):
+             shutil.rmtree(output_folder)
+
+         os.makedirs(output_folder)
+
+     def save_images(self, id, pdf_path, max_pages, pages: list[int] = None) -> list[str]:
+         output_folder = f"pages/{id}"
+         images = convert_from_path(pdf_path)
+
+         print(f"Saving images from {pdf_path} to {output_folder}. Max pages: {max_pages}")
+
+         self.clear_and_recreate_dir(output_folder)
+
+         saved_paths = []
+
+         for i, image in enumerate(images):
+             if max_pages and len(saved_paths) >= max_pages:
+                 break
+
+             if pages and i not in pages:
+                 continue
+
+             full_save_path = f"{output_folder}/page_{i + 1}.png"
+
+             image.save(full_save_path, "PNG")
+
+             saved_paths.append(full_save_path)
+
+         # Return the paths that were actually written, which stays correct
+         # even when a `pages` filter skips page numbers.
+         return saved_paths
rag.py ADDED
@@ -0,0 +1,104 @@
+ import requests
+ import os
+ import google.generativeai as genai
+
+ from typing import List
+ from FAPS.utils import encode_image
+ from PIL import Image
+
+ class Rag:
+
+     def get_answer_from_gemini(self, query, imagePaths):
+
+         print(f"Querying Gemini for query={query}, imagePaths={imagePaths}")
+
+         try:
+             genai.configure(api_key=os.environ['GEMINI_API_KEY'])
+             model = genai.GenerativeModel('gemini-1.5-flash')
+
+             images = [Image.open(path) for path in imagePaths]
+
+             chat = model.start_chat()
+
+             response = chat.send_message([*images, query])
+
+             answer = response.text
+
+             print(answer)
+
+             return answer
+
+         except Exception as e:
+             print(f"An error occurred while querying Gemini: {e}")
+             return f"Error: {str(e)}"
+
+
+     # def get_answer_from_openai(self, query, imagesPaths):
+     #     print(f"Querying OpenAI for query={query}, imagesPaths={imagesPaths}")
+
+     #     try:
+     #         payload = self.__get_openai_api_payload(query, imagesPaths)
+
+     #         headers = {
+     #             "Content-Type": "application/json",
+     #             "Authorization": f"Bearer {os.environ['OPENAI_API_KEY']}"
+     #         }
+
+     #         response = requests.post(
+     #             url="https://api.openai.com/v1/chat/completions",
+     #             headers=headers,
+     #             json=payload
+     #         )
+     #         response.raise_for_status()  # Raise an HTTPError for bad responses
+
+     #         answer = response.json()["choices"][0]["message"]["content"]
+
+     #         print(answer)
+
+     #         return answer
+
+     #     except Exception as e:
+     #         print(f"An error occurred while querying OpenAI: {e}")
+     #         return None
+
+
+     # def __get_openai_api_payload(self, query: str, imagesPaths: List[str]):
+     #     image_payload = []
+
+     #     for imagePath in imagesPaths:
+     #         base64_image = encode_image(imagePath)
+     #         image_payload.append({
+     #             "type": "image_url",
+     #             "image_url": {
+     #                 "url": f"data:image/jpeg;base64,{base64_image}"
+     #             }
+     #         })
+
+     #     payload = {
+     #         "model": "gpt-4o",
+     #         "messages": [
+     #             {
+     #                 "role": "user",
+     #                 "content": [
+     #                     {
+     #                         "type": "text",
+     #                         "text": query
+     #                     },
+     #                     *image_payload
+     #                 ]
+     #             }
+     #         ],
+     #         "max_tokens": 1024
+     #     }
+
+     #     return payload
+
+
+ # if __name__ == "__main__":
+ #     rag = Rag()
+
+ #     query = "Based on attached images, how many new cases were reported during second wave peak"
+ #     imagesPaths = ["covid_slides_page_8.png", "covid_slides_page_8.png"]
+
+ #     rag.get_answer_from_gemini(query, imagesPaths)
requirements.txt CHANGED
@@ -1 +1,9 @@
- huggingface_hub==0.25.2
+ gradio==4.25.0
+ PyMuPDF==1.24.9
+ pdf2image==1.17.0
+ pymilvus==2.4.9
+ colpali_engine==0.3.4
+ tqdm==4.66.5
+ pillow==10.4.0
+ spaces==0.30.4
+ google-generativeai==0.8.3
utils.py ADDED
@@ -0,0 +1,5 @@
+ import base64
+
+ def encode_image(image_path):
+     with open(image_path, "rb") as image_file:
+         return base64.b64encode(image_file.read()).decode('utf-8')
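`encode_image` produces the base64 payload that the (currently commented-out) OpenAI path in rag.py embeds in a data URL. For example (illustrative path):

```python
from utils import encode_image  # FAPS.utils on the Space

b64 = encode_image("pages/demo/page_1.png")
data_url = f"data:image/png;base64,{b64}"
print(data_url[:60] + "...")  # truncated for display
```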