""" app.py

Question / answer over a collection of PDF documents using late interaction
ColBERT model for retrieval and DSPy+Mistral for answer generation.

:author: Didier Guillevic
:date: 2024-12-22
"""

import gradio as gr

import logging
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

import os

import pdf_utils  # utilities for pdf processing
import colbert_utils  # utilities to build a ColBERT retrieval model
import dspy_utils  # utilities for building a DSPy based retrieval generation model

from tqdm.notebook import tqdm

import warnings
warnings.filterwarnings('ignore')

# PDF file(s) indexed by default: used both as the initial value of the upload
# widget and for the index built when the page loads.
DEFAULT_PDF_FILES = ["OECD_Dividend_tax_fraud_2023-en.pdf",]

# Set by build_rag_model(); stays None until a model has been built.
dspy_rag_model = None


def build_rag_model(files: list[str]) -> str:
    """Build a retrieval augmented model using given files to index.

    Extracts text and metadata from every PDF, builds a ColBERT retrieval
    model over the documents that yielded text, and stores the resulting DSPy
    RAG model in the module-level ``dspy_rag_model``.

    Args:
        files: Paths of the PDF files to index. ``None`` or an empty list
            (e.g. the upload widget was cleared) is tolerated.

    Returns:
        A status message suitable for display in the "Build status" textbox.
    """
    global dspy_rag_model

    # Nothing to index: report it instead of failing on iteration.
    if not files:
        return "No PDF files provided. Please upload at least one PDF file."

    # Get the text from the pdf files; files without text are skipped so that
    # documents and metadatas stay aligned.
    documents = []
    metadatas = []
    for pdf_file in files:
        logger.info("Processing %s", pdf_file)
        metadata = pdf_utils.get_metadata_info(pdf_file)
        text = pdf_utils.get_text_from_pdf(pdf_file)
        if text:
            documents.append(text)
            metadatas.append(metadata)

    # Do not attempt to build an index over zero documents.
    if not documents:
        return "No text could be extracted from the given files. Model not built."

    # Build the ColBERT retrieval model
    colbert_base_model = 'antoinelouis/colbert-xm'  # multilingual model
    colbert_index_name = 'OECD_HNW'  # for web app, generate unique name with uuid.uuid4()
    retrieval_model = colbert_utils.build_colbert_model(
        documents,
        metadatas,
        pretrained_model=colbert_base_model,
        index_name=colbert_index_name
    )

    # Instantiate the DSPy based RAG model
    dspy_rag_model = dspy_utils.DSPyRagModel(retrieval_model)

    return "Done building RAG model."


def generate_response(question: str) -> tuple[str, str, str]:
    """Generate a response to a given question using the RAG model.

    Args:
        question: The user's question.

    Returns:
        A ``(response, references, snippets)`` triple as produced by the DSPy
        RAG model. When no model has been built yet, the response is an
        explanatory message and the other two values are empty strings.
    """
    if dspy_rag_model is None:
        # Empty strings (not lists) so the HTML outputs are simply blanked,
        # consistent with what the "Clear" button sends to them.
        return "RAG model not built. Please build the model first.", "", ""

    # Generate response
    responses, references, snippets = dspy_rag_model.generate_response(
        question=question, k=5, method='chain_of_thought')

    return responses, references, snippets


def on_load_build():
    """Build the RAG model over the default PDF file(s) when the page loads."""
    return build_rag_model(list(DEFAULT_PDF_FILES))


with gr.Blocks() as demo:
    gr.Markdown("""
    # Retrieval (ColBERT) + Generation (DSPy & Mistral)
    - Note:
        - building the retrieval model will be slow on **free CPU** (expect 5+ minutes).
        - first question & answer will be slow (2 minutes for model loading). Subsequent question approx. 10 seconds.
    - Usage: upload a few PDF files to index. Build the model. Ask questions.
    """)

    # Input files and build status
    with gr.Row():
        upload_files = gr.File(
            label="Upload PDF files to index",
            file_count="multiple",
            value=list(DEFAULT_PDF_FILES),
            scale=5)
        build_status = gr.Textbox(label="Build status", placeholder="", scale=2)

    # button
    build_button = gr.Button("Build retrieval generation model", variant='primary')

    # Question to answer
    question = gr.Textbox(
        label="Question",
        placeholder=""
    )
    response = gr.Textbox(
        label="Response",
        placeholder=""
    )
    with gr.Accordion("References & snippets", open=False):
        references = gr.HTML(label="References")
        snippets = gr.HTML(label="Snippets")

    # button
    with gr.Row():
        response_button = gr.Button("Submit", variant='primary')
        clear_question_button = gr.Button("Clear", variant='secondary')

    # Example questions given default provided PDF file
    with gr.Accordion("Sample questions", open=False):
        gr.Examples(
            [
                ["What is dividend stripping?",],
                ["What are the most common types of dividend stripping schemes?",],
                ["How do authorities detect dividend stripping?",],
                ["What are some indicators of potential dividend stripping?",],
                ["What are the consequences of dividend stripping?",],
                ["How can countries combat dividend stripping?",],
                ["What is the role of professional enablers in dividend stripping?",],
                ["How can countries address the role of professional enablers in dividend stripping?",],
            ],
            inputs=[question,],
            outputs=[response, references, snippets],
            fn=generate_response,
            cache_examples=False,
            label="Sample questions"
        )

    # Documentation
    with gr.Accordion("Documentation", open=False):
        gr.Markdown("""
        - What
            - Retrieval augmented generation (RAG) model based on ColBERT and DSPy.
            - Retrieval base model: 'antoinelouis/colbert-xm' (multilingual model)
            - Generation framework: DSPy and Mistral.
        - How
            - Upload PDF files to index.
            - Build the retrieval generation model (might take a few minutes)
            - Ask a question about the content of those uploaded documents.
        """)

    # Click actions
    build_button.click(
        fn=build_rag_model,
        inputs=[upload_files],
        outputs=[build_status]
    )
    response_button.click(
        fn=generate_response,
        inputs=[question],
        outputs=[response, references, snippets]
    )
    clear_question_button.click(
        fn=lambda: ('', '', '', ''),
        inputs=[],
        outputs=[question, response, references, snippets]
    )

    # Upon loading, index a default PDF file
    demo.load(on_load_build, outputs=[build_status])

demo.launch(show_api=False)