Upload 6 files
- chunking.py +68 -0
- document_processor.py +88 -0
- embedding.py +40 -0
- generation.py +57 -0
- main.py +149 -0
- retrieval.py +139 -0
chunking.py
ADDED
@@ -0,0 +1,68 @@
# file: chunking.py

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from typing import List
from unstructured.partition.md import partition_md
from unstructured.documents.elements import Header, Footer, PageBreak, Table, NarrativeText


# --- Configuration ---
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200

def process_and_chunk(raw_text: str) -> List[Document]:
    """
    Partitions raw text from a document processor using 'unstructured',
    interpreting it as markdown to preserve table structures,
    and then chunks the remaining text content.

    Args:
        raw_text: The raw string content of the document (expected to be markdown).

    Returns:
        A list of Document objects, including structured tables and chunked text.
    """
    if not raw_text:
        print("Warning: Input text for chunking is empty.")
        return []

    print(f"Processing raw text of length {len(raw_text)} with 'unstructured' markdown parser.")

    # partition_md uses unstructured's specialized markdown parser, which
    # correctly handles tables and other structures from the PyMuPDF output.
    elements = partition_md(text=raw_text)

    documents = []
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
        length_function=len,
        is_separator_regex=False,
    )

    for element in elements:
        # Skip structural elements that carry no useful content
        if isinstance(element, (Header, Footer, PageBreak)):
            continue
        # Keep tables whole, as HTML, so their structure survives chunking
        if isinstance(element, Table):
            table_html = element.metadata.text_as_html
            table_metadata = element.metadata.to_dict()
            table_metadata['content_type'] = 'table'
            documents.append(Document(page_content=table_html, metadata=table_metadata))
        # Split narrative text into overlapping chunks
        elif isinstance(element, NarrativeText):
            chunks = text_splitter.split_text(element.text)
            for chunk in chunks:
                chunk_metadata = element.metadata.to_dict()
                chunk_metadata['content_type'] = 'text'
                documents.append(Document(page_content=chunk, metadata=chunk_metadata))
        # Pass all other elements through unchanged
        else:
            general_metadata = element.metadata.to_dict()
            general_metadata['content_type'] = 'other'
            documents.append(Document(page_content=element.text, metadata=general_metadata))

    print(f"Created {len(documents)} documents (chunks and tables).")
    return documents
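A minimal sketch of how process_and_chunk could be exercised in isolation; the markdown string below is purely illustrative and not part of the project:

# sketch: feeding a hand-written markdown string through process_and_chunk
from chunking import process_and_chunk

sample_md = (
    "# Policy\n\n"
    "The grace period for premium payment is thirty days.\n\n"
    "| Plan | Limit |\n|------|-------|\n| A | 5000 |\n"
)
for doc in process_and_chunk(sample_md):
    print(doc.metadata.get("content_type"), len(doc.page_content))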
document_processor.py
ADDED
@@ -0,0 +1,88 @@
# file: document_processor.py

import os
import time
import httpx
from pathlib import Path
from urllib.parse import urlparse, unquote
from llama_index.readers.file import PyMuPDFReader
from llama_index.core import Document as LlamaDocument
from concurrent.futures import ThreadPoolExecutor, as_completed
from pydantic import HttpUrl
from typing import List

# Define the batch size for parallel processing
BATCH_SIZE = 25

def _process_page_batch(documents_batch: List[LlamaDocument]) -> str:
    """
    Helper function to extract content from a batch of LlamaIndex Document objects
    and join them into a single string.
    """
    return "\n\n".join([d.get_content() for d in documents_batch])

async def ingest_and_parse_document(doc_url: HttpUrl) -> str:
    """
    Asynchronously downloads a document, saves it locally, and parses it to
    Markdown text using PyMuPDFReader with parallel processing.

    Args:
        doc_url: The Pydantic-validated URL of the document to process.

    Returns:
        A single string containing the document's extracted text.
    """
    print(f"Initiating download from: {doc_url}")
    LOCAL_STORAGE_DIR = "data/"
    os.makedirs(LOCAL_STORAGE_DIR, exist_ok=True)

    try:
        # Asynchronously download the document
        async with httpx.AsyncClient() as client:
            response = await client.get(str(doc_url), timeout=30.0, follow_redirects=True)
            response.raise_for_status()
            doc_bytes = response.content
            print("Download successful.")

        # Determine a valid local filename
        parsed_path = urlparse(str(doc_url)).path
        filename = unquote(os.path.basename(parsed_path)) or "downloaded_document.pdf"
        local_file_path = Path(os.path.join(LOCAL_STORAGE_DIR, filename))

        # Save the document locally
        with open(local_file_path, "wb") as f:
            f.write(doc_bytes)
        print(f"Document saved locally at: {local_file_path}")

        # Parse the document using LlamaIndex's PyMuPDFReader
        print("Parsing document with PyMuPDFReader...")
        loader = PyMuPDFReader()
        docs_from_loader = loader.load_data(file_path=local_file_path)

        # Parallelize the extraction of text from loaded pages
        start_time = time.perf_counter()
        all_extracted_texts = []
        with ThreadPoolExecutor(max_workers=os.cpu_count() or 4) as executor:
            futures = [
                executor.submit(_process_page_batch, docs_from_loader[i:i + BATCH_SIZE])
                for i in range(0, len(docs_from_loader), BATCH_SIZE)
            ]
            for future in as_completed(futures):
                all_extracted_texts.append(future.result())

        doc_text = "\n\n".join(all_extracted_texts)
        elapsed_time = time.perf_counter() - start_time
        print(f"Time taken for parallel text extraction: {elapsed_time:.4f} seconds.")

        if not doc_text:
            raise ValueError("Document parsing yielded no content.")

        print(f"Parsing complete. Extracted {len(doc_text)} characters.")
        return doc_text

    except httpx.HTTPStatusError as e:
        print(f"Error downloading document: {e}")
        raise
    except Exception as e:
        print(f"An unexpected error occurred during document processing: {e}")
        raise
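For reference, a small sketch of calling the async parser outside FastAPI; the URL is a placeholder, and the function writes the downloaded file under data/:

# sketch: running ingest_and_parse_document from a plain script
import asyncio
from document_processor import ingest_and_parse_document

async def preview(url: str) -> None:
    text = await ingest_and_parse_document(url)
    print(text[:500])  # first 500 characters of the extracted text

asyncio.run(preview("https://example.com/sample-policy.pdf"))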
embedding.py
ADDED
@@ -0,0 +1,40 @@
# file: embedding.py

import torch
from sentence_transformers import SentenceTransformer
from typing import List

# --- Configuration ---
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"

class EmbeddingClient:
    """A client for generating text embeddings using a local sentence transformer model."""

    def __init__(self, model_name: str = EMBEDDING_MODEL_NAME):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model = SentenceTransformer(model_name, device=self.device)
        print(f"EmbeddingClient initialized with model '{model_name}' on device '{self.device}'.")

    def create_embeddings(self, texts: List[str]) -> torch.Tensor:
        """
        Generates embeddings for a list of text chunks.

        Args:
            texts: A list of strings to be embedded.

        Returns:
            A torch.Tensor containing the generated embeddings.
        """
        if not texts:
            return torch.tensor([])

        print(f"Generating embeddings for {len(texts)} text chunks on {self.device}...")
        try:
            embeddings = self.model.encode(
                texts, convert_to_tensor=True, show_progress_bar=False
            )
            print("Embeddings generated successfully.")
            return embeddings
        except Exception as e:
            print(f"An error occurred during embedding generation: {e}")
            raise
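A quick usage note: the client can be driven directly for sanity checks; all-MiniLM-L6-v2 returns 384-dimensional vectors, so the shape printed below should be (2, 384):

# sketch: embedding two short strings
from embedding import EmbeddingClient

client = EmbeddingClient()
vectors = client.create_embeddings([
    "grace period for premium payment",
    "waiting period for maternity benefits",
])
print(vectors.shape)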
generation.py
ADDED
@@ -0,0 +1,57 @@
# file: generation.py
from groq import AsyncGroq
from typing import List, Dict

# --- Configuration ---
GROQ_MODEL_NAME = "llama3-8b-8192"

async def generate_answer(query: str, context_chunks: List[Dict], groq_api_key: str) -> str:
    """
    Generates a final answer using the Groq API based on the query and retrieved context.

    Args:
        query: The user's original question.
        context_chunks: A list of the most relevant, reranked document chunks.
        groq_api_key: The API key for the Groq service.

    Returns:
        A string containing the generated answer.
    """
    if not groq_api_key:
        return "Error: Groq API key is not set."
    if not context_chunks:
        return "I do not have enough information to answer this question based on the provided document."

    print("Generating final answer with Groq...")
    client = AsyncGroq(api_key=groq_api_key)

    # Format the context for the prompt
    context_str = "\n\n---\n\n".join(
        [f"Context Chunk:\n{chunk['content']}" for chunk in context_chunks]
    )

    prompt = (
        "You are an expert Q&A system. Your task is to extract information with 100% accuracy from the provided text. Provide a brief and direct answer. "
        "Do not mention the context in your response. Answer *only* using the information from the provided document. "
        "Do not infer, add, or assume any information that is not explicitly written in the source text. If the answer is not in the document, state that the information is not available. "
        "When the question involves numbers, percentages, or monetary values, extract the exact figures from the text. "
        "Double-check that the value corresponds to the correct plan or condition mentioned in the question."
        "\n\n"
        f"CONTEXT:\n{context_str}\n\n"
        f"QUESTION:\n{query}\n\n"
        "ANSWER:"
    )

    try:
        chat_completion = await client.chat.completions.create(
            messages=[{"role": "user", "content": prompt}],
            model=GROQ_MODEL_NAME,
            temperature=0.2,  # Lower temperature for more factual answers
            max_tokens=500,
        )
        answer = chat_completion.choices[0].message.content
        print("Answer generated successfully.")
        return answer
    except Exception as e:
        print(f"An error occurred during Groq API call: {e}")
        return "Could not generate an answer due to an API error."
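A rough sketch of calling generate_answer on its own; the context chunk here is invented for illustration, and GROQ_API_KEY must be set in the environment:

# sketch: answering one question against a single hand-made context chunk
import asyncio
import os
from generation import generate_answer

chunks = [{"content": "The grace period for premium payment is thirty days."}]
answer = asyncio.run(
    generate_answer("What is the grace period?", chunks, os.environ.get("GROQ_API_KEY"))
)
print(answer)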
main.py
ADDED
@@ -0,0 +1,149 @@
# file: main.py
import time
import os
import asyncio
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel, HttpUrl
from typing import List, Dict, Any
from dotenv import load_dotenv

# Import functions and classes from the new modular files
from document_processor import ingest_and_parse_document
from chunking import process_and_chunk
from embedding import EmbeddingClient
from retrieval import Retriever, generate_hypothetical_document
from generation import generate_answer

load_dotenv()

# --- FastAPI App Initialization ---
app = FastAPI(
    title="Modular RAG API",
    description="A modular API for Retrieval-Augmented Generation from documents.",
    version="2.0.0",
)

# --- Global Clients and API Keys ---
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
embedding_client = EmbeddingClient()
retriever = Retriever(embedding_client=embedding_client)


# --- Pydantic Models ---
class RunRequest(BaseModel):
    document_url: HttpUrl
    questions: List[str]

class RunResponse(BaseModel):
    answers: List[str]

class TestRequest(BaseModel):
    document_url: HttpUrl


# --- Endpoints ---

# --- NEW: Test Endpoint for Parsing ---
@app.post("/test/parse", response_model=Dict[str, Any], tags=["Testing"])
async def test_parsing_endpoint(request: TestRequest):
    """
    Tests the document ingestion and parsing phase.
    Returns the full markdown content and the time taken.
    """
    print("--- Running Parsing Test ---")
    start_time = time.perf_counter()

    try:
        markdown_content = await ingest_and_parse_document(request.document_url)

        end_time = time.perf_counter()
        duration = end_time - start_time
        print(f"--- Parsing took {duration:.2f} seconds ---")

        return {
            "parsing_time_seconds": duration,
            "character_count": len(markdown_content),
            "content": markdown_content
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"An error occurred during parsing: {str(e)}")

@app.post("/hackrx/run", response_model=RunResponse)
async def run_rag_pipeline(request: RunRequest):
    """
    Runs the full RAG pipeline for a given document URL and a list of questions.
    """
    try:
        # --- STAGE 1 & 2: DOCUMENT INGESTION AND CHUNKING ---
        print("--- Kicking off RAG Pipeline ---")
        markdown_content = await ingest_and_parse_document(request.document_url)
        documents = process_and_chunk(markdown_content)

        if not documents:
            raise HTTPException(status_code=400, detail="Document could not be processed into chunks.")

        # --- STAGE 3: INDEXING (Embedding + BM25) ---
        # This step builds the search index for the current document.
        retriever.index(documents)

        # --- CONCURRENT WORKFLOW FOR ALL QUESTIONS ---

        # Step A: Concurrently generate hypothetical documents for all questions
        hyde_tasks = [generate_hypothetical_document(q, GROQ_API_KEY) for q in request.questions]
        all_hyde_docs = await asyncio.gather(*hyde_tasks)

        # Step B: Concurrently retrieve relevant chunks for all questions
        retrieval_tasks = [
            retriever.retrieve(q, hyde_doc)
            for q, hyde_doc in zip(request.questions, all_hyde_docs)
        ]
        all_retrieved_chunks = await asyncio.gather(*retrieval_tasks)

        # Step C: Concurrently generate final answers for all questions
        answer_tasks = [
            generate_answer(q, chunks, GROQ_API_KEY)
            for q, chunks in zip(request.questions, all_retrieved_chunks)
        ]
        final_answers = await asyncio.gather(*answer_tasks)

        print("--- RAG Pipeline Completed Successfully ---")
        return RunResponse(answers=final_answers)

    except HTTPException:
        # Propagate HTTP errors (e.g. the 400 above) without wrapping them as 500s
        raise
    except Exception as e:
        print(f"An unhandled error occurred in the pipeline: {e}")
        # Re-raise as a 500 error for the client
        raise HTTPException(
            status_code=500, detail=f"An internal server error occurred: {str(e)}"
        )

@app.post("/test/chunk", response_model=Dict[str, Any], tags=["Testing"])
async def test_chunking_endpoint(request: TestRequest):
    """
    Tests both the parsing and chunking phases together.
    Returns the final list of chunks and the total time taken.
    """
    print("--- Running Parsing and Chunking Test ---")
    start_time = time.perf_counter()

    try:
        # Step 1: Parse the document
        markdown_content = await ingest_and_parse_document(request.document_url)

        # Step 2: Chunk the parsed content
        documents = process_and_chunk(markdown_content)

        end_time = time.perf_counter()
        duration = end_time - start_time
        print(f"--- Parsing and Chunking took {duration:.2f} seconds ---")

        # Convert Document objects to a JSON-serializable list
        chunk_results = [
            {"page_content": doc.page_content, "metadata": doc.metadata}
            for doc in documents
        ]

        return {
            "total_time_seconds": duration,
            "chunk_count": len(chunk_results),
            "chunks": chunk_results
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"An error occurred during chunking: {str(e)}")
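Assuming the app is served locally (for example with uvicorn main:app --port 8000), a request to the pipeline endpoint might look like the following sketch; the document URL and question are placeholders:

# sketch: calling the /hackrx/run endpoint of a locally running server
import httpx

payload = {
    "document_url": "https://example.com/policy.pdf",
    "questions": ["What is the grace period for premium payment?"],
}
resp = httpx.post("http://localhost:8000/hackrx/run", json=payload, timeout=300.0)
print(resp.json()["answers"])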
retrieval.py
ADDED
@@ -0,0 +1,139 @@
# file: retrieval.py

import time
import asyncio
import numpy as np
import torch
from groq import AsyncGroq
from rank_bm25 import BM25Okapi
from sentence_transformers import CrossEncoder
from sklearn.preprocessing import MinMaxScaler
from torch.nn.functional import cosine_similarity
from typing import List, Dict, Tuple

from embedding import EmbeddingClient
from langchain_core.documents import Document

# --- Configuration ---
HYDE_MODEL = "llama3-8b-8192"
RERANKER_MODEL = 'cross-encoder/ms-marco-MiniLM-L6-v2'
INITIAL_K_CANDIDATES = 20
TOP_K_CHUNKS = 10

async def generate_hypothetical_document(query: str, groq_api_key: str) -> str:
    """Generates a hypothetical document (HyDE) to enhance search."""
    if not groq_api_key:
        print("Groq API key not set. Skipping HyDE generation.")
        return ""

    print(f"Starting HyDE generation for query: '{query}'...")
    client = AsyncGroq(api_key=groq_api_key)
    prompt = (
        f"Write a brief, formal passage that answers the following question. "
        f"Use specific terminology as if it were from a larger document. "
        f"Do not include the question or conversational text.\n\n"
        f"Question: {query}\n\n"
        f"Hypothetical Passage:"
    )

    try:
        chat_completion = await client.chat.completions.create(
            messages=[{"role": "user", "content": prompt}],
            model=HYDE_MODEL,
            temperature=0.7,
            max_tokens=500,
        )
        return chat_completion.choices[0].message.content
    except Exception as e:
        print(f"An error occurred during HyDE generation: {e}")
        return ""

class Retriever:
    """Manages hybrid search, combining BM25, dense search, and a reranker."""

    def __init__(self, embedding_client: EmbeddingClient):
        self.embedding_client = embedding_client
        self.reranker = CrossEncoder(RERANKER_MODEL, device=self.embedding_client.device)
        self.bm25 = None
        self.document_chunks = []
        self.chunk_embeddings = None
        print(f"Retriever initialized with reranker '{RERANKER_MODEL}'.")

    def index(self, documents: List[Document]):
        """Builds the search index from document chunks."""
        self.document_chunks = documents
        corpus = [doc.page_content for doc in documents]
        if not corpus:
            print("No documents to index.")
            return

        print("Indexing documents for retrieval...")
        # 1. Initialize BM25 model
        tokenized_corpus = [doc.split(" ") for doc in corpus]
        self.bm25 = BM25Okapi(tokenized_corpus)
        # 2. Compute and store dense embeddings
        self.chunk_embeddings = self.embedding_client.create_embeddings(corpus)
        print("Indexing complete.")

    def _hybrid_search(self, query: str, hyde_doc: str) -> List[Tuple[int, float]]:
        """Performs the initial hybrid search to get candidate chunks."""
        if self.bm25 is None or self.chunk_embeddings is None:
            raise ValueError("Retriever has not been indexed. Call index() first.")

        # Enhance query with hypothetical document
        enhanced_query = f"{query}\n\n{hyde_doc}" if hyde_doc else query

        # BM25 (keyword) search
        tokenized_query = query.split(" ")
        bm25_scores = self.bm25.get_scores(tokenized_query)

        # Dense (semantic) search
        query_embedding = self.embedding_client.create_embeddings([enhanced_query])
        dense_scores = cosine_similarity(query_embedding, self.chunk_embeddings).cpu().numpy().flatten()

        # Normalize and combine scores
        scaler = MinMaxScaler()
        norm_bm25 = scaler.fit_transform(bm25_scores.reshape(-1, 1)).flatten()
        norm_dense = scaler.fit_transform(dense_scores.reshape(-1, 1)).flatten()
        combined_scores = 0.5 * norm_bm25 + 0.5 * norm_dense

        # Get top initial candidates
        top_indices = np.argsort(combined_scores)[::-1][:INITIAL_K_CANDIDATES]
        return [(idx, combined_scores[idx]) for idx in top_indices]

    async def _rerank(self, query: str, candidates: List[dict]) -> List[dict]:
        """Reranks the candidate chunks using a CrossEncoder model."""
        if not candidates:
            return []

        print(f"Reranking {len(candidates)} candidates...")
        rerank_input = [[query, chunk["content"]] for chunk in candidates]

        # Run synchronous prediction in a separate thread
        rerank_scores = await asyncio.to_thread(
            self.reranker.predict, rerank_input, show_progress_bar=False
        )

        # Combine candidates with their new scores and sort
        for candidate, score in zip(candidates, rerank_scores):
            candidate['rerank_score'] = score

        candidates.sort(key=lambda x: x['rerank_score'], reverse=True)
        return candidates[:TOP_K_CHUNKS]

    async def retrieve(self, query: str, hyde_doc: str) -> List[Dict]:
        """Executes the full retrieval pipeline: hybrid search followed by reranking."""
        print(f"Retrieving documents for query: '{query}'")
        # 1. Get initial candidates from hybrid search
        initial_candidates_info = self._hybrid_search(query, hyde_doc)

        retrieved_candidates = [{
            "content": self.document_chunks[idx].page_content,
            "metadata": self.document_chunks[idx].metadata,
            "initial_score": score
        } for idx, score in initial_candidates_info]

        # 2. Rerank the candidates to get the final list
        final_chunks = await self._rerank(query, retrieved_candidates)
        print(f"Retrieved and reranked {len(final_chunks)} final chunks.")
        return final_chunks
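Finally, a small sketch of driving the retriever directly, skipping HyDE by passing an empty hypothetical document; the two chunks are invented for illustration, and the embedding and reranker models are downloaded on first use:

# sketch: indexing two chunks and retrieving against them without HyDE
import asyncio
from langchain_core.documents import Document
from embedding import EmbeddingClient
from retrieval import Retriever

docs = [
    Document(page_content="The grace period for premium payment is thirty days.", metadata={}),
    Document(page_content="Maternity benefits carry a waiting period of 24 months.", metadata={}),
]
retriever = Retriever(embedding_client=EmbeddingClient())
retriever.index(docs)
top_chunks = asyncio.run(retriever.retrieve("How long is the grace period?", hyde_doc=""))
print([c["content"] for c in top_chunks])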