drwlf committed · Commit 01bc500 · 1 parent: 5a347d8

Add medical PDF ingestion Gradio app with RAG capabilities

Files changed:
- README.md +32 -10
- app.py +99 -0
- ingest.py +477 -0
- requirements.txt +11 -0
README.md
CHANGED
@@ -1,12 +1,34 @@
# PDF Ingest and Query System

This Gradio Space provides a powerful PDF ingestion and querying interface for building a searchable document library.

## Features

- **PDF Upload & Ingestion**: Upload PDF files and extract text and images using unstructured.io
- **Intelligent Chunking**: Automatically chunks documents for optimal retrieval
- **Vector Embeddings**: Uses the BAAI/bge-m3 model for high-quality text embeddings
- **Image Processing**: Extracts and embeds images using CLIP models
- **Deduplication**: Prevents duplicate ingestion of the same files
- **Semantic Search**: Query your document library using natural language

## Usage

1. **Upload PDFs**: Use the file upload interface to add PDF documents to your library
2. **Ingest Documents**: Click "Ingest PDFs" to process and add them to the vector database
3. **Query Library**: Use natural language queries to search through your ingested documents

## Technical Details

- **Vector Database**: ChromaDB for efficient similarity search
- **Text Embeddings**: BAAI/bge-m3 (1024-dimensional)
- **Image Embeddings**: CLIP ViT-B/32 (512-dimensional)
- **PDF Processing**: unstructured.io for robust document parsing
- **UI Framework**: Gradio for interactive web interface

## Requirements

This space requires significant computational resources for embedding generation and may take time to process large documents.

---

Built with ❤️ using Hugging Face Transformers, ChromaDB, and Gradio.
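This commit adds ingestion only; the semantic search described above is not implemented in the files below. A minimal sketch of such a query, assuming the DB_PATH and COLLECTION_NAME defaults from ingest.py and the same BAAI/bge-m3 text model (the query string is only an illustration):

# Hypothetical query script, not part of this commit.
import chromadb
from sentence_transformers import SentenceTransformer

DB_PATH = "/home/tony/chromadb"        # matches ingest.py
COLLECTION_NAME = "medical_library"    # matches ingest.py

# Use the same embedding model that ingest.py applies to text chunks
text_model = SentenceTransformer("BAAI/bge-m3")

client = chromadb.PersistentClient(path=DB_PATH)
collection = client.get_collection(name=COLLECTION_NAME)

query = "first-line treatment for community-acquired pneumonia"
query_embedding = text_model.encode(query).tolist()

# Retrieve the five most similar chunks along with their source metadata
results = collection.query(query_embeddings=[query_embedding], n_results=5)
for doc, meta in zip(results["documents"][0], results["metadatas"][0]):
    print(f"[{meta['source_file']} p.{meta['page_number']}] {doc[:120]}")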
app.py
ADDED
@@ -0,0 +1,99 @@
import gradio as gr
import os
import subprocess
import shutil
from pathlib import Path
import time

# Function to handle file upload and ingestion
def upload_and_ingest(uploaded_file):
    if uploaded_file is None:
        return "No file uploaded."

    try:
        # Create the pdf_docs directory if it doesn't exist
        pdf_docs_dir = "/home/tony/pdf_docs"
        os.makedirs(pdf_docs_dir, exist_ok=True)

        # Copy uploaded file to pdf_docs directory
        filename = os.path.basename(uploaded_file.name)
        file_path = os.path.join(pdf_docs_dir, filename)
        shutil.copy2(uploaded_file.name, file_path)

        # Run the ingestion script and capture output
        result = subprocess.run(
            ["python", "/home/tony/ingest.py"],
            cwd="/home/tony",
            capture_output=True,
            text=True
        )

        if result.returncode == 0:
            return f"✅ File '{filename}' uploaded and ingested successfully!\n\nIngestion Output:\n{result.stdout}"
        else:
            return f"❌ Error during ingestion:\n{result.stderr}\n\nStdout:\n{result.stdout}"

    except Exception as e:
        return f"❌ Error: {str(e)}"

# Function to handle Google Drive folder link (placeholder for now)
def link_gdrive_folder(folder_link):
    if not folder_link or not folder_link.strip():
        return "Please provide a Google Drive folder link."

    # TODO: Implement Google Drive integration
    return f"🚧 Google Drive integration coming soon!\nFolder link: {folder_link}"

# Create Gradio Interface
with gr.Blocks(title="PDF Ingestion Tool", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 📚 PDF Ingestion Tool")
    gr.Markdown("Upload PDF files or link Google Drive folders to ingest into the medical knowledge base.")

    with gr.Tab("File Upload"):
        with gr.Row():
            file_input = gr.File(
                label="Upload PDF File",
                file_types=[".pdf"],
                type="filepath"
            )

        upload_btn = gr.Button("Upload & Ingest", variant="primary")
        upload_output = gr.Textbox(
            label="Ingestion Status",
            lines=10,
            max_lines=20,
            show_copy_button=True
        )

        upload_btn.click(
            fn=upload_and_ingest,
            inputs=[file_input],
            outputs=[upload_output],
            show_progress=True
        )

    with gr.Tab("Google Drive"):
        with gr.Row():
            gdrive_input = gr.Textbox(
                label="Google Drive Folder Link",
                placeholder="https://drive.google.com/drive/folders/...",
                lines=1
            )

        gdrive_btn = gr.Button("Link & Ingest", variant="primary")
        gdrive_output = gr.Textbox(
            label="Status",
            lines=10,
            max_lines=20,
            show_copy_button=True
        )

        gdrive_btn.click(
            fn=link_gdrive_folder,
            inputs=[gdrive_input],
            outputs=[gdrive_output],
            show_progress=True
        )

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860, share=True)
ingest.py
ADDED
@@ -0,0 +1,477 @@
#!/usr/bin/env python3
"""
PDF Document Ingestion Script

This script processes complex PDF documents (like medical textbooks), extracts text and images,
chunks them intelligently, generates vector embeddings using state-of-the-art local models,
and stores them in a local ChromaDB vector database.

Author: Expert Python Developer
Python Version: 3.9+
"""

import os
import uuid
import hashlib
from pathlib import Path
from typing import List, Dict, Any, Optional, Tuple
import logging

# Third-party imports
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
from unstructured.partition.pdf import partition_pdf
from PIL import Image
import io

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# =============================================================================
# CONFIGURATION SECTION
# =============================================================================

# Input/Output Paths
SOURCE_DIRECTORY = "/home/tony/pdf_docs"  # Directory containing PDF files to process
DB_PATH = "/home/tony/chromadb"  # Path for persistent ChromaDB database
IMAGE_OUTPUT_DIRECTORY = "/home/tony/extracted_images"  # Path for storing extracted images

# Model Configuration
TEXT_EMBEDDING_MODEL = "BAAI/bge-m3"  # State-of-the-art text embedding model
IMAGE_EMBEDDING_MODEL = "clip-ViT-B-32"  # CLIP model for image embeddings

# Database Configuration
COLLECTION_NAME = "medical_library"  # ChromaDB collection name

# Processing Configuration
BATCH_SIZE = 100  # Number of chunks to process in each batch
MAX_CHUNK_SIZE = 1000  # Maximum characters per text chunk

# =============================================================================
# INITIALIZATION FUNCTIONS
# =============================================================================

def initialize_chromadb() -> Tuple[chromadb.Client, chromadb.Collection]:
    """
    Initialize and return the ChromaDB client and collection.

    Returns:
        Tuple[chromadb.Client, chromadb.Collection]: The client and collection objects
    """
    try:
        # Ensure database directory exists
        os.makedirs(DB_PATH, exist_ok=True)

        # Initialize ChromaDB client with persistent storage
        client = chromadb.PersistentClient(
            path=DB_PATH,
            settings=Settings(
                anonymized_telemetry=False,
                allow_reset=True
            )
        )

        # Get or create collection
        try:
            collection = client.get_collection(name=COLLECTION_NAME)
            logger.info(f"Using existing collection: {COLLECTION_NAME}")
        except chromadb.errors.NotFoundError:
            collection = client.create_collection(
                name=COLLECTION_NAME,
                metadata={"description": "Medical textbook PDF content with embeddings"}
            )
            logger.info(f"Created new collection: {COLLECTION_NAME}")

        return client, collection

    except Exception as e:
        logger.error(f"Failed to initialize ChromaDB: {e}")
        raise


def initialize_models() -> Tuple[SentenceTransformer, SentenceTransformer]:
    """
    Load and return the text and image embedding models.

    Returns:
        Tuple[SentenceTransformer, SentenceTransformer]: Text and image models
    """
    try:
        logger.info("Loading text embedding model...")
        text_model = SentenceTransformer(TEXT_EMBEDDING_MODEL)

        logger.info("Loading image embedding model...")
        image_model = SentenceTransformer(IMAGE_EMBEDDING_MODEL)

        logger.info("Models loaded successfully!")
        return text_model, image_model

    except Exception as e:
        logger.error(f"Failed to load models: {e}")
        raise


def ensure_directories() -> None:
    """
    Ensure all required directories exist.
    """
    try:
        os.makedirs(SOURCE_DIRECTORY, exist_ok=True)
        os.makedirs(IMAGE_OUTPUT_DIRECTORY, exist_ok=True)
        os.makedirs(DB_PATH, exist_ok=True)
        logger.info("All directories verified/created successfully")

    except Exception as e:
        logger.error(f"Failed to create directories: {e}")
        raise


# =============================================================================
# DEDUPLICATION FUNCTIONS
# =============================================================================

def calculate_file_hash(file_path: str) -> str:
    """
    Calculate SHA-256 hash of a file for deduplication.

    Args:
        file_path (str): Path to the file

    Returns:
        str: SHA-256 hash of the file
    """
    hash_sha256 = hashlib.sha256()
    with open(file_path, "rb") as f:
        for chunk in iter(lambda: f.read(4096), b""):
            hash_sha256.update(chunk)
    return hash_sha256.hexdigest()


def is_pdf_already_processed(pdf_path: str, collection: chromadb.Collection) -> bool:
    """
    Check if a PDF has already been processed by checking its hash in the database.

    Args:
        pdf_path (str): Path to the PDF file
        collection (chromadb.Collection): ChromaDB collection

    Returns:
        bool: True if already processed, False otherwise
    """
    try:
        file_hash = calculate_file_hash(pdf_path)

        # Query the collection for any document with this file hash
        result = collection.get(where={"file_hash": file_hash}, limit=1)
        if len(result['ids']) > 0:
            pdf_filename = Path(pdf_path).name
            logger.info(f"PDF {pdf_filename} already processed (hash: {file_hash[:12]}...). Skipping.")
            return True
        return False
    except Exception as e:
        logger.warning(f"Error checking if PDF is already processed: {e}")
        return False


# =============================================================================
# DOCUMENT PROCESSING FUNCTIONS
# =============================================================================

def process_pdf(
    pdf_path: str,
    text_model: SentenceTransformer,
    image_model: SentenceTransformer,
    collection: chromadb.Collection
) -> None:
    """
    Process a single PDF file and store chunks in ChromaDB.

    Args:
        pdf_path (str): Path to the PDF file
        text_model (SentenceTransformer): Text embedding model
        image_model (SentenceTransformer): Image embedding model
        collection (chromadb.Collection): ChromaDB collection
    """
    try:
        pdf_filename = Path(pdf_path).name
        logger.info(f"Processing PDF: {pdf_filename}")

        # Calculate file hash for deduplication
        file_hash = calculate_file_hash(pdf_path)

        # Parse PDF with unstructured
        elements = partition_pdf(
            filename=pdf_path,
            strategy="hi_res",
            extract_images_in_pdf=True,
            infer_table_structure=True
        )

        if not elements:
            logger.warning(f"No elements extracted from {pdf_filename}")
            return

        # Generate chunks from elements
        chunks = create_chunks_from_elements(elements, pdf_filename, file_hash)

        if not chunks:
            logger.warning(f"No chunks created from {pdf_filename}")
            return

        # Process chunks in batches
        process_chunks_in_batches(chunks, text_model, image_model, collection)

        logger.info(f"Successfully processed {pdf_filename}: {len(chunks)} chunks")

    except Exception as e:
        logger.error(f"Error processing PDF {pdf_path}: {e}")
        raise


def create_chunks_from_elements(elements: List, pdf_filename: str, file_hash: str) -> List[Dict[str, Any]]:
    """
    Create chunks from unstructured elements (let unstructured handle the intelligent parsing).

    Args:
        elements (List): List of unstructured elements
        pdf_filename (str): Name of the source PDF file
        file_hash (str): SHA-256 hash of the PDF file for deduplication

    Returns:
        List[Dict[str, Any]]: List of chunk dictionaries
    """
    chunks = []

    for i, element in enumerate(elements):
        try:
            element_type = element.category
            page_number = getattr(element.metadata, 'page_number', 1)

            # Handle image elements
            if element_type == "Image" and hasattr(element, 'image_bytes'):
                # Save image and create image chunk
                image_path = save_image(element.image_bytes, pdf_filename, i)
                if image_path:
                    chunks.append({
                        'id': f"{pdf_filename}_img_{i}",
                        'content': image_path,
                        'type': 'image',
                        'metadata': {
                            'source_file': pdf_filename,
                            'page_number': page_number,
                            'element_type': element_type,
                            'image_path': image_path,
                            'file_hash': file_hash
                        }
                    })

            # Handle all text elements as individual chunks (unstructured already did the intelligent parsing)
            else:
                text_content = str(element).strip()
                if text_content and len(text_content) > 20:  # Skip very short fragments
                    chunks.append({
                        'id': f"{pdf_filename}_text_{i}",
                        'content': text_content,
                        'type': 'text',
                        'metadata': {
                            'source_file': pdf_filename,
                            'page_number': page_number,
                            'element_type': element_type,
                            'file_hash': file_hash
                        }
                    })

        except Exception as e:
            logger.warning(f"Error processing element {i}: {e}")
            continue

    return chunks


def save_image(image_bytes: bytes, pdf_filename: str, chunk_index: int) -> Optional[str]:
    """
    Save image bytes to file and return the path.

    Args:
        image_bytes (bytes): Raw image data
        pdf_filename (str): Source PDF filename
        chunk_index (int): Index of the chunk

    Returns:
        Optional[str]: Path to saved image or None if failed
    """
    try:
        # Create unique filename
        image_filename = f"{Path(pdf_filename).stem}_{chunk_index}_{uuid.uuid4().hex[:8]}.png"
        image_path = os.path.join(IMAGE_OUTPUT_DIRECTORY, image_filename)

        # Convert and save image
        image = Image.open(io.BytesIO(image_bytes))
        image.save(image_path, format='PNG')

        return image_path

    except Exception as e:
        logger.warning(f"Failed to save image: {e}")
        return None


def process_chunks_in_batches(
    chunks: List[Dict[str, Any]],
    text_model: SentenceTransformer,
    image_model: SentenceTransformer,
    collection: chromadb.Collection
) -> None:
    """
    Process chunks in batches and store in ChromaDB.

    Args:
        chunks (List[Dict[str, Any]]): List of chunks to process
        text_model (SentenceTransformer): Text embedding model
        image_model (SentenceTransformer): Image embedding model
        collection (chromadb.Collection): ChromaDB collection
    """
    for i in range(0, len(chunks), BATCH_SIZE):
        batch = chunks[i:i + BATCH_SIZE]

        try:
            process_batch(batch, text_model, image_model, collection)
        except Exception as e:
            logger.error(f"Error processing batch {i//BATCH_SIZE + 1}: {e}")
            # Continue with next batch instead of failing completely
            continue


def process_batch(
    batch: List[Dict[str, Any]],
    text_model: SentenceTransformer,
    image_model: SentenceTransformer,
    collection: chromadb.Collection
) -> None:
    """
    Process a single batch of chunks.

    Args:
        batch (List[Dict[str, Any]]): Batch of chunks to process
        text_model (SentenceTransformer): Text embedding model
        image_model (SentenceTransformer): Image embedding model
        collection (chromadb.Collection): ChromaDB collection
    """
    ids = []
    embeddings = []
    metadatas = []
    documents = []

    for chunk in batch:
        try:
            chunk_id = chunk['id']
            content = chunk['content']
            chunk_type = chunk['type']
            metadata = chunk['metadata']

            # Generate embedding based on type
            if chunk_type == 'text':
                embedding = text_model.encode(content).tolist()
                document = content
            elif chunk_type == 'image':
                # For images, encode the image file
                if os.path.exists(content):
                    embedding = image_model.encode(Image.open(content)).tolist()
                    document = f"Image from {metadata['source_file']} page {metadata['page_number']}"
                else:
                    logger.warning(f"Image file not found: {content}")
                    continue
            else:
                logger.warning(f"Unknown chunk type: {chunk_type}")
                continue

            ids.append(chunk_id)
            embeddings.append(embedding)
            metadatas.append(metadata)
            documents.append(document)

        except Exception as e:
            logger.warning(f"Error processing chunk {chunk.get('id', 'unknown')}: {e}")
            continue

    # Add batch to collection
    if ids:
        try:
            collection.add(
                ids=ids,
                embeddings=embeddings,
                metadatas=metadatas,
                documents=documents
            )
            logger.debug(f"Added batch of {len(ids)} chunks to database")
        except Exception as e:
            logger.error(f"Error adding batch to database: {e}")
            raise


# =============================================================================
# MAIN EXECUTION
# =============================================================================

def main():
    """
    Main execution function.
    """
    try:
        logger.info("Starting PDF ingestion process...")

        # Ensure directories exist
        ensure_directories()

        # Initialize models and database
        logger.info("Initializing models and database...")
        text_model, image_model = initialize_models()
        client, collection = initialize_chromadb()

        # Get list of PDF files
        pdf_files = []
        if os.path.exists(SOURCE_DIRECTORY):
            pdf_files = [f for f in os.listdir(SOURCE_DIRECTORY) if f.lower().endswith('.pdf')]

        if not pdf_files:
            logger.warning(f"No PDF files found in {SOURCE_DIRECTORY}")
            logger.info("Please add PDF files to the source directory and run again.")
            return

        logger.info(f"Found {len(pdf_files)} PDF files to process")

        # Process each PDF file with progress bar
        with tqdm(pdf_files, desc="Processing PDFs") as pbar:
            for pdf_file in pbar:
                pdf_path = os.path.join(SOURCE_DIRECTORY, pdf_file)
                pbar.set_description(f"Processing {pdf_file}")

                # Check if this PDF has already been processed
                if is_pdf_already_processed(pdf_path, collection):
                    continue

                try:
                    process_pdf(pdf_path, text_model, image_model, collection)
                except Exception as e:
                    logger.error(f"Failed to process {pdf_file}: {e}")
                    continue

        # Get final statistics
        try:
            count = collection.count()
            logger.info(f"Ingestion complete! Total chunks in database: {count}")
        except Exception as e:
            logger.warning(f"Could not get final count: {e}")

        logger.info("PDF ingestion process completed successfully!")

    except Exception as e:
        logger.error(f"Fatal error in main execution: {e}")
        raise


if __name__ == "__main__":
    main()
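Note that the configuration section above hardcodes paths under /home/tony, which typically does not exist inside a Hugging Face Space container. One possible adaptation, sketched here with hypothetical environment variable names rather than anything defined in this commit, would read the paths from the environment and fall back to the current defaults:

# Hypothetical alternative to the hardcoded configuration above; the variable
# names PDF_SOURCE_DIR, CHROMA_DB_PATH, and IMAGE_OUTPUT_DIR are illustrative.
import os

SOURCE_DIRECTORY = os.environ.get("PDF_SOURCE_DIR", "/home/tony/pdf_docs")
DB_PATH = os.environ.get("CHROMA_DB_PATH", "/home/tony/chromadb")
IMAGE_OUTPUT_DIRECTORY = os.environ.get("IMAGE_OUTPUT_DIR", "/home/tony/extracted_images")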
requirements.txt
ADDED
@@ -0,0 +1,11 @@
gradio==5.39.0
transformers==4.49.1
torch>=2.0.0
chromadb==0.5.2
sentence-transformers==3.4.0
unstructured[all-docs]==0.18.5
pillow>=10.0.0
numpy>=1.24.0
pandas>=2.0.0
tqdm>=4.65.0
clip-by-openai