drakosfire committed · Commit 824bd32 · 1 Parent(s): ed70805

Reorganized the project into a document_processing module, executed through a single entry file, pdf_to_embeddings.py.
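For orientation, a minimal sketch of how the reorganized module is meant to be used (illustrative only; the paths below are placeholders, and the normal entry point is simply running pdf_to_embeddings.py):

# Illustrative sketch, not part of the commit: the helpers the entry script wires
# together can also be imported directly from the new package.
from document_processing.pdf_processor import process_pdf_file
from document_processing.summarizer import summarize_page, summarize_document
from document_processing.embedding_generator import process_document

result = process_pdf_file("./pdfs/example.pdf")                  # placeholder path
# ...page/document summaries and the enhanced JSON are produced in between...
df = process_document("./output/example_enhanced_output.json")   # placeholder path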

document_processing/__init__.py ADDED
File without changes
document_processing/document_converter.py ADDED
File without changes
document_processing/embedding_generator.py ADDED
@@ -0,0 +1,124 @@
+ import json
+ import pandas as pd
+ from docling_core.types.doc.document import DoclingDocument, DocItemLabel
+ from langchain_huggingface import HuggingFaceEmbeddings
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
+
+
+ def process_document(json_path: str, chunk_size: int = 2000, overlap: int = 500):
+     """Process document maintaining document structure and hierarchy."""
+     with open(json_path, 'r') as f:
+         data = json.load(f)
+     doc = DoclingDocument(**data)
+
+     chunks_with_metadata = []
+     current_headers = {}  # page_no -> current header text
+
+     # Add document summary if available
+     if 'document_summary' in data:
+         chunks_with_metadata.append({
+             'content': f"Document Summary:\n{data['document_summary']}",
+             'page': 0,  # Use 0 for document-level content
+             'content_type': 'summary',
+             'document_name': doc.name if hasattr(doc, 'name') else '',
+         })
+
+     # Process document by page
+     for page_no in doc.pages:
+         page = doc.pages[page_no]
+         current_header = None
+
+         # Add page summary if available
+         if hasattr(page, 'summary'):
+             chunks_with_metadata.append({
+                 'content': f"Page {page_no} Summary:\n{page.summary}",
+                 'page': page_no,
+                 'content_type': 'page_summary',
+                 'document_name': doc.name if hasattr(doc, 'name') else '',
+             })
+
+         # Process page content
+         page_items = list(doc.iterate_items(page_no=page_no))
+
+         for item, _ in page_items:
+             # Create base metadata
+             metadata = {
+                 'page': page_no,
+                 'current_section': current_header,
+                 'content_type': item.label.value,
+                 'document_name': doc.name if hasattr(doc, 'name') else '',
+             }
+
+             # Track section headers
+             if item.label == DocItemLabel.SECTION_HEADER:
+                 current_header = item.text
+                 current_headers[page_no] = item.text
+
+                 # Convert header to markdown
+                 md_content = f"# {item.text}\n"
+
+             # Handle regular text
+             elif item.label == DocItemLabel.TEXT:
+                 # Convert text to markdown with context
+                 md_content = ""
+                 if current_header:
+                     md_content += f"Context: {current_header}\n\n"
+                 md_content += f"{item.text}\n"
+
+             else:
+                 # Skip page headers/footers and other non-content elements
+                 continue
+
+             # Add provenance data if available
+             if hasattr(item, 'prov') and item.prov:
+                 metadata['bbox'] = item.prov[0].bbox.as_tuple()
+
+             # Split into chunks
+             text_splitter = RecursiveCharacterTextSplitter(
+                 chunk_size=chunk_size,
+                 chunk_overlap=overlap,
+                 separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""]
+             )
+
+             chunks = text_splitter.split_text(md_content)
+
+             # Store chunks with metadata
+             for chunk in chunks:
+                 chunk_metadata = {
+                     'content': chunk,
+                     **metadata,
+                     'original_text': item.orig,
+                 }
+                 chunks_with_metadata.append(chunk_metadata)
+
+     return create_dataframe(chunks_with_metadata)
+
+
+ def create_dataframe(chunks_with_metadata):
+     """Create DataFrame with content and available metadata."""
+     # Add index to chunks
+     for i, chunk in enumerate(chunks_with_metadata):
+         chunk['chunk_index'] = i
+
+     # Get content in a specific order
+     contents = [c['content'] for c in chunks_with_metadata]
+
+     # Create embeddings
+     embeddings_model = HuggingFaceEmbeddings(model_name="BAAI/bge-m3")
+     embeddings = embeddings_model.embed_documents(contents)
+
+     # Create DataFrame with index verification and safe access to optional fields
+     df = pd.DataFrame({
+         'chunk_index': [c['chunk_index'] for c in chunks_with_metadata],
+         'content': contents,
+         'embedding': [json.dumps(e) for e in embeddings],
+         'page': [c.get('page', None) for c in chunks_with_metadata],
+         'section': [c.get('current_section', '') for c in chunks_with_metadata],
+         'content_type': [c.get('content_type', '') for c in chunks_with_metadata],
+         'original_text': [c.get('original_text', '') for c in chunks_with_metadata],
+         'bbox': [c.get('bbox', None) for c in chunks_with_metadata],
+     })
+
+     # Verify alignment
+     assert all(df['chunk_index'] == range(len(df))), "Chunk order mismatch!"
+
+     return df
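A hedged usage sketch for process_document (the path is a placeholder; in this commit the enhanced JSON path is built by pdf_to_embeddings.py). Note that embeddings are stored as JSON-encoded lists so they survive the CSV round trip.

# Sketch only: chunk an enhanced Docling JSON and embed it with BAAI/bge-m3.
from document_processing.embedding_generator import process_document

df = process_document("./output/example_enhanced_output.json", chunk_size=2000, overlap=500)
print(df[['chunk_index', 'page', 'content_type']].head())
df.to_csv("./output/example_embeddings.csv", index=False)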
document_processing/json_utils.py ADDED
@@ -0,0 +1,55 @@
+ import json
+
+ def save_to_file(content, filename):
+     with open(filename, 'w', encoding='utf-8') as f:
+         f.write(content)
+
+ def load_json(file_path):
+     with open(file_path, 'r', encoding='utf-8') as f:
+         return json.load(f)
+
+ def extract_text_by_page(data):
+     """Extract text while preserving original page structure."""
+     pages = {}
+
+     # Initialize pages structure
+     for page_no, page_data in data.get('pages', {}).items():
+         pages[page_no] = {
+             'size': page_data['size'],
+             'page_no': page_data['page_no'],
+             'text_entries': {},
+             'image': page_data.get('image', None)
+         }
+
+     # Directly process texts array
+     for text_item in data.get('texts', []):
+         if 'text' in text_item and 'prov' in text_item and text_item['prov']:
+             page_number = str(text_item['prov'][0]['page_no'])
+             if page_number in pages:
+                 entry_number = len(pages[page_number]['text_entries']) + 1
+                 pages[page_number]['text_entries'][f"entry_{entry_number}"] = {
+                     'text': text_item['text'],
+                     'label': text_item.get('label', ''),
+                     'level': text_item.get('level', None)
+                 }
+
+     return pages
+
+ def save_enhanced_json(data, pages, output_file):
+     """Save enhanced JSON while preserving original structure."""
+     # Create a deep copy to avoid modifying the original
+     enhanced_data = json.loads(json.dumps(data))
+
+     # Add text entries and summaries to pages
+     for page_no, page_data in pages.items():
+         if str(page_no) in enhanced_data['pages']:
+             enhanced_data['pages'][str(page_no)]['text_entries'] = page_data['text_entries']
+             if 'summary' in page_data:
+                 enhanced_data['pages'][str(page_no)]['summary'] = page_data['summary']
+
+     # Add document summary
+     if 'document_summary' in data:
+         enhanced_data['document_summary'] = data['document_summary']
+
+     with open(output_file, 'w', encoding='utf-8') as f:
+         json.dump(enhanced_data, f, indent=2, ensure_ascii=False)
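A brief sketch of the intended round trip through these helpers (paths are placeholders; page summaries are attached elsewhere in the pipeline before saving):

# Sketch only: load Docling JSON, regroup text by page, then write an enhanced copy.
from document_processing.json_utils import load_json, extract_text_by_page, save_enhanced_json

data = load_json("./output/example_output.json")         # placeholder path
pages = extract_text_by_page(data)
# pages[page_no]['summary'] would be filled in by the summarizer before saving
save_enhanced_json(data, pages, "./output/example_enhanced_output.json")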
document_processing/pdf_processor.py ADDED
@@ -0,0 +1,28 @@
+ from docling.document_converter import DocumentConverter, PdfFormatOption
+ from docling.datamodel.base_models import InputFormat
+ from docling.datamodel.pipeline_options import PdfPipelineOptions
+ from document_processing.utilities import get_file_name_without_ext
+
+ def document_converter():
+     pipeline_options = PdfPipelineOptions()
+     pipeline_options.do_ocr = False
+     pipeline_options.do_table_structure = True
+
+     return DocumentConverter(
+         format_options={
+             InputFormat.PDF: PdfFormatOption(
+                 pipeline_options=pipeline_options
+             )
+         }
+     )
+
+ def process_pdf(pdf_path):
+     converter = document_converter()
+     result = converter.convert(pdf_path)
+     return result
+
+ def process_pdf_file(pdf_path):
+     file_name = get_file_name_without_ext(pdf_path)
+     result = process_pdf(pdf_path)
+     return result
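A short usage sketch (placeholder path); export_to_dict() is the same export the entry script relies on:

# Sketch only: convert a PDF with the configured pipeline (OCR off, table structure on).
from document_processing.pdf_processor import process_pdf_file

result = process_pdf_file("./pdfs/example.pdf")           # placeholder path
doc_dict = result.document.export_to_dict()               # dict form used downstream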
document_processing/pdf_utils.py ADDED
@@ -0,0 +1,46 @@
+ from PyPDF2 import PdfReader
+ import os
+ import math
+
+ def estimate_conversion_time(num_pages):
+     """Estimate the total conversion time based on ~5 seconds per page"""
+     return num_pages * 5
+
+ def format_time(seconds):
+     """Convert seconds to a human-readable format"""
+     minutes = math.floor(seconds / 60)
+     remaining_seconds = seconds % 60
+     return f"{minutes} minutes and {remaining_seconds:.0f} seconds"
+
+ def check_pdf(file_path):
+     if not os.path.exists(file_path):
+         print(f"Error: File '{file_path}' does not exist.")
+         return False
+
+     try:
+         with open(file_path, 'rb') as file:
+             PdfReader(file)
+         print(f"PDF '{file_path}' can be opened successfully.")
+         return True
+     except Exception as e:
+         print(f"Error opening PDF '{file_path}': {str(e)}")
+         return False
+
+ def check_pdf_details(file_path):
+     try:
+         with open(file_path, 'rb') as file:
+             pdf = PdfReader(file)
+             num_pages = len(pdf.pages)
+             print(f"Number of pages: {num_pages}")
+             print(f"PDF Version: {pdf.pdf_header}")
+             print(f"File size: {os.path.getsize(file_path)} bytes")
+             if pdf.metadata:
+                 print("Metadata:")
+                 for key, value in pdf.metadata.items():
+                     print(f"  {key}: {value}")
+             else:
+                 print("No metadata available")
+             return num_pages
+     except Exception as e:
+         print(f"Error checking PDF details: {str(e)}")
+         return None
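A small sketch of how these checks combine before conversion, mirroring the entry script (the path is a placeholder):

# Sketch only: validate the PDF, then print a rough time estimate (~5 s per page).
from document_processing.pdf_utils import check_pdf, check_pdf_details, estimate_conversion_time, format_time

pdf_path = "./pdfs/example.pdf"                           # placeholder path
if check_pdf(pdf_path):
    num_pages = check_pdf_details(pdf_path)
    if num_pages:
        print(f"Estimated conversion time: {format_time(estimate_conversion_time(num_pages))}")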
document_processing/summarizer.py ADDED
@@ -0,0 +1,38 @@
+ from openai import OpenAI
+
+ client = OpenAI()
+
+ def summarize_page(page_content):
+     print(f"Summarizing page: {page_content}")
+     page_system_prompt = "These are the text entries from a single page of a document. Please parse any messy text and concisely summarize the page. The summary will be used as context in a Retrieval Augmented Generation (RAG) application and should be focused on the critical contents of the page such as the plot, characters, setting, and important details or mechanics. Include only the page number, formatted as 'Page Number: <page number>', followed by the summary."
+
+     page_summary_message = client.chat.completions.create(
+         model="gpt-4o",
+         messages=[{
+             "role": "user",
+             "content": f"{page_system_prompt} {page_content}"
+         }],
+         temperature=1,
+         max_tokens=512,
+         top_p=1,
+         frequency_penalty=0,
+         presence_penalty=0
+     )
+     return page_summary_message.choices[0].message.content
+
+ def summarize_document(all_summaries):
+     document_summary_prompt = "Please concisely summarize the following text. The text is a compilation of summaries of individual pages from a document. The summaries are delimited by newlines."
+
+     document_summary_message = client.chat.completions.create(
+         model="gpt-4o",
+         messages=[{
+             "role": "user",
+             "content": f"{document_summary_prompt} {all_summaries}"
+         }],
+         temperature=1,
+         max_tokens=512,
+         top_p=1,
+         frequency_penalty=0,
+         presence_penalty=0
+     )
+     return document_summary_message.choices[0].message.content
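A hedged usage note: both helpers call the OpenAI Chat Completions API through the module-level client, so OPENAI_API_KEY must be set in the environment before import. A minimal sketch with placeholder text:

# Sketch only: summarize one page, then fold per-page summaries into a document summary.
from document_processing.summarizer import summarize_page, summarize_document

page_summary = summarize_page("Example page text...")     # placeholder content
doc_summary = summarize_document("\n".join([page_summary]))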
document_processing/utilities.py ADDED
@@ -0,0 +1,15 @@
+ import os
+ import logging
+
+ def get_file_name_without_ext(file_path):
+     try:
+         if not file_path:  # Check for empty string
+             return None
+         # Extract the base name from the file path
+         base_name = os.path.basename(file_path)
+         # Remove the file extension
+         file_name_without_ext = os.path.splitext(base_name)[0]
+         return file_name_without_ext
+     except (TypeError, AttributeError) as e:
+         logging.error(f"Error processing file path: {e}")
+         return None
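Behavior sketch for the helper (the return values shown follow from the code above):

# Sketch only: strip directory and extension to derive output file names.
from document_processing.utilities import get_file_name_without_ext

get_file_name_without_ext("./pdfs/test_document.pdf")     # -> "test_document"
get_file_name_without_ext("")                             # -> None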
pdf_to_embeddings.py ADDED
@@ -0,0 +1,165 @@
+ # pdf_to_embeddings.py
+ # This script uses Docling to convert a PDF to a JSON file.
+ # It then uses OpenAI to summarize each page and the entire document.
+ # It then saves the summarized JSON to a file.
+ # Instructions:
+ # 1. Update the source variable to the path of the PDF file you want to convert.
+ # 2. Run the script.
+ # 3. Check the output in the output folder.
+ # 4. Then run jsontomd.py to create the markdown file.
+ # 5. Then run jsonToEmbeddings.py to create the embeddings.
+ # 6. Point app.py to the enhanced JSON file.
+ # 7. Run app.py to start the Gradio web app.
+
+ import time
+ import os
+ from docling.document_converter import DocumentConverter, PdfFormatOption
+ from docling.datamodel.settings import DocumentLimits
+ from docling.datamodel.base_models import InputFormat
+ from docling.datamodel.pipeline_options import PdfPipelineOptions
+ import json
+ from PyPDF2 import PdfReader
+ import logging
+ from openai import OpenAI
+ from tqdm import tqdm
+ import math
+ from document_processing.pdf_utils import check_pdf, check_pdf_details, estimate_conversion_time, format_time
+ from document_processing.json_utils import save_to_file, load_json, extract_text_by_page, save_enhanced_json
+ from document_processing.summarizer import summarize_page, summarize_document
+ from document_processing.pdf_processor import process_pdf_file
+ from document_processing.utilities import get_file_name_without_ext
+ from document_processing.embedding_generator import process_document
+
+ client = OpenAI()
+ # Detailed debugging
+ # logging.basicConfig(level=logging.DEBUG)
+ # logger = logging.getLogger('docling')
+ # logger.setLevel(logging.DEBUG)
+
+ source = "./pdfs/test_document.pdf"  # PDF path or URL
+
+ start_time = time.time()
+ last_step_time = start_time
+
+ # Step 1: Initialize DocumentConverter with proper options
+ pipeline_options = PdfPipelineOptions()
+ pipeline_options.do_ocr = False  # adjust as needed
+ pipeline_options.do_table_structure = True  # adjust as needed
+
+ converter = DocumentConverter(
+     format_options={
+         InputFormat.PDF: PdfFormatOption(
+             pipeline_options=pipeline_options
+         )
+     }
+ )
+ step1_time = time.time()
+ print(f"Step 1 (Initialize Converter): {step1_time - last_step_time:.2f} seconds")
+ print(f"Cumulative time: {step1_time - start_time:.2f} seconds")
+ last_step_time = step1_time
+
+ print("Document Limits:")
+ print(DocumentLimits())
+
+ # Before converting, check if the PDF can be opened
+ if check_pdf(source):
+     num_pages = check_pdf_details(source)
+     if num_pages:
+         estimated_time = estimate_conversion_time(num_pages)
+         print(f"\nEstimated conversion time: {format_time(estimated_time)}")
+         print("Starting conversion...\n")
+
+     # Create progress bar
+     with tqdm(total=100, desc="Converting PDF", unit="%") as pbar:
+         try:
+             result = process_pdf_file(source)
+             pbar.update(100)
+             step2_time = time.time()
+             print(f"\nStep 2 (Convert): {step2_time - last_step_time:.2f} seconds")
+             print(f"Cumulative time: {step2_time - start_time:.2f} seconds")
+             last_step_time = step2_time
+         except Exception as e:
+             print(f"Conversion failed with error: {str(e)}")
+             print("Traceback:")
+             import traceback
+             traceback.print_exc()
+ else:
+     print("PDF check failed. Conversion aborted.")
+
+
+ try:
+     # Replace the existing code for extracting the file name with this function call
+     file_name_without_ext = get_file_name_without_ext(source)
+
+     if file_name_without_ext is not None:
+         # Use the file name for output files
+         json_file = f"./output/{file_name_without_ext}_output.json"
+         enhanced_json_file = f"./output/{file_name_without_ext}_enhanced_output.json"
+     else:
+         # Fallback to a default name if there's an error
+         logging.warning("Using default file names due to error in file path processing.")
+         json_file = "./output/default_output.json"
+         enhanced_json_file = "./output/default_enhanced_output.json"
+ except Exception as e:
+     logging.error(f"Unexpected error occurred: {e}")
+
+ # Use the file name for output files
+ json_file = f"./output/{file_name_without_ext}_output.json"
+ enhanced_json_file = f"./output/{file_name_without_ext}_enhanced_output.json"
+
+ # Use the new export methods
+ json_output = result.document.export_to_dict()  # Changed from result.render_as_dict()
+ formatted_json = json.dumps(json_output, indent=2, ensure_ascii=False)
+ save_to_file(formatted_json, json_file)
+ # Load JSON
+ data = load_json(json_file)
+ # print(data)
+
+ # Extract text by page
+ pages = extract_text_by_page(data)
+ # print(pages)
+
+ # Before summarizing pages, extract text from the new structure
+ for page in pages:
+     # Join only the text values from the text_entries dictionaries
+     page_text = "\n".join([
+         pages[page]['text_entries'][entry]['text']  # Access the 'text' field of each entry
+         for entry in pages[page]['text_entries']
+     ])
+     pages[page]['summary'] = summarize_page(page_text)
+
+ # Add all the summaries to a single string
+ all_summaries = "\n".join([pages[page]['summary'] for page in pages])
+
+ output = summarize_document(all_summaries)
+
+ # Add the document summary to the JSON
+ data['document_summary'] = output
+
+ save_enhanced_json(data, pages, enhanced_json_file)
+
+ print(f"JSON enhanced with pages object. Saved to {enhanced_json_file}")
+ step5_time = time.time()
+ print(f"Step 5 (JSON): {step5_time - last_step_time:.2f} seconds")
+ print(f"Cumulative time: {step5_time - start_time:.2f} seconds")
+
+ # Process the document to create embeddings and save to a CSV file
+ embeddings_df = process_document(enhanced_json_file)
+
+ # Save the DataFrame to a CSV file
+ embeddings_df.to_csv(f"./output/{file_name_without_ext}_embeddings.csv", index=False)
+
+ last_step_time = step5_time
+
+ total_time = time.time() - start_time
+ print(f"\nTotal execution time: {total_time:.2f} seconds")
+
+ # Add error handling for the per-page calculation
+ try:
+     if pages and len(pages) > 0:
+         print(f"Total time per page: {total_time / len(pages):.2f} seconds")
+     else:
+         print("Could not calculate time per page: no pages were processed")
+ except NameError:
+     print("Could not calculate time per page: conversion process did not complete")
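Not part of this commit, but for context: a hedged sketch of how the embeddings CSV produced above could be consumed downstream (for example, by the app.py mentioned in the header comments). Column names match the DataFrame built in embedding_generator.py; the file path and query string are placeholders.

# Sketch only: decode JSON-encoded vectors from the CSV and rank chunks by cosine similarity.
import json
import numpy as np
import pandas as pd
from langchain_huggingface import HuggingFaceEmbeddings

df = pd.read_csv("./output/test_document_embeddings.csv")            # path from the run above
vectors = np.array([json.loads(e) for e in df['embedding']])

model = HuggingFaceEmbeddings(model_name="BAAI/bge-m3")
query_vec = np.array(model.embed_query("What happens in the opening scene?"))  # placeholder query
scores = vectors @ query_vec / (np.linalg.norm(vectors, axis=1) * np.linalg.norm(query_vec))
print(df.loc[scores.argsort()[::-1][:3], ['page', 'content']])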
poetry.lock CHANGED
The diff for this file is too large to render. See raw diff
 
pyproject.toml CHANGED
@@ -7,13 +7,13 @@ readme = "README.md"
 
 [tool.poetry.dependencies]
 python = "^3.10"
- pandas = "^2.2.3"
- numpy = "^2.1.2"
- openai = "^1.53.0"
- gradio = "^5.4.0"
- sentence-transformers = "^3.2.1"
- torch = "^2.5.1"
- docling-core = "^2.3.1" # This may need to be updated to the correct version.
+ docling-core = "^2.4.0"
+ langchain = "^0.3.7"
+ langchain-community = "^0.3.7"
+ langchain-text-splitters = "^0.3.2"
+ sentence-transformers = "^3.3.1"
+ langchain-huggingface = "^0.1.2"
+
 
 
 [build-system]
pytest.ini ADDED
@@ -0,0 +1,7 @@
+ [pytest]
+ # Add directories or files to ignore during test discovery
+ norecursedirs = docling/tests
+ pythonpath = .
+ markers =
+     dev: marks tests for development focus
+ allow_module_level = true