drakosfire committed · Commit 824bd32 · 1 Parent(s): ed70805

Reorganized the project into a document_processing module, executed through a single entry file, pdf_to_embeddings.py.
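For orientation, a minimal sketch of how the reorganized module is meant to be used (illustrative only; the paths below are placeholders, and the normal entry point is simply running pdf_to_embeddings.py):

# Illustrative sketch, not part of the commit: the helpers the entry script wires
# together can also be imported directly from the new package.
from document_processing.pdf_processor import process_pdf_file
from document_processing.summarizer import summarize_page, summarize_document
from document_processing.embedding_generator import process_document

result = process_pdf_file("./pdfs/example.pdf")                  # placeholder path
# ...page/document summaries and the enhanced JSON are produced in between...
df = process_document("./output/example_enhanced_output.json")   # placeholder path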

document_processing/__init__.py ADDED
File without changes
document_processing/document_converter.py ADDED
File without changes
document_processing/embedding_generator.py ADDED
@@ -0,0 +1,124 @@
+ import json
+ import pandas as pd
+ from docling_core.types.doc.document import DoclingDocument, DocItemLabel
+ from langchain_huggingface import HuggingFaceEmbeddings
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
+
+
+ def process_document(json_path: str, chunk_size: int = 2000, overlap: int = 500):
+     """Process document maintaining document structure and hierarchy."""
+     with open(json_path, 'r') as f:
+         data = json.load(f)
+     doc = DoclingDocument(**data)
+
+     chunks_with_metadata = []
+     current_headers = {}  # page_no -> current header text
+
+     # Add document summary if available
+     if 'document_summary' in data:
+         chunks_with_metadata.append({
+             'content': f"Document Summary:\n{data['document_summary']}",
+             'page': 0,  # Use 0 for document-level content
+             'content_type': 'summary',
+             'document_name': doc.name if hasattr(doc, 'name') else '',
+         })
+
+     # Process document by page
+     for page_no in doc.pages:
+         page = doc.pages[page_no]
+         current_header = None
+
+         # Add page summary if available
+         if hasattr(page, 'summary'):
+             chunks_with_metadata.append({
+                 'content': f"Page {page_no} Summary:\n{page.summary}",
+                 'page': page_no,
+                 'content_type': 'page_summary',
+                 'document_name': doc.name if hasattr(doc, 'name') else '',
+             })
+
+         # Process page content
+         page_items = list(doc.iterate_items(page_no=page_no))
+
+         for item, _ in page_items:
+             # Create base metadata
+             metadata = {
+                 'page': page_no,
+                 'current_section': current_header,
+                 'content_type': item.label.value,
+                 'document_name': doc.name if hasattr(doc, 'name') else '',
+             }
+
+             # Track section headers
+             if item.label == DocItemLabel.SECTION_HEADER:
+                 current_header = item.text
+                 current_headers[page_no] = item.text
+
+                 # Convert header to markdown
+                 md_content = f"# {item.text}\n"
+
+             # Handle regular text
+             elif item.label == DocItemLabel.TEXT:
+                 # Convert text to markdown with context
+                 md_content = ""
+                 if current_header:
+                     md_content += f"Context: {current_header}\n\n"
+                 md_content += f"{item.text}\n"
+
+             else:
+                 # Skip page headers/footers and other non-content elements
+                 continue
+
+             # Add provenance data if available
+             if hasattr(item, 'prov') and item.prov:
+                 metadata['bbox'] = item.prov[0].bbox.as_tuple()
+
+             # Split into chunks
+             text_splitter = RecursiveCharacterTextSplitter(
+                 chunk_size=chunk_size,
+                 chunk_overlap=overlap,
+                 separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""]
+             )
+
+             chunks = text_splitter.split_text(md_content)
+
+             # Store chunks with metadata
+             for chunk in chunks:
+                 chunk_metadata = {
+                     'content': chunk,
+                     **metadata,
+                     'original_text': item.orig,
+                 }
+                 chunks_with_metadata.append(chunk_metadata)
+
+     return create_dataframe(chunks_with_metadata)
+
+
+ def create_dataframe(chunks_with_metadata):
+     """Create DataFrame with content and available metadata."""
+     # Add index to chunks
+     for i, chunk in enumerate(chunks_with_metadata):
+         chunk['chunk_index'] = i
+
+     # Get content in a specific order
+     contents = [c['content'] for c in chunks_with_metadata]
+
+     # Create embeddings
+     embeddings_model = HuggingFaceEmbeddings(model_name="BAAI/bge-m3")
+     embeddings = embeddings_model.embed_documents(contents)
+
+     # Create DataFrame with index verification and safe access to optional fields
+     df = pd.DataFrame({
+         'chunk_index': [c['chunk_index'] for c in chunks_with_metadata],
+         'content': contents,
+         'embedding': [json.dumps(e) for e in embeddings],
+         'page': [c.get('page', None) for c in chunks_with_metadata],
+         'section': [c.get('current_section', '') for c in chunks_with_metadata],
+         'content_type': [c.get('content_type', '') for c in chunks_with_metadata],
+         'original_text': [c.get('original_text', '') for c in chunks_with_metadata],
+         'bbox': [c.get('bbox', None) for c in chunks_with_metadata],
+     })
+
+     # Verify alignment
+     assert all(df['chunk_index'] == range(len(df))), "Chunk order mismatch!"
+
+     return df
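A hedged usage sketch for process_document (the path is a placeholder; in this commit the enhanced JSON path is built by pdf_to_embeddings.py). Note that embeddings are stored as JSON-encoded lists so they survive the CSV round trip.

# Sketch only: chunk an enhanced Docling JSON and embed it with BAAI/bge-m3.
from document_processing.embedding_generator import process_document

df = process_document("./output/example_enhanced_output.json", chunk_size=2000, overlap=500)
print(df[['chunk_index', 'page', 'content_type']].head())
df.to_csv("./output/example_embeddings.csv", index=False)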
document_processing/json_utils.py ADDED
@@ -0,0 +1,55 @@
+ import json
+
+ def save_to_file(content, filename):
+     with open(filename, 'w', encoding='utf-8') as f:
+         f.write(content)
+
+ def load_json(file_path):
+     with open(file_path, 'r', encoding='utf-8') as f:
+         return json.load(f)
+
+ def extract_text_by_page(data):
+     """Extract text while preserving original page structure."""
+     pages = {}
+
+     # Initialize pages structure
+     for page_no, page_data in data.get('pages', {}).items():
+         pages[page_no] = {
+             'size': page_data['size'],
+             'page_no': page_data['page_no'],
+             'text_entries': {},
+             'image': page_data.get('image', None)
+         }
+
+     # Directly process texts array
+     for text_item in data.get('texts', []):
+         if 'text' in text_item and 'prov' in text_item and text_item['prov']:
+             page_number = str(text_item['prov'][0]['page_no'])
+             if page_number in pages:
+                 entry_number = len(pages[page_number]['text_entries']) + 1
+                 pages[page_number]['text_entries'][f"entry_{entry_number}"] = {
+                     'text': text_item['text'],
+                     'label': text_item.get('label', ''),
+                     'level': text_item.get('level', None)
+                 }
+
+     return pages
+
+ def save_enhanced_json(data, pages, output_file):
+     """Save enhanced JSON while preserving original structure."""
+     # Create a deep copy to avoid modifying the original
+     enhanced_data = json.loads(json.dumps(data))
+
+     # Add text entries and summaries to pages
+     for page_no, page_data in pages.items():
+         if str(page_no) in enhanced_data['pages']:
+             enhanced_data['pages'][str(page_no)]['text_entries'] = page_data['text_entries']
+             if 'summary' in page_data:
+                 enhanced_data['pages'][str(page_no)]['summary'] = page_data['summary']
+
+     # Add document summary
+     if 'document_summary' in data:
+         enhanced_data['document_summary'] = data['document_summary']
+
+     with open(output_file, 'w', encoding='utf-8') as f:
+         json.dump(enhanced_data, f, indent=2, ensure_ascii=False)
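A brief sketch of the intended round trip through these helpers (paths are placeholders; page summaries are attached elsewhere in the pipeline before saving):

# Sketch only: load Docling JSON, regroup text by page, then write an enhanced copy.
from document_processing.json_utils import load_json, extract_text_by_page, save_enhanced_json

data = load_json("./output/example_output.json")         # placeholder path
pages = extract_text_by_page(data)
# pages[page_no]['summary'] would be filled in by the summarizer before saving
save_enhanced_json(data, pages, "./output/example_enhanced_output.json")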
document_processing/pdf_processor.py ADDED
@@ -0,0 +1,28 @@
+ from docling.document_converter import DocumentConverter, PdfFormatOption
+ from docling.datamodel.base_models import InputFormat
+ from docling.datamodel.pipeline_options import PdfPipelineOptions
+ from document_processing.utilities import get_file_name_without_ext
+
+ def document_converter():
+     pipeline_options = PdfPipelineOptions()
+     pipeline_options.do_ocr = False
+     pipeline_options.do_table_structure = True
+
+     return DocumentConverter(
+         format_options={
+             InputFormat.PDF: PdfFormatOption(
+                 pipeline_options=pipeline_options
+             )
+         }
+     )
+
+ def process_pdf(pdf_path):
+     converter = document_converter()
+     result = converter.convert(pdf_path)
+     return result
+
+ def process_pdf_file(pdf_path):
+     file_name = get_file_name_without_ext(pdf_path)
+     result = process_pdf(pdf_path)
+     return result
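A short usage sketch (placeholder path); export_to_dict() is the same export the entry script relies on:

# Sketch only: convert a PDF with the configured pipeline (OCR off, table structure on).
from document_processing.pdf_processor import process_pdf_file

result = process_pdf_file("./pdfs/example.pdf")           # placeholder path
doc_dict = result.document.export_to_dict()               # dict form used downstream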
document_processing/pdf_utils.py ADDED
@@ -0,0 +1,46 @@
+ from PyPDF2 import PdfReader
+ import os
+ import math
+
+ def estimate_conversion_time(num_pages):
+     """Estimate the total conversion time based on ~5 seconds per page"""
+     return num_pages * 5
+
+ def format_time(seconds):
+     """Convert seconds to a human-readable format"""
+     minutes = math.floor(seconds / 60)
+     remaining_seconds = seconds % 60
+     return f"{minutes} minutes and {remaining_seconds:.0f} seconds"
+
+ def check_pdf(file_path):
+     if not os.path.exists(file_path):
+         print(f"Error: File '{file_path}' does not exist.")
+         return False
+
+     try:
+         with open(file_path, 'rb') as file:
+             PdfReader(file)
+         print(f"PDF '{file_path}' can be opened successfully.")
+         return True
+     except Exception as e:
+         print(f"Error opening PDF '{file_path}': {str(e)}")
+         return False
+
+ def check_pdf_details(file_path):
+     try:
+         with open(file_path, 'rb') as file:
+             pdf = PdfReader(file)
+             num_pages = len(pdf.pages)
+             print(f"Number of pages: {num_pages}")
+             print(f"PDF Version: {pdf.pdf_header}")
+             print(f"File size: {os.path.getsize(file_path)} bytes")
+             if pdf.metadata:
+                 print("Metadata:")
+                 for key, value in pdf.metadata.items():
+                     print(f"  {key}: {value}")
+             else:
+                 print("No metadata available")
+             return num_pages
+     except Exception as e:
+         print(f"Error checking PDF details: {str(e)}")
+         return None
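A small sketch of how these checks combine before conversion, mirroring the entry script (the path is a placeholder):

# Sketch only: validate the PDF, then print a rough time estimate (~5 s per page).
from document_processing.pdf_utils import check_pdf, check_pdf_details, estimate_conversion_time, format_time

pdf_path = "./pdfs/example.pdf"                           # placeholder path
if check_pdf(pdf_path):
    num_pages = check_pdf_details(pdf_path)
    if num_pages:
        print(f"Estimated conversion time: {format_time(estimate_conversion_time(num_pages))}")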
document_processing/summarizer.py ADDED
@@ -0,0 +1,38 @@
+ from openai import OpenAI
+
+ client = OpenAI()
+
+ def summarize_page(page_content):
+     print(f"Summarizing page: {page_content}")
+     page_system_prompt = "These are the text entries from a single page of a document. Please parse any messy text and concisely summarize the page. The summary will be used as context in a Retrieval Augmented Generation (RAG) application and should be focused on the critical contents of the page such as the plot, characters, setting, and important details or mechanics. Include only the page number, formatted as 'Page Number: <page number>', followed by the summary."
+
+     page_summary_message = client.chat.completions.create(
+         model="gpt-4o",
+         messages=[{
+             "role": "user",
+             "content": f"{page_system_prompt} {page_content}"
+         }],
+         temperature=1,
+         max_tokens=512,
+         top_p=1,
+         frequency_penalty=0,
+         presence_penalty=0
+     )
+     return page_summary_message.choices[0].message.content
+
+ def summarize_document(all_summaries):
+     document_summary_prompt = "Please concisely summarize the following text. The text is a compilation of summaries of individual pages from a document. The summaries are delimited by newlines."
+
+     document_summary_message = client.chat.completions.create(
+         model="gpt-4o",
+         messages=[{
+             "role": "user",
+             "content": f"{document_summary_prompt} {all_summaries}"
+         }],
+         temperature=1,
+         max_tokens=512,
+         top_p=1,
+         frequency_penalty=0,
+         presence_penalty=0
+     )
+     return document_summary_message.choices[0].message.content
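A hedged usage note: both helpers call the OpenAI Chat Completions API through the module-level client, so OPENAI_API_KEY must be set in the environment before import. A minimal sketch with placeholder text:

# Sketch only: summarize one page, then fold per-page summaries into a document summary.
from document_processing.summarizer import summarize_page, summarize_document

page_summary = summarize_page("Example page text...")     # placeholder content
doc_summary = summarize_document("\n".join([page_summary]))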
document_processing/utilities.py ADDED
@@ -0,0 +1,15 @@
+ import os
+ import logging
+
+ def get_file_name_without_ext(file_path):
+     try:
+         if not file_path:  # Check for empty string
+             return None
+         # Extract the base name from the file path
+         base_name = os.path.basename(file_path)
+         # Remove the file extension
+         file_name_without_ext = os.path.splitext(base_name)[0]
+         return file_name_without_ext
+     except (TypeError, AttributeError) as e:
+         logging.error(f"Error processing file path: {e}")
+         return None
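Behavior sketch for the helper (the return values shown follow from the code above):

# Sketch only: strip directory and extension to derive output file names.
from document_processing.utilities import get_file_name_without_ext

get_file_name_without_ext("./pdfs/test_document.pdf")     # -> "test_document"
get_file_name_without_ext("")                             # -> None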
pdf_to_embeddings.py ADDED
@@ -0,0 +1,165 @@
+ # pdf_to_embeddings.py
+ # This script uses Docling to convert a PDF to a JSON file.
+ # It then uses OpenAI to summarize each page and the entire document.
+ # It then saves the summarized JSON to a file.
+ # Instructions:
+ # 1. Update the source variable to the path of the PDF file you want to convert.
+ # 2. Run the script.
+ # 3. Check the output in the output folder.
+ # 4. Then run jsontomd.py to create the markdown file.
+ # 5. Then run jsonToEmbeddings.py to create the embeddings.
+ # 6. Point app.py to the enhanced JSON file.
+ # 7. Run app.py to start the Gradio web app.
+
+ import time
+ import os
+ from docling.document_converter import DocumentConverter, PdfFormatOption
+ from docling.datamodel.settings import DocumentLimits
+ from docling.datamodel.base_models import InputFormat
+ from docling.datamodel.pipeline_options import PdfPipelineOptions
+ import json
+ from PyPDF2 import PdfReader
+ import logging
+ from openai import OpenAI
+ from tqdm import tqdm
+ import math
+ from document_processing.pdf_utils import check_pdf, check_pdf_details, estimate_conversion_time, format_time
+ from document_processing.json_utils import save_to_file, load_json, extract_text_by_page, save_enhanced_json
+ from document_processing.summarizer import summarize_page, summarize_document
+ from document_processing.pdf_processor import process_pdf_file
+ from document_processing.utilities import get_file_name_without_ext
+ from document_processing.embedding_generator import process_document
+
+ client = OpenAI()
+ # Detailed debugging
+ # logging.basicConfig(level=logging.DEBUG)
+ # logger = logging.getLogger('docling')
+ # logger.setLevel(logging.DEBUG)
+
+ source = "./pdfs/test_document.pdf"  # PDF path or URL
+
+ start_time = time.time()
+ last_step_time = start_time
+
+ # Step 1: Initialize DocumentConverter with proper options
+ pipeline_options = PdfPipelineOptions()
+ pipeline_options.do_ocr = False  # adjust as needed
+ pipeline_options.do_table_structure = True  # adjust as needed
+
+ converter = DocumentConverter(
+     format_options={
+         InputFormat.PDF: PdfFormatOption(
+             pipeline_options=pipeline_options
+         )
+     }
+ )
+ step1_time = time.time()
+ print(f"Step 1 (Initialize Converter): {step1_time - last_step_time:.2f} seconds")
+ print(f"Cumulative time: {step1_time - start_time:.2f} seconds")
+ last_step_time = step1_time
+
+ print("Document Limits:")
+ print(DocumentLimits())
+
+ # Before converting, check if the PDF can be opened
+ if check_pdf(source):
+     num_pages = check_pdf_details(source)
+     if num_pages:
+         estimated_time = estimate_conversion_time(num_pages)
+         print(f"\nEstimated conversion time: {format_time(estimated_time)}")
+         print("Starting conversion...\n")
+
+     # Create progress bar
+     with tqdm(total=100, desc="Converting PDF", unit="%") as pbar:
+         try:
+             result = process_pdf_file(source)
+             pbar.update(100)
+             step2_time = time.time()
+             print(f"\nStep 2 (Convert): {step2_time - last_step_time:.2f} seconds")
+             print(f"Cumulative time: {step2_time - start_time:.2f} seconds")
+             last_step_time = step2_time
+         except Exception as e:
+             print(f"Conversion failed with error: {str(e)}")
+             print("Traceback:")
+             import traceback
+             traceback.print_exc()
+ else:
+     print("PDF check failed. Conversion aborted.")
+
+
+ try:
+     # Replace the existing code for extracting the file name with this function call
+     file_name_without_ext = get_file_name_without_ext(source)
+
+     if file_name_without_ext is not None:
+         # Use the file name for output files
+         json_file = f"./output/{file_name_without_ext}_output.json"
+         enhanced_json_file = f"./output/{file_name_without_ext}_enhanced_output.json"
+     else:
+         # Fallback to a default name if there's an error
+         logging.warning("Using default file names due to error in file path processing.")
+         json_file = "./output/default_output.json"
+         enhanced_json_file = "./output/default_enhanced_output.json"
+ except Exception as e:
+     logging.error(f"Unexpected error occurred: {e}")
+
+ # Use the file name for output files
+ json_file = f"./output/{file_name_without_ext}_output.json"
+ enhanced_json_file = f"./output/{file_name_without_ext}_enhanced_output.json"
+
+ # Use the new export methods
+ json_output = result.document.export_to_dict()  # Changed from result.render_as_dict()
+ formatted_json = json.dumps(json_output, indent=2, ensure_ascii=False)
+ save_to_file(formatted_json, json_file)
+ # Load JSON
+ data = load_json(json_file)
+ # print(data)
+
+ # Extract text by page
+ pages = extract_text_by_page(data)
+ # print(pages)
+
+ # Before summarizing pages, extract text from the new structure
+ for page in pages:
+     # Join only the text values from the text_entries dictionaries
+     page_text = "\n".join([
+         pages[page]['text_entries'][entry]['text']  # Access the 'text' field of each entry
+         for entry in pages[page]['text_entries']
+     ])
+     pages[page]['summary'] = summarize_page(page_text)
+
+ # Add all the summaries to a single string
+ all_summaries = "\n".join([pages[page]['summary'] for page in pages])
+
+ output = summarize_document(all_summaries)
+
+ # Add the document summary to the JSON
+ data['document_summary'] = output
+
+ save_enhanced_json(data, pages, enhanced_json_file)
+
+ print(f"JSON enhanced with pages object. Saved to {enhanced_json_file}")
+ step5_time = time.time()
+ print(f"Step 5 (JSON): {step5_time - last_step_time:.2f} seconds")
+ print(f"Cumulative time: {step5_time - start_time:.2f} seconds")
+
+ # Process the document to create embeddings and save to a CSV file
+ embeddings_df = process_document(enhanced_json_file)
+
+ # Save the DataFrame to a CSV file
+ embeddings_df.to_csv(f"./output/{file_name_without_ext}_embeddings.csv", index=False)
+
+ last_step_time = step5_time
+
+ total_time = time.time() - start_time
+ print(f"\nTotal execution time: {total_time:.2f} seconds")
+
+ # Add error handling for the per-page calculation
+ try:
+     if pages and len(pages) > 0:
+         print(f"Total time per page: {total_time / len(pages):.2f} seconds")
+     else:
+         print("Could not calculate time per page: no pages were processed")
+ except NameError:
+     print("Could not calculate time per page: conversion process did not complete")
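Not part of this commit, but for context: a hedged sketch of how the embeddings CSV produced above could be consumed downstream (for example, by the app.py mentioned in the header comments). Column names match the DataFrame built in embedding_generator.py; the file path and query string are placeholders.

# Sketch only: decode JSON-encoded vectors from the CSV and rank chunks by cosine similarity.
import json
import numpy as np
import pandas as pd
from langchain_huggingface import HuggingFaceEmbeddings

df = pd.read_csv("./output/test_document_embeddings.csv")            # path from the run above
vectors = np.array([json.loads(e) for e in df['embedding']])

model = HuggingFaceEmbeddings(model_name="BAAI/bge-m3")
query_vec = np.array(model.embed_query("What happens in the opening scene?"))  # placeholder query
scores = vectors @ query_vec / (np.linalg.norm(vectors, axis=1) * np.linalg.norm(query_vec))
print(df.loc[scores.argsort()[::-1][:3], ['page', 'content']])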
poetry.lock CHANGED
The diff for this file is too large to render. See raw diff
 
pyproject.toml CHANGED
@@ -7,13 +7,13 @@ readme = "README.md"
 
 [tool.poetry.dependencies]
 python = "^3.10"
- pandas = "^2.2.3"
- numpy = "^2.1.2"
- openai = "^1.53.0"
- gradio = "^5.4.0"
- sentence-transformers = "^3.2.1"
- torch = "^2.5.1"
- docling-core = "^2.3.1" # This may need to be updated to the correct version.
+ docling-core = "^2.4.0"
+ langchain = "^0.3.7"
+ langchain-community = "^0.3.7"
+ langchain-text-splitters = "^0.3.2"
+ sentence-transformers = "^3.3.1"
+ langchain-huggingface = "^0.1.2"
+
 
 
 [build-system]
pytest.ini ADDED
@@ -0,0 +1,7 @@
+ [pytest]
+ # Add directories or files to ignore during test discovery
+ norecursedirs = docling/tests
+ pythonpath = .
+ markers =
+     dev: marks tests for development focus
+ allow_module_level = true