#docling_pdf.py #This script uses Docling to convert a PDF to a JSON file. #It then uses OpenAI to summarize each page and the entire document. #It then saves the summarized JSON to a file. # Instructions: # 1. Update the source variable to the path of the PDF file you want to convert. # 2. Run the script. # 3. Check the output in the output folder. # 4. Then run jsontomd.py to create the markdown file. # 5. Then run jsonToEmbeddings.py to create the embeddings. # 6. Point the app.py to the enhanced JSON file. # 7. Run app.py to start the gradio web app. import time import os from docling.document_converter import DocumentConverter, PdfFormatOption from docling.datamodel.settings import DocumentLimits from docling.datamodel.base_models import InputFormat from docling.datamodel.pipeline_options import PdfPipelineOptions import json from PyPDF2 import PdfReader import logging from openai import OpenAI from tqdm import tqdm import math from document_processing.pdf_utils import check_pdf, check_pdf_details, estimate_conversion_time, format_time from document_processing.json_utils import save_to_file, load_json, extract_text_by_page, save_enhanced_json from document_processing.summarizer import summarize_page, summarize_document from document_processing.pdf_processor import process_pdf_file from document_processing.utilities import get_file_name_without_ext from document_processing.embedding_generator import process_document client = OpenAI() # Detailed Debugging # logging.basicConfig(level=logging.DEBUG) # logger = logging.getLogger('docling') # logger.setLevel(logging.DEBUG) source = "./pdfs/test_document.pdf" # PDF path or URL start_time = time.time() last_step_time = start_time # Step 1: Initialize DocumentConverter with proper options pipeline_options = PdfPipelineOptions() pipeline_options.do_ocr = False # adjust as needed pipeline_options.do_table_structure = True # adjust as needed converter = DocumentConverter( format_options={ InputFormat.PDF: PdfFormatOption( pipeline_options=pipeline_options ) } ) step1_time = time.time() print(f"Step 1 (Initialize Converter): {step1_time - last_step_time:.2f} seconds") print(f"Cumulative time: {step1_time - start_time:.2f} seconds") last_step_time = step1_time print("Document Limits:") print(DocumentLimits()) # Before converting, check if the PDF can be opened if check_pdf(source): num_pages = check_pdf_details(source) if num_pages: estimated_time = estimate_conversion_time(num_pages) print(f"\nEstimated conversion time: {format_time(estimated_time)}") print("Starting conversion...\n") # Create progress bar with tqdm(total=100, desc="Converting PDF", unit="%") as pbar: try: result = process_pdf_file(source) pbar.update(100) step2_time = time.time() print(f"\nStep 2 (Convert): {step2_time - last_step_time:.2f} seconds") print(f"Cumulative time: {step2_time - start_time:.2f} seconds") last_step_time = step2_time except Exception as e: print(f"Conversion failed with error: {str(e)}") print("Traceback:") import traceback traceback.print_exc() else: print("PDF check failed. Conversion aborted.") try: # Replace the existing code for extracting the file name with this function call file_name_without_ext = get_file_name_without_ext(source) if file_name_without_ext is not None: # Use the file name for output files json_file = f"../output/{file_name_without_ext}_output.json" enhanced_json_file = f"../output/{file_name_without_ext}_enhanced_output.json" else: # Fallback to a default name if there's an error logging.warning("Using default file names due to error in file path processing.") json_file = "./output/default_output.json" enhanced_json_file = "./output/default_enhanced_output.json" except Exception as e: logging.error(f"Unexpected error occurred: {e}") # Use the file name for output files json_file = f"./output/{file_name_without_ext}_output.json" enhanced_json_file = f"./output/{file_name_without_ext}_enhanced_output.json" # Use the new export methods json_output = result.document.export_to_dict() # Changed from result.render_as_dict() formatted_json = json.dumps(json_output, indent=2, ensure_ascii=False) save_to_file(formatted_json, json_file) # Load JSON data = load_json(json_file) # print(data) # Extract text by page pages = extract_text_by_page(data) # print(pages) # Before summarizing pages, extract text from the new structure for page in pages: # Join only the text values from the text_entries dictionaries page_text = "\n".join([ pages[page]['text_entries'][entry]['text'] # Access the 'text' field of each entry for entry in pages[page]['text_entries'] ]) pages[page]['summary'] = summarize_page(page_text) # Add all the summaries to a single string all_summaries = "\n".join([pages[page]['summary'] for page in pages]) output = summarize_document(all_summaries) # Add the document summary to the JSON data['document_summary'] = output save_enhanced_json(data, pages, enhanced_json_file) print(f"JSON enhanced with pages object. Saved to {enhanced_json_file}") step5_time = time.time() print(f"Step 5 (JSON): {step5_time - last_step_time:.2f} seconds") print(f"Cumulative time: {step5_time - start_time:.2f} seconds") # Process the document to create embeddings and save to a CSV file embeddings_df = process_document(enhanced_json_file) # Save the DataFrame to a CSV file embeddings_df.to_csv(f"./output/{file_name_without_ext}_embeddings.csv", index=False) last_step_time = step5_time total_time = time.time() - start_time print(f"\nTotal execution time: {total_time:.2f} seconds") # Add error handling for the per-page calculation try: if pages and len(pages) > 0: print(f"Total time per page: {total_time / len(pages):.2f} seconds") else: print("Could not calculate time per page: no pages were processed") except NameError: print("Could not calculate time per page: conversion process did not complete")