# file: pdf_parallel_parser.py import fitz # PyMuPDF from PIL import Image import io import os from concurrent.futures import ThreadPoolExecutor, as_completed from pathlib import Path # Import the specialized parsers from our other module from complex_parser import process_table_element, process_image_element def _is_bbox_contained(inner_bbox, outer_bbox): """Check if inner_bbox is fully inside outer_bbox.""" return (inner_bbox[0] >= outer_bbox[0] and inner_bbox[1] >= outer_bbox[1] and inner_bbox[2] <= outer_bbox[2] and inner_bbox[3] <= outer_bbox[3]) def _process_page(page: fitz.Page) -> str: """ Processes a single PDF page to extract text, tables, and images. - Tables are found and processed with the complex_parser. - Plain text is extracted, excluding any text already inside a processed table. """ page_content = [] # 1. Find and process tables first table_bboxes = [] try: tables = page.find_tables() pix = page.get_pixmap(dpi=200) page_image = Image.open(io.BytesIO(pix.tobytes("png"))) print(f"Page {page.number}: Found {len(tables.tables)} potential tables.") for i, table in enumerate(tables): table_bboxes.append(table.bbox) table_image = page_image.crop(table.bbox) markdown_table = process_table_element(table_image) page_content.append(markdown_table) except Exception as e: print(f"Could not process tables on page {page.number}: {e}") # 2. Extract text blocks, excluding those within table bounding boxes text_blocks = page.get_text("blocks") for block in text_blocks: block_bbox = block[:4] # Check if this text block is inside any of the tables we just processed is_in_table = any(_is_bbox_contained(block_bbox, table_bbox) for table_bbox in table_bboxes) if not is_in_table: page_content.append(block[4].strip()) # Note: Image extraction can be added here if needed, similar to table extraction. return "\n".join(page_content) def process_pdf_with_hybrid_parallel_sync(file_path: Path) -> str: """ Processes a PDF file in parallel using PyMuPDF and the complex_parser. """ print(f"Processing PDF '{file_path.name}' with parallel page-by-page strategy...") all_page_texts = [] doc = fitz.open(file_path) with ThreadPoolExecutor(max_workers=os.cpu_count() or 4) as executor: futures = {executor.submit(_process_page, page): page.number for page in doc} # Collect results in page order results = ["" for _ in range(len(doc))] for future in as_completed(futures): page_num = futures[future] try: results[page_num] = future.result() except Exception as e: print(f"Error processing page {page_num}: {e}") all_page_texts = results return f"\n\n--- Page Break ---\n\n".join(all_page_texts)