# file: chunking.py
from typing import List

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from unstructured.documents.elements import (
    Footer,
    Header,
    NarrativeText,
    PageBreak,
    Table,
)
from unstructured.partition.md import partition_md

# --- Configuration ---
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200


def _metadata_with_type(element, content_type: str) -> dict:
    """Return a fresh metadata dict for *element* tagged with *content_type*."""
    # A new dict is built on every call so documents never share a mutable
    # metadata object.
    metadata = element.metadata.to_dict()
    metadata['content_type'] = content_type
    return metadata


def process_and_chunk(raw_text: str) -> List[Document]:
    """
    Partition markdown text with 'unstructured' and turn it into Documents.

    Header, Footer and PageBreak elements are dropped. Table elements are
    emitted whole (one Document per table, using the table's HTML rendering
    when available). NarrativeText elements are split into overlapping chunks
    of at most CHUNK_SIZE characters. Every other element is emitted as a
    single Document with its text unchanged.

    Each Document's metadata is the element's metadata plus a 'content_type'
    key set to 'table', 'text' or 'other'.

    Args:
        raw_text: The raw string content of the document (expected to be
            markdown).

    Returns:
        A list of Document objects, including structured tables and chunked
        text. An empty list is returned when raw_text is empty.
    """
    if not raw_text:
        print("Warning: Input text for chunking is empty.")
        return []

    print(f"Processing raw text of length {len(raw_text)} with 'unstructured' markdown parser.")

    # partition_md runs unstructured's markdown partitioner on the text, so
    # markdown tables are expected to come back as Table elements.
    elements = partition_md(text=raw_text)

    documents = []
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
        length_function=len,
        is_separator_regex=False,
    )

    for element in elements:
        # Page furniture carries no content worth indexing.
        if isinstance(element, (Header, Footer, PageBreak)):
            continue

        if isinstance(element, Table):
            # Tables are kept whole. Prefer the HTML rendering because it
            # preserves row/column structure; fall back to the plain text
            # when no HTML is attached, since page_content must be a string.
            table_content = element.metadata.text_as_html or element.text
            documents.append(
                Document(
                    page_content=table_content,
                    metadata=_metadata_with_type(element, 'table'),
                )
            )
        elif isinstance(element, NarrativeText):
            # Only narrative text is split into overlapping chunks.
            for chunk in text_splitter.split_text(element.text):
                documents.append(
                    Document(
                        page_content=chunk,
                        metadata=_metadata_with_type(element, 'text'),
                    )
                )
        else:
            # Any other element type is passed through as a single document.
            documents.append(
                Document(
                    page_content=element.text,
                    metadata=_metadata_with_type(element, 'other'),
                )
            )

    print(f"Created {len(documents)} documents (chunks and tables).")
    return documents