ChadGPT / chunking.py
PercivalFletcher's picture
Upload 6 files
a19a241 verified
raw
history blame
2.74 kB
# file: chunking.py
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from typing import List
from unstructured.partition.md import partition_md
from unstructured.documents.elements import Header, Footer, PageBreak, Table, NarrativeText
# --- Configuration ---
# Both values are handed to RecursiveCharacterTextSplitter, which is built
# with length_function=len, so they are measured in characters.

# Upper bound on the length of a single narrative-text chunk.
CHUNK_SIZE: int = 1000

# Number of characters shared between consecutive chunks.
CHUNK_OVERLAP: int = 200
def process_and_chunk(raw_text: str) -> List[Document]:
    """
    Partition markdown text with 'unstructured' and convert it to Documents.

    The text is parsed by the markdown partitioner and each resulting
    element is handled according to its type:

    * Header, Footer and PageBreak elements are dropped.
    * Table elements become a single Document holding the table's HTML
      rendering (falling back to the table's plain text when no HTML
      rendering is available), tagged with content_type='table'.
    * NarrativeText elements are split with RecursiveCharacterTextSplitter
      (CHUNK_SIZE / CHUNK_OVERLAP) and every chunk becomes its own
      Document, tagged with content_type='text'.
    * Every other element is emitted unsplit, tagged with
      content_type='other'.

    Elements that carry no content are skipped, so no Document with an
    empty page_content is produced.

    Args:
        raw_text: The raw string content of the document (expected to be markdown).

    Returns:
        A list of Document objects, including structured tables and chunked
        text. The list is empty when raw_text is empty or whitespace-only.
    """
    # Whitespace-only input cannot yield any content, so treat it as empty
    # and return before doing any parsing work.
    if not raw_text or not raw_text.strip():
        print("Warning: Input text for chunking is empty.")
        return []

    print(f"Processing raw text of length {len(raw_text)} with 'unstructured' markdown parser.")

    # partition_md is the markdown-specific partitioner; parsing the input
    # as markdown (rather than plain text) is what lets tables come back as
    # Table elements instead of flattened text.
    elements = partition_md(text=raw_text)

    documents: List[Document] = []
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
        length_function=len,
        is_separator_regex=False,
    )

    for element in elements:
        # Structural elements are not emitted as documents.
        if isinstance(element, (Header, Footer, PageBreak)):
            continue

        # Process tables
        if isinstance(element, Table):
            # text_as_html is optional metadata; fall back to the plain
            # text so page_content is never None.
            table_content = getattr(element.metadata, "text_as_html", None) or element.text
            if not table_content:
                continue
            table_metadata = element.metadata.to_dict()
            table_metadata['content_type'] = 'table'
            documents.append(Document(page_content=table_content, metadata=table_metadata))

        # Process and chunk narrative text
        elif isinstance(element, NarrativeText):
            for chunk in text_splitter.split_text(element.text):
                # Build a fresh metadata dict per chunk so the resulting
                # Documents never share a mutable mapping.
                chunk_metadata = element.metadata.to_dict()
                chunk_metadata['content_type'] = 'text'
                documents.append(Document(page_content=chunk, metadata=chunk_metadata))

        # Handle other elements directly
        else:
            # Skip elements without text, mirroring the narrative branch
            # where the splitter yields no chunks for empty input.
            if not element.text or not element.text.strip():
                continue
            general_metadata = element.metadata.to_dict()
            general_metadata['content_type'] = 'other'
            documents.append(Document(page_content=element.text, metadata=general_metadata))

    print(f"Created {len(documents)} documents (chunks and tables).")
    return documents