DocuChat / src /pdf_processing.py
TejaCherukuri
add source code
59e5e32
raw
history blame contribute delete
669 Bytes
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
def extract_pdf_text(pdf_docs):
    """
    Extracts text from a list of uploaded PDF files.

    Every item in ``pdf_docs`` is handed to ``PdfReader``; the text of each
    page of each document is concatenated in iteration order, with no
    separator inserted between pages or documents.

    Args:
        pdf_docs: Iterable of PDF sources that ``PdfReader`` accepts.
            ``None`` or an empty collection yields an empty string.

    Returns:
        A single string holding the extracted text of all pages.
    """
    # Nothing uploaded: return early instead of failing on iteration of None.
    if not pdf_docs:
        return ""

    # Collect the pieces and join once; repeated ``+=`` on a growing string
    # can degrade to quadratic time on large documents.
    page_texts = []
    for pdf in pdf_docs:
        pdf_reader = PdfReader(pdf)
        for page in pdf_reader.pages:
            # Defensive: treat a missing/None extraction result as empty text
            # so one unreadable page cannot break the whole extraction.
            page_texts.append(page.extract_text() or "")
    return "".join(page_texts)
def split_text_into_chunks(text, chunk_size=10000, chunk_overlap=500):
    """
    Break the extracted text into smaller pieces for downstream processing.

    Args:
        text: The text to split.
        chunk_size: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_size`` (default 10000).
        chunk_overlap: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_overlap`` (default 500).

    Returns:
        Whatever ``RecursiveCharacterTextSplitter.split_text`` returns for
        ``text`` with the given settings.
    """
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
    }
    splitter = RecursiveCharacterTextSplitter(**splitter_options)
    chunks = splitter.split_text(text)
    return chunks