Spaces:
Running
Running
from PyPDF2 import PdfReader | |
from langchain.text_splitter import RecursiveCharacterTextSplitter | |
def extract_pdf_text(pdf_docs):
    """
    Extract and concatenate the text of every page in the given PDF files.

    Args:
        pdf_docs: Iterable of PDF sources, each passed as-is to ``PdfReader``
            (e.g. uploaded file objects).

    Returns:
        One string holding the text of all pages, in document order and then
        page order, joined with no separator. An empty iterable yields "".
    """
    page_texts = []
    for pdf in pdf_docs:
        pdf_reader = PdfReader(pdf)
        for page in pdf_reader.pages:
            # Fall back to "" so a page that yields no text (a None or empty
            # result) cannot raise a TypeError during concatenation.
            page_texts.append(page.extract_text() or "")
    # Join once at the end instead of growing a string with repeated "+=".
    return "".join(page_texts)
def split_text_into_chunks(text, chunk_size=10000, chunk_overlap=500):
    """
    Break a long string into overlapping chunks using a recursive character splitter.

    Args:
        text: The full text to split.
        chunk_size: Value forwarded as the splitter's ``chunk_size``.
        chunk_overlap: Value forwarded as the splitter's ``chunk_overlap``.

    Returns:
        Whatever ``RecursiveCharacterTextSplitter.split_text`` returns for
        ``text`` with the given settings.
    """
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
    }
    chunker = RecursiveCharacterTextSplitter(**splitter_options)
    chunks = chunker.split_text(text)
    return chunks