Sutra_AI / utils /pdf_utils.py
Inferno-721's picture
Initial
0753d2e
import PyPDF2
from typing import List, Dict
class PDFProcessor:
    """Extract per-page text from a PDF and split text into word-aligned chunks."""

    def __init__(self):
        # Text of the most recently extracted PDF, keyed by 0-based page index.
        self.pages = {}

    def extract_text(self, pdf_file) -> Dict[int, str]:
        """Extract text from a PDF, keyed by 0-based page index.

        Args:
            pdf_file: Passed unchanged to ``PyPDF2.PdfReader``.

        Returns:
            A dictionary mapping each page index to that page's extracted
            text. The same dictionary is stored on ``self.pages``.
        """
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # Build a fresh mapping on every call so pages from a previously
        # processed (longer) PDF cannot leak into this result, and so a
        # dictionary returned earlier is not mutated behind the caller's back.
        self.pages = {
            # Normalise a missing/empty extraction result to '' so the values
            # always match the annotated ``str`` type.
            page_num: page.extract_text() or ""
            for page_num, page in enumerate(pdf_reader.pages)
        }
        return self.pages

    def chunk_text(self, text: str, chunk_size: int = 1000) -> List[str]:
        """Split text into whitespace-normalised chunks of at most ``chunk_size`` characters.

        Words are never broken: a single word longer than ``chunk_size``
        becomes a chunk of its own (the only case where a chunk exceeds the
        limit). Runs of whitespace in ``text`` collapse to single spaces.

        Args:
            text: The text to split.
            chunk_size: Maximum length, in characters, of each joined chunk.

        Returns:
            The chunks in order; an empty list when ``text`` has no words.
        """
        words = text.split()
        chunks = []
        current_chunk = []
        current_size = 0  # always equals len(' '.join(current_chunk))
        for word in words:
            # A joining space is only needed when the chunk already has a word.
            candidate_size = current_size + len(word) + (1 if current_chunk else 0)
            # Only flush a non-empty chunk; otherwise an oversized first word
            # would emit a spurious empty-string chunk.
            if current_chunk and candidate_size > chunk_size:
                chunks.append(' '.join(current_chunk))
                current_chunk = [word]
                current_size = len(word)
            else:
                current_chunk.append(word)
                current_size = candidate_size
        if current_chunk:
            chunks.append(' '.join(current_chunk))
        return chunks