Spaces:
Sleeping
Sleeping
import PyPDF2 | |
from typing import List, Dict | |
class PDFProcessor:
    """Extract per-page text from a PDF and split text into word-aligned chunks."""

    def __init__(self):
        # Maps zero-based page index -> text extracted from that page by the
        # most recent extract_text() call.
        self.pages = {}

    def extract_text(self, pdf_file) -> Dict[int, str]:
        """Extract text from a PDF, one entry per page.

        Args:
            pdf_file: Whatever ``PyPDF2.PdfReader`` accepts as its source;
                it is passed through unchanged.

        Returns:
            A dictionary mapping the zero-based page index to the text
            extracted from that page. The same dictionary is stored on
            ``self.pages``.
        """
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # Build a fresh mapping on every call so that pages from a previously
        # processed (longer) PDF cannot leak into this document's result.
        self.pages = {
            page_num: page.extract_text()
            for page_num, page in enumerate(pdf_reader.pages)
        }
        return self.pages

    def chunk_text(self, text: str, chunk_size: int = 1000) -> List[str]:
        """Split text into whitespace-normalized chunks of bounded length.

        Words (as produced by ``str.split()``) are never broken apart and are
        re-joined with single spaces, so original whitespace is not preserved.

        Args:
            text: The text to split.
            chunk_size: Maximum length, in characters, of each joined chunk.

        Returns:
            A list of non-empty chunks. Every chunk is at most ``chunk_size``
            characters long, except that a single word longer than
            ``chunk_size`` is emitted as a chunk of its own. Empty or
            whitespace-only text yields an empty list.
        """
        chunks: List[str] = []
        current_chunk: List[str] = []
        # Invariant: current_size == len(' '.join(current_chunk)).
        current_size = 0
        for word in text.split():
            # A separating space is only needed when the chunk already holds
            # a word; counting it unconditionally would under-fill chunks.
            added = len(word) + (1 if current_chunk else 0)
            # Only flush a non-empty chunk: an oversized word arriving on an
            # empty chunk must not produce an empty-string chunk.
            if current_chunk and current_size + added > chunk_size:
                chunks.append(' '.join(current_chunk))
                current_chunk = [word]
                current_size = len(word)
            else:
                current_chunk.append(word)
                current_size += added
        if current_chunk:
            chunks.append(' '.join(current_chunk))
        return chunks