Spaces:
Sleeping
Sleeping
File size: 1,142 Bytes
0753d2e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 |
import PyPDF2
from typing import List, Dict
class PDFProcessor:
    """Extract per-page text from a PDF and split text into word-aligned chunks.

    Attributes:
        pages: Mapping of 0-based page index to the text extracted from that
            page by the most recent call to ``extract_text``.
    """

    def __init__(self):
        # Populated by extract_text(); empty until a PDF has been processed.
        self.pages: Dict[int, str] = {}

    def extract_text(self, pdf_file) -> Dict[int, str]:
        """Extract text from PDF and return a dictionary of page numbers and text.

        Args:
            pdf_file: Anything accepted by ``PyPDF2.PdfReader`` (passed through
                unchanged).

        Returns:
            A dict keyed by 0-based page index whose values are the extracted
            page text. The same dict is stored on ``self.pages``.
        """
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # Build a fresh mapping on every call so pages from a previously
        # processed (longer) PDF cannot leak into this result.
        pages: Dict[int, str] = {}
        for page_num, page in enumerate(pdf_reader.pages):
            # Fall back to "" so the mapping always holds str values even if
            # the extractor yields nothing for a page.
            pages[page_num] = page.extract_text() or ""
        self.pages = pages
        return self.pages

    def chunk_text(self, text: str, chunk_size: int = 1000) -> List[str]:
        """Split text into chunks of specified size.

        The text is split on whitespace and words are greedily packed into
        chunks joined by single spaces. Words are never broken apart.

        Args:
            text: The text to split.
            chunk_size: Maximum length, in characters, of each joined chunk.
                A single word longer than this is emitted as its own chunk.

        Returns:
            The list of chunks in order; empty when ``text`` contains no words.
        """
        chunks: List[str] = []
        current_chunk: List[str] = []
        # Invariant: current_size == len(' '.join(current_chunk)).
        current_size = 0
        for word in text.split():
            # Length the chunk would have if this word were appended; the
            # separating space only exists when the chunk already has a word.
            if current_chunk:
                candidate_size = current_size + 1 + len(word)
            else:
                candidate_size = len(word)

            # Only flush a non-empty chunk: an oversized first word must not
            # produce an empty leading chunk.
            if current_chunk and candidate_size > chunk_size:
                chunks.append(' '.join(current_chunk))
                current_chunk = [word]
                current_size = len(word)
            else:
                current_chunk.append(word)
                current_size = candidate_size

        if current_chunk:
            chunks.append(' '.join(current_chunk))
        return chunks
|