Spaces:

Inferno-721
/

Sutra_AI

Sleeping

File size: 1,142 Bytes

0753d2e

import PyPDF2
from typing import List, Dict

class PDFProcessor:
    """Extract per-page text from a PDF and split text into size-bounded chunks."""

    def __init__(self):
        # Text of the most recently extracted PDF, keyed by 0-based page index.
        self.pages = {}

    def extract_text(self, pdf_file) -> Dict[int, str]:
        """Extract text from PDF and return a dictionary of page numbers and text.

        Args:
            pdf_file: The PDF source, passed unchanged to ``PyPDF2.PdfReader``.

        Returns:
            A dictionary mapping 0-based page index to the text extracted
            from that page. The same dictionary is stored on ``self.pages``.
        """
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # Build a fresh mapping on every call so that pages from a previously
        # processed (longer) PDF never leak into this result.
        # ``or ""`` keeps the declared Dict[int, str] contract even if the
        # extractor yields a falsy/None value for a page.
        self.pages = {
            page_num: page.extract_text() or ""
            for page_num, page in enumerate(pdf_reader.pages)
        }
        return self.pages

    def chunk_text(self, text: str, chunk_size: int = 1000) -> List[str]:
        """Split text into whitespace-normalized chunks of at most ``chunk_size`` characters.

        Words are never split. Consecutive words are joined with a single
        space, and a chunk is closed as soon as adding the next word would
        push its length past ``chunk_size``. A single word longer than
        ``chunk_size`` is emitted as its own (oversized) chunk.

        Args:
            text: The text to split; any run of whitespace separates words.
            chunk_size: Maximum length, in characters, of each joined chunk.

        Returns:
            The list of chunks in original word order. Empty or
            whitespace-only text yields an empty list; no chunk is ever empty.
        """
        chunks = []
        current_chunk = []
        current_size = 0  # always equals len(' '.join(current_chunk))

        for word in text.split():
            # A separator space is only needed when the chunk already has a word.
            separator = 1 if current_chunk else 0
            projected_size = current_size + separator + len(word)

            # Only flush a non-empty chunk: an oversized first word must not
            # produce an empty-string chunk ahead of it.
            if current_chunk and projected_size > chunk_size:
                chunks.append(' '.join(current_chunk))
                current_chunk = [word]
                current_size = len(word)
            else:
                current_chunk.append(word)
                current_size = projected_size

        if current_chunk:
            chunks.append(' '.join(current_chunk))

        return chunks