# fastapi-rag-react/backend/text_utils.py
# Commit 8066b54 ("initial deploy"), philipk22
from typing import List
import PyPDF2
import io
class CharacterTextSplitter:
    """Split documents into fixed-size, overlapping character chunks.

    Consecutive chunks taken from the same document share
    ``chunk_overlap`` characters, i.e. each chunk starts
    ``chunk_size - chunk_overlap`` characters after the previous one.
    """

    def __init__(self, chunk_size=1000, chunk_overlap=200):
        """Validate and store the chunking parameters.

        Args:
            chunk_size: Maximum number of characters per chunk; must be > 0.
            chunk_overlap: Number of characters shared by consecutive
                chunks; must satisfy ``0 <= chunk_overlap < chunk_size``.

        Raises:
            ValueError: If the parameters would stop the split window from
                moving forward through the text.
        """
        self._validate(chunk_size, chunk_overlap)
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

    @staticmethod
    def _validate(chunk_size, chunk_overlap):
        """Raise ``ValueError`` unless the window advances on every step."""
        if chunk_size <= 0:
            raise ValueError(
                f"chunk_size must be positive, got {chunk_size}"
            )
        if not 0 <= chunk_overlap < chunk_size:
            raise ValueError(
                "chunk_overlap must be >= 0 and smaller than chunk_size, "
                f"got chunk_overlap={chunk_overlap}, chunk_size={chunk_size}"
            )

    def split_texts(self, documents: List[str]) -> List[str]:
        """Split every document into overlapping chunks.

        Documents that are empty or contain only whitespace are skipped.
        Chunks from all documents are returned in one flat list, in
        document order.

        Args:
            documents: The texts to split.

        Returns:
            The chunks, each at most ``chunk_size`` characters long.

        Raises:
            ValueError: If ``chunk_size`` / ``chunk_overlap`` were changed
                to values that would keep the window from advancing.
        """
        # The attributes are public and may have been reassigned after
        # construction, so re-check them before entering the loop: a
        # non-positive step would otherwise loop forever.
        self._validate(self.chunk_size, self.chunk_overlap)
        step = self.chunk_size - self.chunk_overlap

        texts = []
        for doc in documents:
            if not doc.strip():
                continue
            doc_len = len(doc)
            start = 0
            while start < doc_len:
                end = start + self.chunk_size
                texts.append(doc[start:end])
                # Once a chunk reaches the end of the document, any further
                # chunk would only repeat the tail of this one.
                if end >= doc_len:
                    break
                start += step
        return texts
class TextFileLoader:
    """Load a UTF-8 encoded text file as a single document."""

    def __init__(self, file_path: str):
        """Remember the path of the text file to load.

        Args:
            file_path: Location of the file; it is opened only when
                ``load_documents`` is called.
        """
        self.file_path = file_path

    def load_documents(self) -> List[str]:
        """Read the whole file and return it as a one-element list.

        Returns:
            A list whose only item is the complete file content decoded
            as UTF-8.
        """
        with open(self.file_path, mode='r', encoding='utf-8') as source:
            contents = source.read()
        return [contents]
class PDFLoader:
    """Load the extracted text of a PDF file as a single document."""

    def __init__(self, file_path: str):
        """Remember the path of the PDF file to load.

        Args:
            file_path: Location of the PDF; it is opened only when
                ``load_documents`` is called.
        """
        self.file_path = file_path

    def load_documents(self) -> List[str]:
        """Extract the text of every page and return it as one document.

        Returns:
            A one-element list holding the text of all pages in page
            order, with a newline appended after each page's text.
        """
        with open(self.file_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            # Build the result with a single join instead of growing a
            # string with += once per page. Extraction stays inside the
            # ``with`` block so the file is still open while pages are read.
            text = "".join(
                page.extract_text() + "\n" for page in pdf_reader.pages
            )
        return [text]