Spaces:
Sleeping
Sleeping
| from langchain_community.document_loaders import PyPDFLoader | |
| import os | |
| from typing import List | |
class PDFProcessor:
    """
    Helper that pulls the textual content out of PDF documents.
    """

    def extract_text_from_pdfs(self, file_paths: List[str]) -> List[str]:
        """
        Gather the text of every document piece loaded from the given PDFs.

        Each path is opened with ``PyPDFLoader`` and the entries returned by
        its ``load_and_split()`` call are visited in order. Files are
        processed best-effort: when anything raises while handling a file,
        the error is printed and processing moves on to the next path. Text
        already gathered before the failure is kept.

        Args:
            file_paths (List[str]): Paths of the PDF documents to read.

        Returns:
            List[str]: One string per loaded entry, in input order. The
                result is empty when no path yields any text.
        """
        collected = []
        for pdf_path in file_paths:
            try:
                for document in PyPDFLoader(pdf_path).load_and_split():
                    content = document.page_content
                    if isinstance(content, bytes):
                        # Raw bytes are decoded leniently: undecodable
                        # sequences are dropped rather than raising.
                        collected.append(
                            content.decode('utf-8', errors='ignore')
                        )
                    elif isinstance(content, str):
                        collected.append(content)
                    else:
                        # Anything that is neither str nor bytes is
                        # reported and left out of the result.
                        print(f"Unexpected type: {type(content)}")
            except Exception as e:
                # Deliberately broad: one unreadable file must not abort
                # the whole batch.
                print(f"Failed to process {pdf_path}: {e}")
        return collected