Spaces:
Runtime error
Runtime error
"""Docs parser.

Contains parsers for docx and pdf files.
"""
from pathlib import Path | |
from typing import Dict | |
from gpt_index.readers.file.base_parser import BaseParser | |
class PDFParser(BaseParser):
    """Parser that extracts plain text from PDF files via PyPDF2."""

    def _init_parser(self) -> Dict:
        """Return the parser configuration, which is an empty dict."""
        return {}

    def parse_file(self, file: Path, errors: str = "ignore") -> str:
        """Extract the text of every page of a PDF file.

        Args:
            file: Path of the PDF file to read; opened in binary mode.
            errors: Not used by this parser.

        Returns:
            The extracted text of all pages, in page order, joined with
            newline characters.

        Raises:
            ImportError: If the PyPDF2 package is not installed.
        """
        # PyPDF2 is imported lazily so the dependency is only needed when a
        # PDF file is actually parsed.
        try:
            import PyPDF2
        except ImportError:
            raise ImportError(
                "PyPDF2 is required to read PDF files: `pip install PyPDF2`"
            )

        with open(file, "rb") as fp:
            pdf = PyPDF2.PdfReader(fp)
            # Keep extraction inside the `with` block so the file is still
            # open while the pages are read. Iterate the pages directly
            # rather than indexing through range(len(pdf.pages)).
            page_texts = [page.extract_text() for page in pdf.pages]

        return "\n".join(page_texts)
class DocxParser(BaseParser):
    """Parser that extracts plain text from Microsoft Word (docx) files."""

    def _init_parser(self) -> Dict:
        """Return the parser configuration, which is an empty dict."""
        return {}

    def parse_file(self, file: Path, errors: str = "ignore") -> str:
        """Extract the text content of a docx file.

        Args:
            file: Path of the docx file to read.
            errors: Not used by this parser.

        Returns:
            The text produced by ``docx2txt.process`` for the file.

        Raises:
            ImportError: If the docx2txt package is not installed.
        """
        # Lazy import: docx2txt is only required when a docx file is parsed.
        try:
            import docx2txt as word_extractor
        except ImportError:
            install_hint = (
                "docx2txt is required to read Microsoft Word files: "
                "`pip install docx2txt`"
            )
            raise ImportError(install_hint)

        return word_extractor.process(file)