SOAPAssistV00

Runtime error

Duplicate from AbeerTrial/SOAPAssist

35b22df over 1 year ago

1.63 kB

	"""Docs parser.

	Contains parsers for Microsoft Word (.docx) and PDF (.pdf) files.

	"""
	from pathlib import Path
	from typing import Dict

	from gpt_index.readers.file.base_parser import BaseParser


	class PDFParser(BaseParser):
	    """Parser that extracts plain text from PDF files using PyPDF2."""

	    def _init_parser(self) -> Dict:
	        """Return the parser configuration, which is an empty dict here."""
	        return {}

	    def parse_file(self, file: Path, errors: str = "ignore") -> str:
	        """Extract the text of every page of a PDF file.

	        Args:
	            file: Path of the PDF file to read.
	            errors: Not used by this parser; kept so the signature is
	                unchanged for existing callers.

	        Returns:
	            The extracted text of all pages, in page order, joined with
	            newline characters.

	        Raises:
	            ImportError: If PyPDF2 is not installed.
	        """
	        # Imported lazily so PyPDF2 is only required when a PDF is parsed.
	        try:
	            import PyPDF2
	        except ImportError as err:
	            # Chain explicitly so the original import failure stays visible.
	            raise ImportError(
	                "PyPDF2 is required to read PDF files: `pip install PyPDF2`"
	            ) from err

	        with open(file, "rb") as fp:
	            pdf = PyPDF2.PdfReader(fp)
	            # Extract inside the `with` block: the reader was created from
	            # the open file handle (presumably it reads pages on demand).
	            page_texts = [page.extract_text() for page in pdf.pages]

	        return "\n".join(page_texts)


	class DocxParser(BaseParser):
	    """Parser that extracts plain text from Microsoft Word files using docx2txt."""

	    def _init_parser(self) -> Dict:
	        """Return the parser configuration, which is an empty dict here."""
	        return {}

	    def parse_file(self, file: Path, errors: str = "ignore") -> str:
	        """Extract the text of a Microsoft Word (.docx) file.

	        Args:
	            file: Path of the Word document to read.
	            errors: Not used by this parser; kept so the signature is
	                unchanged for existing callers.

	        Returns:
	            The text returned by ``docx2txt.process`` for the file.

	        Raises:
	            ImportError: If docx2txt is not installed.
	        """
	        # Imported lazily so docx2txt is only required when a docx is parsed.
	        try:
	            import docx2txt
	        except ImportError as err:
	            # Chain explicitly so the original import failure stays visible.
	            raise ImportError(
	                "docx2txt is required to read Microsoft Word files: "
	                "`pip install docx2txt`"
	            ) from err

	        return docx2txt.process(file)