Spaces:

Shreyas094
/

GPT-Researcher

Running

App Files Files Community

GPT-Researcher / gpt_researcher /scraper /pymupdf /pymupdf.py

Shreyas094

Upload 528 files

372531f verified about 2 months ago

raw

history blame contribute delete

2.29 kB

	import os
	import requests
	import tempfile
	from urllib.parse import urlparse
	from langchain_community.document_loaders import PyMuPDFLoader


	class PyMuPDFScraper:
	    """Load a PDF from a URL or a local path and return it as a string.

	    Remote documents are streamed into a temporary ``.pdf`` file, handed to
	    ``PyMuPDFLoader`` and the temporary file is removed afterwards. Local
	    paths are passed to ``PyMuPDFLoader`` directly.
	    """

	    def __init__(self, link, session=None):
	        """
	        Initialize the scraper with a link and an optional session.

	        Args:
	            link (str): The URL or local file path of the PDF document.
	            session (requests.Session, optional): Session used for the HTTP
	                download when given; otherwise the module-level ``requests``
	                API is used.
	        """
	        self.link = link
	        self.session = session

	    def is_url(self) -> bool:
	        """
	        Check whether `link` parses as a URL with both a scheme and a host.

	        Returns:
	            bool: True if `urlparse` finds a scheme and a network location,
	            False otherwise (including when parsing raises).
	        """
	        try:
	            parsed = urlparse(self.link)
	            # A bare path ("file.pdf", "/tmp/a.pdf", "C:\\a.pdf") has no netloc.
	            return all([parsed.scheme, parsed.netloc])
	        except Exception:
	            return False

	    def scrape(self) -> "str | None":
	        """
	        Load the document behind `link` with PyMuPDFLoader.

	        Returns:
	            str | None: ``str()`` of whatever ``PyMuPDFLoader.load()``
	            returns, or None when the download or the load fails. Failures
	            are reported on stdout and never raised to the caller.
	        """
	        try:
	            if self.is_url():
	                doc = self._load_remote()
	            else:
	                doc = PyMuPDFLoader(self.link).load()
	            return str(doc)
	        except requests.exceptions.Timeout:
	            print(f"Download timed out. Please check the link : {self.link}")
	        except Exception as e:
	            print(f"Error loading PDF : {self.link} {e}")
	        # Every error path ends here; make the "no result" value explicit.
	        return None

	    def _load_remote(self):
	        """Download `link` to a temporary file, load it, and always clean up."""
	        # Honour the caller-supplied session (headers, cookies, proxies);
	        # the `requests` module exposes the same `get` signature.
	        http = self.session if self.session is not None else requests
	        temp_filename = None
	        try:
	            response = http.get(self.link, timeout=5, stream=True)
	            try:
	                response.raise_for_status()
	                # delete=False: the file must outlive this `with` so the loader
	                # can reopen it by name (required on Windows).
	                with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
	                    temp_filename = temp_file.name
	                    for chunk in response.iter_content(chunk_size=8192):
	                        temp_file.write(chunk)
	            finally:
	                # stream=True keeps the connection open until explicitly closed.
	                response.close()

	            return PyMuPDFLoader(temp_filename).load()
	        finally:
	            if temp_filename is not None:
	                try:
	                    os.remove(temp_filename)
	                except OSError:
	                    # Best-effort cleanup: a leftover temp file must not mask
	                    # the real result or the real error.
	                    pass