"""Web scraper."""
import logging
from typing import Any, Callable, Dict, List, Optional, Tuple
import requests
from gpt_index.readers.base import BaseReader
from gpt_index.readers.schema.base import Document


class SimpleWebPageReader(BaseReader):
    """Simple web page reader.

    Reads pages from the web.

    Args:
        html_to_text (bool): Whether to convert HTML to text.
            Requires `html2text` package.
    """

    def __init__(self, html_to_text: bool = False) -> None:
        """Initialize with parameters."""
        try:
            import html2text  # noqa: F401
        except ImportError:
            raise ImportError(
                "`html2text` package not found, please run `pip install html2text`"
            )
        self._html_to_text = html_to_text

    def load_data(self, urls: List[str]) -> List[Document]:
        """Load data from the input urls.

        Args:
            urls (List[str]): List of URLs to scrape.

        Returns:
            List[Document]: List of documents.
        """
        if not isinstance(urls, list):
            raise ValueError("urls must be a list of strings.")
        documents = []
        for url in urls:
            response = requests.get(url, headers=None).text
            if self._html_to_text:
                import html2text

                response = html2text.html2text(response)
            documents.append(Document(response))
        return documents
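
# A minimal usage sketch for SimpleWebPageReader (the URL is a placeholder and
# network access is assumed; `html2text` must be installed):
#
#   reader = SimpleWebPageReader(html_to_text=True)
#   docs = reader.load_data(["https://example.com"])
#   print(docs[0].text)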


class TrafilaturaWebReader(BaseReader):
    """Trafilatura web page reader.

    Reads pages from the web.
    Requires the `trafilatura` package.
    """

    def __init__(self, error_on_missing: bool = False) -> None:
        """Initialize with parameters.

        Args:
            error_on_missing (bool): Throw an error when data cannot be parsed.
        """
        self.error_on_missing = error_on_missing
        try:
            import trafilatura  # noqa: F401
        except ImportError:
            raise ImportError(
                "`trafilatura` package not found, please run `pip install trafilatura`"
            )

    def load_data(self, urls: List[str]) -> List[Document]:
        """Load data from the urls.

        Args:
            urls (List[str]): List of URLs to scrape.

        Returns:
            List[Document]: List of documents.
        """
        import trafilatura

        if not isinstance(urls, list):
            raise ValueError("urls must be a list of strings.")
        documents = []
        for url in urls:
            downloaded = trafilatura.fetch_url(url)
            if not downloaded:
                if self.error_on_missing:
                    raise ValueError(f"Trafilatura failed to fetch url: {url}")
                continue
            response = trafilatura.extract(downloaded)
            if not response:
                if self.error_on_missing:
                    raise ValueError(f"Trafilatura failed to parse page: {url}")
                continue
            documents.append(Document(response))
        return documents
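
# A minimal usage sketch for TrafilaturaWebReader (placeholder URL; assumes the
# `trafilatura` package is installed and the page is reachable):
#
#   reader = TrafilaturaWebReader(error_on_missing=True)
#   docs = reader.load_data(["https://example.com/article"])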


def _substack_reader(soup: Any) -> Tuple[str, Dict[str, Any]]:
    """Extract text from Substack blog post."""
    extra_info = {
        "Title of this Substack post": soup.select_one("h1.post-title").getText(),
        "Subtitle": soup.select_one("h3.subtitle").getText(),
        "Author": soup.select_one("span.byline-names").getText(),
    }
    text = soup.select_one("div.available-content").getText()
    return text, extra_info


DEFAULT_WEBSITE_EXTRACTOR: Dict[str, Callable[[Any], Tuple[str, Dict[str, Any]]]] = {
    "substack.com": _substack_reader,
}
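
# A hypothetical example of registering a site-specific extractor (the hostname
# and CSS selectors below are illustrative, not taken from any real site):
#
#   def _my_blog_reader(soup: Any) -> Tuple[str, Dict[str, Any]]:
#       extra_info = {"Title": soup.select_one("h1.title").getText()}
#       return soup.select_one("article").getText(), extra_info
#
#   reader = BeautifulSoupWebReader(
#       website_extractor={"myblog.example.com": _my_blog_reader}
#   )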


class BeautifulSoupWebReader(BaseReader):
    """BeautifulSoup web page reader.

    Reads pages from the web.
    Requires the `bs4` and `requests` packages.

    Args:
        website_extractor (Optional[Dict[str, Callable]]): A mapping of website
            hostname (e.g. google.com) to a function that specifies how to
            extract text from the BeautifulSoup obj. See DEFAULT_WEBSITE_EXTRACTOR.
    """

    def __init__(
        self,
        website_extractor: Optional[Dict[str, Callable]] = None,
    ) -> None:
        """Initialize with parameters."""
        try:
            from urllib.parse import urlparse  # noqa: F401
            import requests  # noqa: F401
            from bs4 import BeautifulSoup  # noqa: F401
        except ImportError:
            raise ImportError(
                "`bs4` and `requests` must be installed to scrape websites. "
                "Please run `pip install beautifulsoup4 requests`."
            )
        self.website_extractor = website_extractor or DEFAULT_WEBSITE_EXTRACTOR

    def load_data(
        self, urls: List[str], custom_hostname: Optional[str] = None
    ) -> List[Document]:
        """Load data from the urls.

        Args:
            urls (List[str]): List of URLs to scrape.
            custom_hostname (Optional[str]): Force a certain hostname in the case
                a website is displayed under custom URLs (e.g. Substack blogs).

        Returns:
            List[Document]: List of documents.
        """
        from urllib.parse import urlparse

        import requests
        from bs4 import BeautifulSoup

        documents = []
        for url in urls:
            try:
                page = requests.get(url)
            except Exception:
                raise ValueError(f"One of the inputs is not a valid url: {url}")

            hostname = custom_hostname or urlparse(url).hostname or ""
            soup = BeautifulSoup(page.content, "html.parser")

            data = ""
            extra_info = {"URL": url}
            if hostname in self.website_extractor:
                # Use the site-specific extractor when one is registered for
                # this hostname; otherwise fall back to the full page text.
                data, metadata = self.website_extractor[hostname](soup)
                extra_info.update(metadata)
            else:
                data = soup.getText()

            documents.append(Document(data, extra_info=extra_info))
        return documents
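
# A minimal usage sketch for BeautifulSoupWebReader (placeholder URL; the
# custom_hostname argument is only needed when a site such as a Substack blog
# is served from a domain other than the one registered in the extractor map):
#
#   reader = BeautifulSoupWebReader()
#   docs = reader.load_data(
#       ["https://someblog.substack.com/p/some-post"],
#       custom_hostname="substack.com",
#   )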


class RssReader(BaseReader):
    """RSS reader.

    Reads content from an RSS feed.
    """

    def __init__(self, html_to_text: bool = False) -> None:
        """Initialize with parameters.

        Args:
            html_to_text (bool): Whether to convert HTML to text.
                Requires `html2text` package.
        """
        try:
            import feedparser  # noqa: F401
        except ImportError:
            raise ImportError(
                "`feedparser` package not found, please run `pip install feedparser`"
            )
        if html_to_text:
            try:
                import html2text  # noqa: F401
            except ImportError:
                raise ImportError(
                    "`html2text` package not found, please run `pip install html2text`"
                )
        self._html_to_text = html_to_text

    def load_data(self, urls: List[str]) -> List[Document]:
        """Load data from RSS feeds.

        Args:
            urls (List[str]): List of RSS URLs to load.

        Returns:
            List[Document]: List of documents.
        """
        import feedparser

        if not isinstance(urls, list):
            raise ValueError("urls must be a list of strings.")
        documents = []
        for url in urls:
            parsed = feedparser.parse(url)
            for entry in parsed.entries:
                # Prefer the full entry content when the feed provides it;
                # otherwise fall back to the description or summary.
                if "content" in entry:
                    data = entry.content[0].value
                else:
                    data = entry.description or entry.summary
                if self._html_to_text:
                    import html2text

                    data = html2text.html2text(data)
                extra_info = {"title": entry.title, "link": entry.link}
                documents.append(Document(data, extra_info=extra_info))
        return documents
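
# A minimal usage sketch for RssReader (placeholder feed URL; assumes the
# `feedparser` package is installed):
#
#   reader = RssReader(html_to_text=True)
#   docs = reader.load_data(["https://example.com/feed.xml"])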


if __name__ == "__main__":
    # Quick manual check: without basicConfig the INFO message would be
    # dropped at the default WARNING log level.
    logging.basicConfig(level=logging.INFO)
    reader = SimpleWebPageReader()
    logging.info(reader.load_data(["http://www.google.com"]))