"""Web scraper."""
import logging
from typing import Any, Callable, Dict, List, Optional, Tuple
import requests
from gpt_index.readers.base import BaseReader
from gpt_index.readers.schema.base import Document


class SimpleWebPageReader(BaseReader):
    """Simple web page reader.

    Reads pages from the web.

    Args:
        html_to_text (bool): Whether to convert HTML to text.
            Requires `html2text` package.
    """

    def __init__(self, html_to_text: bool = False) -> None:
        """Initialize with parameters."""
        try:
            import html2text  # noqa: F401
        except ImportError:
            raise ImportError(
                "`html2text` package not found, please run `pip install html2text`"
            )
        self._html_to_text = html_to_text

    def load_data(self, urls: List[str]) -> List[Document]:
        """Load data from the input urls.

        Args:
            urls (List[str]): List of URLs to scrape.

        Returns:
            List[Document]: List of documents.
        """
        if not isinstance(urls, list):
            raise ValueError("urls must be a list of strings.")
        documents = []
        for url in urls:
            response = requests.get(url, headers=None).text
            if self._html_to_text:
                import html2text

                response = html2text.html2text(response)
            documents.append(Document(response))
        return documents
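
# A minimal usage sketch for SimpleWebPageReader (the URL is a placeholder and
# network access is assumed; `html2text` must be installed):
#
#   reader = SimpleWebPageReader(html_to_text=True)
#   docs = reader.load_data(["https://example.com"])
#   print(docs[0].text)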


class TrafilaturaWebReader(BaseReader):
    """Trafilatura web page reader.

    Reads pages from the web.
    Requires the `trafilatura` package.
    """

    def __init__(self, error_on_missing: bool = False) -> None:
        """Initialize with parameters.

        Args:
            error_on_missing (bool): Throw an error when data cannot be parsed.
        """
        self.error_on_missing = error_on_missing
        try:
            import trafilatura  # noqa: F401
        except ImportError:
            raise ImportError(
                "`trafilatura` package not found, please run `pip install trafilatura`"
            )

    def load_data(self, urls: List[str]) -> List[Document]:
        """Load data from the urls.

        Args:
            urls (List[str]): List of URLs to scrape.

        Returns:
            List[Document]: List of documents.
        """
        import trafilatura

        if not isinstance(urls, list):
            raise ValueError("urls must be a list of strings.")
        documents = []
        for url in urls:
            downloaded = trafilatura.fetch_url(url)
            if not downloaded:
                if self.error_on_missing:
                    raise ValueError(f"Trafilatura failed to fetch url: {url}")
                continue
            response = trafilatura.extract(downloaded)
            if not response:
                if self.error_on_missing:
                    raise ValueError(f"Trafilatura failed to parse page: {url}")
                continue
            documents.append(Document(response))
        return documents
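
# A minimal usage sketch for TrafilaturaWebReader (placeholder URL; assumes the
# `trafilatura` package is installed and the page is reachable):
#
#   reader = TrafilaturaWebReader(error_on_missing=True)
#   docs = reader.load_data(["https://example.com/article"])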


def _substack_reader(soup: Any) -> Tuple[str, Dict[str, Any]]:
    """Extract text from Substack blog post."""
    extra_info = {
        "Title of this Substack post": soup.select_one("h1.post-title").getText(),
        "Subtitle": soup.select_one("h3.subtitle").getText(),
        "Author": soup.select_one("span.byline-names").getText(),
    }
    text = soup.select_one("div.available-content").getText()
    return text, extra_info


DEFAULT_WEBSITE_EXTRACTOR: Dict[str, Callable[[Any], Tuple[str, Dict[str, Any]]]] = {
    "substack.com": _substack_reader,
}
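
# A hypothetical example of registering a site-specific extractor (the hostname
# and CSS selectors below are illustrative, not taken from any real site):
#
#   def _my_blog_reader(soup: Any) -> Tuple[str, Dict[str, Any]]:
#       extra_info = {"Title": soup.select_one("h1.title").getText()}
#       return soup.select_one("article").getText(), extra_info
#
#   reader = BeautifulSoupWebReader(
#       website_extractor={"myblog.example.com": _my_blog_reader}
#   )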


class BeautifulSoupWebReader(BaseReader):
    """BeautifulSoup web page reader.

    Reads pages from the web.
    Requires the `bs4` and `requests` packages.

    Args:
        website_extractor (Optional[Dict[str, Callable]]): A mapping of website
            hostname (e.g. google.com) to a function that specifies how to
            extract text from the BeautifulSoup obj. See DEFAULT_WEBSITE_EXTRACTOR.
    """

    def __init__(
        self,
        website_extractor: Optional[Dict[str, Callable]] = None,
    ) -> None:
        """Initialize with parameters."""
        try:
            from urllib.parse import urlparse  # noqa: F401
            import requests  # noqa: F401
            from bs4 import BeautifulSoup  # noqa: F401
        except ImportError:
            raise ImportError(
                "`bs4` and `requests` must be installed to scrape websites. "
                "Please run `pip install beautifulsoup4 requests`."
            )
        self.website_extractor = website_extractor or DEFAULT_WEBSITE_EXTRACTOR

    def load_data(
        self, urls: List[str], custom_hostname: Optional[str] = None
    ) -> List[Document]:
        """Load data from the urls.

        Args:
            urls (List[str]): List of URLs to scrape.
            custom_hostname (Optional[str]): Force a certain hostname in the case
                a website is displayed under custom URLs (e.g. Substack blogs).

        Returns:
            List[Document]: List of documents.
        """
        from urllib.parse import urlparse

        import requests
        from bs4 import BeautifulSoup

        documents = []
        for url in urls:
            try:
                page = requests.get(url)
            except Exception:
                raise ValueError(f"One of the inputs is not a valid url: {url}")

            hostname = custom_hostname or urlparse(url).hostname or ""
            soup = BeautifulSoup(page.content, "html.parser")

            data = ""
            extra_info = {"URL": url}
            if hostname in self.website_extractor:
                # Use the site-specific extractor when one is registered for
                # this hostname; otherwise fall back to the full page text.
                data, metadata = self.website_extractor[hostname](soup)
                extra_info.update(metadata)
            else:
                data = soup.getText()

            documents.append(Document(data, extra_info=extra_info))
        return documents
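
# A minimal usage sketch for BeautifulSoupWebReader (placeholder URL; the
# custom_hostname argument is only needed when a site such as a Substack blog
# is served from a domain other than the one registered in the extractor map):
#
#   reader = BeautifulSoupWebReader()
#   docs = reader.load_data(
#       ["https://someblog.substack.com/p/some-post"],
#       custom_hostname="substack.com",
#   )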


class RssReader(BaseReader):
    """RSS reader.

    Reads content from an RSS feed.
    """

    def __init__(self, html_to_text: bool = False) -> None:
        """Initialize with parameters.

        Args:
            html_to_text (bool): Whether to convert HTML to text.
                Requires `html2text` package.
        """
        try:
            import feedparser  # noqa: F401
        except ImportError:
            raise ImportError(
                "`feedparser` package not found, please run `pip install feedparser`"
            )
        if html_to_text:
            try:
                import html2text  # noqa: F401
            except ImportError:
                raise ImportError(
                    "`html2text` package not found, please run `pip install html2text`"
                )
        self._html_to_text = html_to_text

    def load_data(self, urls: List[str]) -> List[Document]:
        """Load data from RSS feeds.

        Args:
            urls (List[str]): List of RSS URLs to load.

        Returns:
            List[Document]: List of documents.
        """
        import feedparser

        if not isinstance(urls, list):
            raise ValueError("urls must be a list of strings.")
        documents = []
        for url in urls:
            parsed = feedparser.parse(url)
            for entry in parsed.entries:
                # Prefer the full entry content when the feed provides it;
                # otherwise fall back to the description or summary.
                if "content" in entry:
                    data = entry.content[0].value
                else:
                    data = entry.description or entry.summary
                if self._html_to_text:
                    import html2text

                    data = html2text.html2text(data)
                extra_info = {"title": entry.title, "link": entry.link}
                documents.append(Document(data, extra_info=extra_info))
        return documents
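
# A minimal usage sketch for RssReader (placeholder feed URL; assumes the
# `feedparser` package is installed):
#
#   reader = RssReader(html_to_text=True)
#   docs = reader.load_data(["https://example.com/feed.xml"])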


if __name__ == "__main__":
    # Quick manual check: without basicConfig the INFO message would be
    # dropped at the default WARNING log level.
    logging.basicConfig(level=logging.INFO)
    reader = SimpleWebPageReader()
    logging.info(reader.load_data(["http://www.google.com"]))