from concurrent.futures import ThreadPoolExecutor
from functools import partial
from colorama import Fore, init
import requests
import subprocess
import sys
import importlib.util
from . import (
ArxivScraper,
BeautifulSoupScraper,
PyMuPDFScraper,
WebBaseLoaderScraper,
BrowserScraper,
TavilyExtract
)
class Scraper:
"""
Scraper class to extract the content from the links
"""
    def __init__(self, urls, user_agent, scraper):
        """
        Initialize the Scraper class.

        Args:
            urls: List of URLs to scrape.
            user_agent: User-Agent string sent with every request.
            scraper: Name of the default scraper backend to use.
        """
self.urls = urls
self.session = requests.Session()
self.session.headers.update({"User-Agent": user_agent})
self.scraper = scraper
if self.scraper == "tavily_extract":
self._check_pkg(self.scraper)
def run(self):
"""
Extracts the content from the links
"""
partial_extract = partial(self.extract_data_from_url, session=self.session)
with ThreadPoolExecutor(max_workers=20) as executor:
contents = executor.map(partial_extract, self.urls)
res = [content for content in contents if content["raw_content"] is not None]
return res
    def _check_pkg(self, scraper_name: str) -> None:
        """
        Checks and ensures that the Python packages a scraper needs beyond requirements.txt
        are available. When adding a new scraper to the repo, update `pkg_map` with its
        required information and call `_check_pkg()` during initialization.
        """
pkg_map = {
"tavily_extract": {"package_installation_name": "tavily-python",
"import_name": "tavily"},
}
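        # Hypothetical example of another entry (illustrative names only) that would go
        # inside pkg_map above for a scraper needing an extra package:
        # "firecrawl": {"package_installation_name": "firecrawl-py", "import_name": "firecrawl"},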
        pkg = pkg_map[scraper_name]
if not importlib.util.find_spec(pkg["import_name"]):
pkg_inst_name = pkg["package_installation_name"]
init(autoreset=True)
print(Fore.YELLOW + f"{pkg_inst_name} not found. Attempting to install...")
try:
subprocess.check_call([sys.executable, "-m", "pip", "install", pkg_inst_name])
print(Fore.GREEN + f"{pkg_inst_name} installed successfully.")
except subprocess.CalledProcessError:
raise ImportError(
Fore.RED + f"Unable to install {pkg_inst_name}. Please install manually with "
f"`pip install -U {pkg_inst_name}`"
)
def extract_data_from_url(self, link, session):
"""
Extracts the data from the link
"""
        try:
            scraper_cls = self.get_scraper(link)
            scraper = scraper_cls(link, session)
            content, image_urls, title = scraper.scrape()

            # Discard pages whose extracted text is too short to be useful.
            if len(content) < 100:
                return {"url": link, "raw_content": None, "image_urls": [], "title": ""}
            return {"url": link, "raw_content": content, "image_urls": image_urls, "title": title}
        except Exception:
            return {"url": link, "raw_content": None, "image_urls": [], "title": ""}
def get_scraper(self, link):
"""
The function `get_scraper` determines the appropriate scraper class based on the provided link
or a default scraper if none matches.
Args:
link: The `get_scraper` method takes a `link` parameter which is a URL link to a webpage or a
PDF file. Based on the type of content the link points to, the method determines the appropriate
scraper class to use for extracting data from that content.
Returns:
The `get_scraper` method returns the scraper class based on the provided link. The method
checks the link to determine the appropriate scraper class to use based on predefined mappings
in the `SCRAPER_CLASSES` dictionary. If the link ends with ".pdf", it selects the
`PyMuPDFScraper` class. If the link contains "arxiv.org", it selects the `ArxivScraper
"""
SCRAPER_CLASSES = {
"pdf": PyMuPDFScraper,
"arxiv": ArxivScraper,
"bs": BeautifulSoupScraper,
"web_base_loader": WebBaseLoaderScraper,
"browser": BrowserScraper,
"tavily_extract": TavilyExtract
}
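        # Resolution order: PDF links first, then arXiv links, then the scraper configured
        # at initialization (self.scraper).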
scraper_key = None
if link.endswith(".pdf"):
scraper_key = "pdf"
elif "arxiv.org" in link:
scraper_key = "arxiv"
else:
scraper_key = self.scraper
scraper_class = SCRAPER_CLASSES.get(scraper_key)
if scraper_class is None:
            raise Exception(f"Scraper '{scraper_key}' not found.")
return scraper_class
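

# Minimal usage sketch (illustrative only, not part of the public API): assumes this module
# is imported from its package, that "bs" (BeautifulSoupScraper) is a valid default backend,
# and that the import path, URLs, and user agent below are placeholders.
#
#     from gpt_researcher.scraper import Scraper  # import path is an assumption
#
#     urls = ["https://example.com/article", "https://arxiv.org/abs/1706.03762"]
#     scraper = Scraper(urls, user_agent="Mozilla/5.0", scraper="bs")
#     results = scraper.run()
#     for item in results:
#         print(item["url"], item["title"], len(item["raw_content"] or ""))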