from __future__ import annotations

import os
import pickle
import random
import string
import time
import traceback
from pathlib import Path
from sys import platform
from urllib.parse import urljoin

from bs4 import BeautifulSoup

from .processing.scrape_skills import (scrape_pdf_with_pymupdf,
                                       scrape_pdf_with_arxiv)
from ..utils import get_relevant_images, extract_title

FILE_DIR = Path(__file__).parent.parent

class BrowserScraper:
    """Scrape a web page with Selenium, seeding the session with Google cookies."""

    def __init__(self, url: str, session=None):
        self.url = url
        self.session = session
        self.selenium_web_browser = "chrome"
        self.headless = False
        self.user_agent = ("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
                           "AppleWebKit/537.36 (KHTML, like Gecko) "
                           "Chrome/128.0.0.0 Safari/537.36")
        self.driver = None
        self.use_browser_cookies = False
        self._import_selenium()  # Import only if used, to avoid unnecessary dependencies
        self.cookie_filename = f"{self._generate_random_string(8)}.pkl"
    def scrape(self) -> tuple:
        """Scrape self.url and return a (text, image_urls, title) tuple."""
        if not self.url:
            print("URL not specified")
            return "A URL was not specified, cancelling request to browse website.", [], ""

        try:
            self.setup_driver()
            self._visit_google_and_save_cookies()
            self._load_saved_cookies()
            self._add_header()

            text, image_urls, title = self.scrape_text_with_selenium()
            return text, image_urls, title
        except Exception as e:
            print(f"An error occurred during scraping: {str(e)}")
            print("Full stack trace:")
            print(traceback.format_exc())
            return f"An error occurred: {str(e)}\n\nStack trace:\n{traceback.format_exc()}", [], ""
        finally:
            if self.driver:
                self.driver.quit()
            self._cleanup_cookie_file()
    def _import_selenium(self):
        try:
            global webdriver, By, EC, WebDriverWait, TimeoutException, WebDriverException
            from selenium import webdriver
            from selenium.webdriver.common.by import By
            from selenium.webdriver.support import expected_conditions as EC
            from selenium.webdriver.support.wait import WebDriverWait
            from selenium.common.exceptions import TimeoutException, WebDriverException

            global ChromeOptions, FirefoxOptions, SafariOptions
            from selenium.webdriver.chrome.options import Options as ChromeOptions
            from selenium.webdriver.firefox.options import Options as FirefoxOptions
            from selenium.webdriver.safari.options import Options as SafariOptions
        except ImportError as e:
            print(f"Failed to import Selenium: {str(e)}")
            print("Please install Selenium and its dependencies to use BrowserScraper.")
            print("You can install Selenium using pip:")
            print("    pip install selenium")
            print("If you're using a virtual environment, make sure it's activated.")
            raise ImportError(
                "Selenium is required but not installed. See error message above for installation instructions.") from e
    def setup_driver(self) -> None:
        # print(f"Setting up {self.selenium_web_browser} driver...")
        options_available = {
            "chrome": ChromeOptions,
            "firefox": FirefoxOptions,
            "safari": SafariOptions,
        }
        options = options_available[self.selenium_web_browser]()
        options.add_argument(f"user-agent={self.user_agent}")
        if self.headless:
            options.add_argument("--headless")
        options.add_argument("--enable-javascript")

        try:
            if self.selenium_web_browser == "firefox":
                self.driver = webdriver.Firefox(options=options)
            elif self.selenium_web_browser == "safari":
                self.driver = webdriver.Safari(options=options)
            else:  # chrome
                if platform == "linux" or platform == "linux2":
                    options.add_argument("--disable-dev-shm-usage")
                    options.add_argument("--remote-debugging-port=9222")
                options.add_argument("--no-sandbox")
                options.add_experimental_option("prefs", {"download_restrictions": 3})
                self.driver = webdriver.Chrome(options=options)

            if self.use_browser_cookies:
                self._load_browser_cookies()
            # print(f"{self.selenium_web_browser.capitalize()} driver set up successfully.")
        except Exception as e:
            print(f"Failed to set up {self.selenium_web_browser} driver: {str(e)}")
            print("Full stack trace:")
            print(traceback.format_exc())
            raise
    def _load_saved_cookies(self):
        """Load saved cookies before visiting the target URL"""
        cookie_file = Path(self.cookie_filename)
        if cookie_file.exists():
            with cookie_file.open("rb") as f:
                cookies = pickle.load(f)
            for cookie in cookies:
                self.driver.add_cookie(cookie)
        else:
            print("No saved cookies found.")
    def _load_browser_cookies(self):
        """Load cookies directly from the local browser profile"""
        try:
            import browser_cookie3
        except ImportError:
            print("browser_cookie3 is not installed. Please install it using: pip install browser_cookie3")
            return

        if self.selenium_web_browser == "chrome":
            cookies = browser_cookie3.chrome()
        elif self.selenium_web_browser == "firefox":
            cookies = browser_cookie3.firefox()
        else:
            print(f"Cookie loading not supported for {self.selenium_web_browser}")
            return

        for cookie in cookies:
            self.driver.add_cookie({'name': cookie.name, 'value': cookie.value, 'domain': cookie.domain})
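
    # Usage note (a sketch; this flag is not exercised elsewhere in this file):
    # browser-cookie reuse is opt-in via use_browser_cookies before scrape() runs:
    #
    #     scraper = BrowserScraper("https://example.com")
    #     scraper.use_browser_cookies = True  # pulls cookies via browser_cookie3
    #     text, image_urls, title = scraper.scrape()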
    def _cleanup_cookie_file(self):
        """Remove the cookie file"""
        cookie_file = Path(self.cookie_filename)
        if cookie_file.exists():
            try:
                os.remove(self.cookie_filename)
            except Exception as e:
                print(f"Failed to remove cookie file: {str(e)}")
        else:
            print("No cookie file found to remove.")
    def _generate_random_string(self, length):
        """Generate a random alphanumeric string of the specified length"""
        return ''.join(random.choices(string.ascii_letters + string.digits, k=length))
    def _get_domain(self):
        """Get the domain from the URL, removing a leading 'www.' if present"""
        from urllib.parse import urlparse
        domain = urlparse(self.url).netloc
        return domain[4:] if domain.startswith('www.') else domain
    def _visit_google_and_save_cookies(self):
        """Visit Google and save cookies before navigating to the target URL"""
        try:
            self.driver.get("https://www.google.com")
            time.sleep(2)  # Wait for cookies to be set

            # Save cookies to a file
            cookies = self.driver.get_cookies()
            with open(self.cookie_filename, "wb") as f:
                pickle.dump(cookies, f)
            # print("Google cookies saved successfully.")
        except Exception as e:
            print(f"Failed to visit Google and save cookies: {str(e)}")
            print("Full stack trace:")
            print(traceback.format_exc())
    def scrape_text_with_selenium(self) -> tuple:
        """Load the page and extract its text, relevant images, and title."""
        self.driver.get(self.url)

        try:
            WebDriverWait(self.driver, 20).until(
                EC.presence_of_element_located((By.TAG_NAME, "body"))
            )
        except TimeoutException:
            print("Timed out waiting for page to load")
            print(f"Full stack trace:\n{traceback.format_exc()}")
            return "Page load timed out", [], ""

        self._scroll_to_bottom()

        if self.url.endswith(".pdf"):
            text = scrape_pdf_with_pymupdf(self.url)
            return text, [], ""
        elif "arxiv" in self.url:
            doc_num = self.url.split("/")[-1]
            text = scrape_pdf_with_arxiv(doc_num)
            return text, [], ""
        else:
            page_source = self.driver.execute_script("return document.body.outerHTML;")
            soup = BeautifulSoup(page_source, "html.parser")

            # Drop script and style elements before extracting text
            for script in soup(["script", "style"]):
                script.extract()

            text = self.get_text(soup)
            image_urls = get_relevant_images(soup, self.url)
            title = extract_title(soup)

            # Normalise whitespace: strip each line, break on double spaces, drop blanks
            lines = (line.strip() for line in text.splitlines())
            chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
            text = "\n".join(chunk for chunk in chunks if chunk)
            return text, image_urls, title
    def get_text(self, soup: BeautifulSoup) -> str:
        """Get the relevant text from the soup with improved filtering"""
        text_elements = []
        tags = ["h1", "h2", "h3", "h4", "h5", "p", "li", "div", "span"]

        for element in soup.find_all(tags):
            # Skip empty elements
            if not element.text.strip():
                continue

            # Skip elements with very short text (likely buttons or links)
            if len(element.text.split()) < 3:
                continue

            # Skip elements that are likely navigation or menus
            parent_classes = element.parent.get('class', [])
            if any(cls in ['nav', 'menu', 'sidebar', 'footer'] for cls in parent_classes):
                continue

            # Remove excess whitespace and join lines
            cleaned_text = ' '.join(element.text.split())
            text_elements.append(cleaned_text)

        # Join all text elements with blank lines between them
        return '\n\n'.join(text_elements)
    def _scroll_to_bottom(self):
        """Scroll to the bottom of the page to load all content"""
        last_height = self.driver.execute_script("return document.body.scrollHeight")
        while True:
            self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(2)  # Wait for content to load
            new_height = self.driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height
    def _scroll_to_percentage(self, ratio: float) -> None:
        """Scroll to a given ratio of the page height"""
        if ratio < 0 or ratio > 1:
            raise ValueError("Ratio should be between 0 and 1")
        self.driver.execute_script(f"window.scrollTo(0, document.body.scrollHeight * {ratio});")
    def _add_header(self) -> None:
        """Inject the overlay header script into the current page"""
        with open(f"{FILE_DIR}/browser/js/overlay.js", "r") as f:
            self.driver.execute_script(f.read())
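
# Minimal usage sketch (assumptions: Chrome and a matching chromedriver are on
# PATH, and the module is imported from within its package, since the relative
# imports above will not resolve when this file is run directly; the module
# name "browser_scraper" below is hypothetical):
#
#     from .browser_scraper import BrowserScraper
#
#     scraper = BrowserScraper("https://www.example.com")
#     text, image_urls, title = scraper.scrape()
#     print(title)
#     print(text[:500])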