from __future__ import annotations
import os
import pickle
import random
import string
import time
import traceback
from pathlib import Path
from sys import platform
from urllib.parse import urlparse

from bs4 import BeautifulSoup

from .processing.scrape_skills import (scrape_pdf_with_pymupdf,
                                       scrape_pdf_with_arxiv)
from ..utils import get_relevant_images, extract_title

FILE_DIR = Path(__file__).parent.parent

class BrowserScraper:
def __init__(self, url: str, session=None):
self.url = url
self.session = session
self.selenium_web_browser = "chrome"
self.headless = False
self.user_agent = ("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
"AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/128.0.0.0 Safari/537.36")
self.driver = None
self.use_browser_cookies = False
        self._import_selenium()  # Deferred import so Selenium is only required when this class is used
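        # Random per-instance filename so concurrent scrapers don't overwrite
        # each other's saved cookies.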
self.cookie_filename = f"{self._generate_random_string(8)}.pkl"
def scrape(self) -> tuple:
if not self.url:
print("URL not specified")
return "A URL was not specified, cancelling request to browse website.", [], ""
try:
self.setup_driver()
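            # Visit Google first to collect baseline consent/session cookies,
            # then replay them before requesting the target URL.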
self._visit_google_and_save_cookies()
self._load_saved_cookies()
self._add_header()
text, image_urls, title = self.scrape_text_with_selenium()
return text, image_urls, title
except Exception as e:
print(f"An error occurred during scraping: {str(e)}")
print("Full stack trace:")
print(traceback.format_exc())
return f"An error occurred: {str(e)}\n\nStack trace:\n{traceback.format_exc()}", [], ""
finally:
if self.driver:
self.driver.quit()
self._cleanup_cookie_file()
def _import_selenium(self):
try:
global webdriver, By, EC, WebDriverWait, TimeoutException, WebDriverException
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.common.exceptions import TimeoutException, WebDriverException
global ChromeOptions, FirefoxOptions, SafariOptions
from selenium.webdriver.chrome.options import Options as ChromeOptions
from selenium.webdriver.firefox.options import Options as FirefoxOptions
from selenium.webdriver.safari.options import Options as SafariOptions
except ImportError as e:
print(f"Failed to import Selenium: {str(e)}")
print("Please install Selenium and its dependencies to use BrowserScraper.")
print("You can install Selenium using pip:")
print(" pip install selenium")
print("If you're using a virtual environment, make sure it's activated.")
raise ImportError(
"Selenium is required but not installed. See error message above for installation instructions.") from e
def setup_driver(self) -> None:
# print(f"Setting up {self.selenium_web_browser} driver...")
options_available = {
"chrome": ChromeOptions,
"firefox": FirefoxOptions,
"safari": SafariOptions,
}
options = options_available[self.selenium_web_browser]()
options.add_argument(f"user-agent={self.user_agent}")
if self.headless:
options.add_argument("--headless")
options.add_argument("--enable-javascript")
try:
if self.selenium_web_browser == "firefox":
self.driver = webdriver.Firefox(options=options)
elif self.selenium_web_browser == "safari":
self.driver = webdriver.Safari(options=options)
else: # chrome
                if platform in ("linux", "linux2"):
options.add_argument("--disable-dev-shm-usage")
options.add_argument("--remote-debugging-port=9222")
options.add_argument("--no-sandbox")
options.add_experimental_option("prefs", {"download_restrictions": 3})
self.driver = webdriver.Chrome(options=options)
if self.use_browser_cookies:
self._load_browser_cookies()
# print(f"{self.selenium_web_browser.capitalize()} driver set up successfully.")
except Exception as e:
print(f"Failed to set up {self.selenium_web_browser} driver: {str(e)}")
print("Full stack trace:")
print(traceback.format_exc())
raise
def _load_saved_cookies(self):
"""Load saved cookies before visiting the target URL"""
cookie_file = Path(self.cookie_filename)
if cookie_file.exists():
            with open(self.cookie_filename, "rb") as f:
                cookies = pickle.load(f)
for cookie in cookies:
self.driver.add_cookie(cookie)
else:
print("No saved cookies found.")
def _load_browser_cookies(self):
"""Load cookies directly from the browser"""
try:
import browser_cookie3
except ImportError:
print("browser_cookie3 is not installed. Please install it using: pip install browser_cookie3")
return
if self.selenium_web_browser == "chrome":
cookies = browser_cookie3.chrome()
elif self.selenium_web_browser == "firefox":
cookies = browser_cookie3.firefox()
else:
print(f"Cookie loading not supported for {self.selenium_web_browser}")
return
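        # Selenium only accepts a cookie whose domain matches the page the driver
        # is currently on; mismatched cookies raise InvalidCookieDomainException.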
for cookie in cookies:
self.driver.add_cookie({'name': cookie.name, 'value': cookie.value, 'domain': cookie.domain})
def _cleanup_cookie_file(self):
"""Remove the cookie file"""
cookie_file = Path(self.cookie_filename)
if cookie_file.exists():
try:
os.remove(self.cookie_filename)
except Exception as e:
print(f"Failed to remove cookie file: {str(e)}")
else:
print("No cookie file found to remove.")
    def _generate_random_string(self, length: int) -> str:
"""Generate a random string of specified length"""
return ''.join(random.choices(string.ascii_letters + string.digits, k=length))
def _get_domain(self):
"""Extract domain from URL"""
from urllib.parse import urlparse
"""Get domain from URL, removing 'www' if present"""
domain = urlparse(self.url).netloc
return domain[4:] if domain.startswith('www.') else domain
def _visit_google_and_save_cookies(self):
"""Visit Google and save cookies before navigating to the target URL"""
try:
self.driver.get("https://www.google.com")
time.sleep(2) # Wait for cookies to be set
# Save cookies to a file
cookies = self.driver.get_cookies()
            with open(self.cookie_filename, "wb") as f:
                pickle.dump(cookies, f)
# print("Google cookies saved successfully.")
except Exception as e:
print(f"Failed to visit Google and save cookies: {str(e)}")
print("Full stack trace:")
print(traceback.format_exc())
def scrape_text_with_selenium(self) -> tuple:
self.driver.get(self.url)
try:
WebDriverWait(self.driver, 20).until(
EC.presence_of_element_located((By.TAG_NAME, "body"))
)
        except TimeoutException:
print("Timed out waiting for page to load")
print(f"Full stack trace:\n{traceback.format_exc()}")
return "Page load timed out", [], ""
self._scroll_to_bottom()
if self.url.endswith(".pdf"):
text = scrape_pdf_with_pymupdf(self.url)
return text, [], ""
elif "arxiv" in self.url:
doc_num = self.url.split("/")[-1]
text = scrape_pdf_with_arxiv(doc_num)
return text, [], ""
else:
page_source = self.driver.execute_script("return document.body.outerHTML;")
soup = BeautifulSoup(page_source, "html.parser")
for script in soup(["script", "style"]):
script.extract()
text = self.get_text(soup)
image_urls = get_relevant_images(soup, self.url)
title = extract_title(soup)
            # Collapse whitespace: phrases are separated by runs of double spaces,
            # so splitting on "  " keeps multi-word phrases intact.
            lines = (line.strip() for line in text.splitlines())
            chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
            text = "\n".join(chunk for chunk in chunks if chunk)
return text, image_urls, title
def get_text(self, soup: BeautifulSoup) -> str:
"""Get the relevant text from the soup with improved filtering"""
text_elements = []
tags = ["h1", "h2", "h3", "h4", "h5", "p", "li", "div", "span"]
for element in soup.find_all(tags):
# Skip empty elements
if not element.text.strip():
continue
# Skip elements with very short text (likely buttons or links)
if len(element.text.split()) < 3:
continue
# Check if the element is likely to be navigation or a menu
parent_classes = element.parent.get('class', [])
if any(cls in ['nav', 'menu', 'sidebar', 'footer'] for cls in parent_classes):
continue
# Remove excess whitespace and join lines
cleaned_text = ' '.join(element.text.split())
# Add the cleaned text to our list of elements
text_elements.append(cleaned_text)
# Join all text elements with newlines
return '\n\n'.join(text_elements)
def _scroll_to_bottom(self):
"""Scroll to the bottom of the page to load all content"""
last_height = self.driver.execute_script("return document.body.scrollHeight")
while True:
self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
time.sleep(2) # Wait for content to load
new_height = self.driver.execute_script("return document.body.scrollHeight")
if new_height == last_height:
break
last_height = new_height
def _scroll_to_percentage(self, ratio: float) -> None:
"""Scroll to a percentage of the page"""
if ratio < 0 or ratio > 1:
raise ValueError("Percentage should be between 0 and 1")
self.driver.execute_script(f"window.scrollTo(0, document.body.scrollHeight * {ratio});")
def _add_header(self) -> None:
"""Add a header to the website"""
self.driver.execute_script(open(f"{FILE_DIR}/browser/js/overlay.js", "r").read())
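

# Minimal usage sketch, assuming Chrome is installed and this module is imported
# from its package (the relative imports above prevent running it directly as a
# script):
#
#   scraper = BrowserScraper("https://example.com/article")
#   text, image_urls, title = scraper.scrape()
#   print(title)
#   print(text[:500])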