from urllib.parse import urljoin

from bs4 import BeautifulSoup
import requests

from ..utils import get_relevant_images, extract_title


class WebBaseLoaderScraper:
    """Scrape a single web page using LangChain's ``WebBaseLoader``.

    The page text comes from ``WebBaseLoader``; the page is then fetched a
    second time through the ``requests`` session so that images and the
    title can be extracted from the parsed HTML.
    """

    def __init__(self, link, session=None):
        """Store the target URL and the HTTP session used for the HTML fetch.

        Args:
            link: URL of the page to scrape.
            session: Optional ``requests.Session`` to reuse. A new session is
                created when this is omitted (or otherwise falsy).
        """
        self.link = link
        self.session = session or requests.Session()

    def scrape(self) -> tuple:
        """Load the page and return its text, relevant images, and title.

        Returns:
            A 3-tuple ``(content, image_urls, title)``:

            * ``content`` -- the ``page_content`` of every document returned
              by ``WebBaseLoader``, concatenated in order.
            * ``image_urls`` -- the result of ``get_relevant_images`` for the
              parsed page.
            * ``title`` -- the result of ``extract_title`` for the parsed
              page.

            If any step raises, the error is printed and
            ``("", [], "")`` is returned instead; no exception propagates.
        """
        try:
            # Imported inside the ``try`` so that a missing
            # ``langchain_community`` install is reported through the same
            # error path as any other scraping failure.
            from langchain_community.document_loaders import WebBaseLoader

            loader = WebBaseLoader(self.link)
            # NOTE(security): this disables TLS certificate verification for
            # the loader's requests. Kept for backward compatibility; review
            # before using against untrusted networks.
            loader.requests_kwargs = {"verify": False}
            docs = loader.load()
            content = "".join(doc.page_content for doc in docs)

            # Second fetch of the same URL: the raw HTML is needed for image
            # and title extraction.
            response = self.session.get(self.link)
            soup = BeautifulSoup(response.content, 'html.parser')
            image_urls = get_relevant_images(soup, self.link)

            # Extract the title using the utility function
            title = extract_title(soup)

            return content, image_urls, title

        except Exception as e:
            print("Error! : " + str(e))
            return "", [], ""