Shreyas094's picture
Upload 528 files
372531f verified
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import requests
from ..utils import get_relevant_images, extract_title
class WebBaseLoaderScraper:
    """Scrape a single web page using LangChain's ``WebBaseLoader``.

    The text content comes from ``WebBaseLoader``; the image URLs and the
    title come from a second fetch of the same link through ``self.session``,
    parsed with BeautifulSoup.
    """

    def __init__(self, link, session=None):
        """Store the target link and the HTTP session used for the HTML fetch.

        Args:
            link: URL of the page to scrape.
            session: Object exposing ``get(url)``; when omitted (or falsy) a
                new ``requests.Session`` is created.
        """
        self.link = link
        self.session = session or requests.Session()

    def scrape(self) -> tuple:
        """Return the page text, its relevant images and its title.

        Returns:
            A 3-tuple ``(content, image_urls, title)``:

            * ``content`` - the ``page_content`` of every document returned
              by ``WebBaseLoader.load()``, concatenated in order.
            * ``image_urls`` - the result of ``get_relevant_images`` for the
              parsed page.
            * ``title`` - the result of ``extract_title`` for the parsed page.

            If anything fails (missing dependency, network error, parsing
            error), the error is printed and ``("", [], "")`` is returned;
            no exception propagates to the caller.
        """
        try:
            # Imported lazily so a missing langchain_community install is
            # reported through the same error path as any scraping failure.
            from langchain_community.document_loaders import WebBaseLoader

            loader = WebBaseLoader(self.link)
            # NOTE(review): TLS certificate verification is disabled for the
            # loader's requests; the session fetch below keeps the session's
            # own verification settings.
            loader.requests_kwargs = {"verify": False}
            docs = loader.load()
            content = "".join(doc.page_content for doc in docs)

            # Second fetch: the raw HTML is needed for image/title extraction.
            response = self.session.get(self.link)
            soup = BeautifulSoup(response.content, 'html.parser')
            image_urls = get_relevant_images(soup, self.link)

            # Extract the title using the utility function
            title = extract_title(soup)

            return content, image_urls, title

        except Exception as e:
            # Best-effort scraper: report the failure and hand back empties.
            print("Error! : " + str(e))
            return "", [], ""