Spaces:
Running
Running
from bs4 import BeautifulSoup | |
from urllib.parse import urljoin | |
import requests | |
from ..utils import get_relevant_images, extract_title | |
class WebBaseLoaderScraper:
    """Scrape one web page: its text, its relevant images and its title.

    The page text is fetched through LangChain's ``WebBaseLoader``; images
    and title come from a second request made with ``session`` and parsed
    with BeautifulSoup.
    """

    def __init__(self, link, session=None):
        """Store the target URL and the HTTP session used for the metadata pass.

        Args:
            link: URL of the page to scrape.
            session: Object exposing ``get(url)``; a new ``requests.Session``
                is created when none (or a falsy value) is given.
        """
        self.link = link
        self.session = session or requests.Session()

    def scrape(self) -> tuple:
        """Fetch the page and return ``(content, image_urls, title)``.

        Returns:
            A 3-tuple:
              * ``content`` -- the ``page_content`` of every document returned
                by ``WebBaseLoader``, concatenated in order.
              * ``image_urls`` -- whatever ``get_relevant_images`` returns for
                the parsed page.
              * ``title`` -- whatever ``extract_title`` returns for the parsed
                page.

            If loading the text fails, the error is printed and
            ``("", [], "")`` is returned. If only the image/title pass fails,
            the error is printed and the text that was already scraped is
            still returned, as ``(content, [], "")``.
        """
        try:
            # Imported here so that a missing langchain_community install is
            # reported through the same error path as a failed fetch.
            from langchain_community.document_loaders import WebBaseLoader

            loader = WebBaseLoader(self.link)
            # NOTE(security): TLS certificate verification is turned off for
            # the loader's request. Kept as in the original; confirm this is
            # intended before relying on it for untrusted hosts.
            loader.requests_kwargs = {"verify": False}
            content = "".join(doc.page_content for doc in loader.load())
        except Exception as e:
            print("Error! : " + str(e))
            return "", [], ""

        # The metadata pass is a separate request made with the session's own
        # settings (the loader above ran with verify=False), so it can fail
        # even though the text was fetched. Handle it separately so that such
        # a failure does not throw the scraped text away.
        try:
            response = self.session.get(self.link)
            soup = BeautifulSoup(response.content, 'html.parser')
            image_urls = get_relevant_images(soup, self.link)
            # Extract the title using the utility function
            title = extract_title(soup)
        except Exception as e:
            print("Error! : " + str(e))
            image_urls, title = [], ""

        return content, image_urls, title