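"""Crawler agent component: fetch a web page with crawl4ai and return it as
cleaned HTML, markdown, or extracted content."""
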
from abc import ABC
import asyncio

from crawl4ai import AsyncWebCrawler

from agent.component.base import ComponentBase, ComponentParamBase
from api.utils.web_utils import is_valid_url


class CrawlerParam(ComponentParamBase):
    """
    Define the Crawler component parameters.
    """

    def __init__(self):
        super().__init__()
        # Optional proxy URL forwarded to crawl4ai; None disables proxying.
        self.proxy = None
        # Which form of the crawled page to return: 'html', 'markdown', or 'content'.
        self.extract_type = "markdown"

    def check(self):
        self.check_valid_value(self.extract_type, "Type of content from the crawler", ['html', 'markdown', 'content'])


class Crawler(ComponentBase, ABC):
    component_name = "Crawler"

    def _run(self, history, **kwargs):
        # Join the upstream component's output into a single candidate URL.
        ans = self.get_input()
        ans = " - ".join(ans["content"]) if "content" in ans else ""
        if not is_valid_url(ans):
            return Crawler.be_output("URL not valid")
        try:
            result = asyncio.run(self.get_web(ans))
            return Crawler.be_output(result)
        except Exception as e:
            return Crawler.be_output(f"An unexpected error occurred: {str(e)}")

    async def get_web(self, url):
        proxy = self._param.proxy if self._param.proxy else None
        async with AsyncWebCrawler(verbose=True, proxy=proxy) as crawler:
            result = await crawler.arun(
                url=url,
                bypass_cache=True
            )
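
            # Return the crawl result in the form requested by `extract_type`.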
            if self._param.extract_type == 'html':
                return result.cleaned_html
            elif self._param.extract_type == 'markdown':
                return result.markdown
            elif self._param.extract_type == 'content':
                return result.extracted_content
            # Fall back to markdown if extract_type is unrecognized.
            return result.markdown
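

# A minimal standalone sketch, assuming this module is run manually inside the
# repo environment with crawl4ai installed; it exercises the same AsyncWebCrawler
# calls used above, outside the agent framework. The URL is illustrative only.
if __name__ == "__main__":
    async def _demo():
        async with AsyncWebCrawler(verbose=True) as crawler:
            result = await crawler.arun(url="https://example.com", bypass_cache=True)
            print(result.markdown)

    asyncio.run(_demo())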