Spaces:
Running
Running
| import asyncio | |
| from playwright.async_api import async_playwright | |
| from playwright_stealth import Stealth | |
| from bs4 import BeautifulSoup | |
| from crewai.tools import BaseTool | |
class StealthScrapeTool(BaseTool):
    """CrewAI tool that fetches a page with a stealth-patched Playwright
    Chromium browser and returns the prettified HTML of one element.

    Failures are reported as strings beginning with ``"Error"`` rather than
    raised, so the caller always receives a ``str``.
    """

    name: str = "Stealth Web Scraper"
    description: str = "A tool for stealthily scraping content from a given URL using Playwright and a CSS selector."

    async def _arun(self, website_url: str, css_element: str = "body") -> str:
        """Scrape ``website_url`` and return the HTML of ``css_element``.

        Args:
            website_url: Address of the page to load.
            css_element: CSS selector of the element to extract. If it does
                not appear within 30 seconds, the whole ``body`` is used
                instead.

        Returns:
            The prettified HTML of the first element matching the selector,
            or an ``"Error..."`` message if the element is missing or any
            step of the scrape fails.
        """
        # Imported locally so the timeout can be matched by type instead of
        # by searching the exception message for the word "Timeout".
        from playwright.async_api import TimeoutError as PlaywrightTimeoutError

        try:
            async with Stealth().use_async(async_playwright()) as p:
                browser = await p.chromium.launch(headless=True)
                try:
                    page = await browser.new_page()
                    await page.goto(website_url, timeout=120000)
                    try:
                        # Wait for the specific element to be present.
                        await page.wait_for_selector(css_element, timeout=30000)
                    except PlaywrightTimeoutError:
                        if css_element == "body":
                            # Nothing broader to fall back to.
                            raise
                        # The requested element never showed up; fall back
                        # to the whole document body with a longer timeout.
                        await page.wait_for_selector("body", timeout=60000)
                        css_element = "body"
                    html_content = await page.content()
                finally:
                    # Always release the browser, even when navigation or
                    # waiting fails part-way through.
                    await browser.close()

            # Parsing only needs the captured HTML, so it happens after the
            # browser has been shut down.
            soup = BeautifulSoup(html_content, 'html.parser')
            target_element = soup.select_one(css_element)
            if target_element:
                return target_element.prettify()
            return f"Error: Could not find element with selector '{css_element}' on the page."
        except Exception as e:
            # Tool boundary: report every failure as text instead of raising.
            return f"Error during stealth web scraping: {e}"

    def _run(self, website_url: str, css_element: str = "body") -> str:
        """Synchronous entry point; drives ``_arun`` to completion.

        Args:
            website_url: Address of the page to load.
            css_element: CSS selector of the element to extract; defaults to
                ``"body"`` to match ``_arun``.

        Returns:
            Whatever ``_arun`` returns for the same arguments.
        """
        try:
            asyncio.get_running_loop()
        except RuntimeError:
            # No event loop is running in this thread, so asyncio.run is safe.
            return asyncio.run(self._arun(website_url, css_element))

        # asyncio.run() refuses to start inside a running loop, so execute
        # the coroutine on a fresh loop in a worker thread and wait for it.
        import concurrent.futures

        with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
            future = pool.submit(asyncio.run, self._arun(website_url, css_element))
            return future.result()