import asyncio
from playwright.async_api import async_playwright
from playwright_stealth import Stealth
from bs4 import BeautifulSoup
from crewai.tools import BaseTool

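# This tool pairs Playwright's headless Chromium with playwright_stealth's
# evasion patches (applied via the Stealth().use_async(...) context below) to
# reduce the chance of bot detection, then extracts and cleans one element's HTML.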
class StealthScrapeTool(BaseTool):
    name: str = "Stealth Web Scraper"
    description: str = "A tool for stealthily scraping content from a given URL using Playwright and a CSS selector."

    async def _arun(self, website_url: str, css_element: str = "body", wait_for_selectors: list[str] | None = None) -> str:
        """Scrape `css_element` from `website_url`, optionally waiting for extra selectors first."""
        try:
            async with Stealth().use_async(async_playwright()) as p:
                browser = await p.chromium.launch(headless=True)
                page = await browser.new_page()
                
                print(f"StealthScrapeTool: Starting scraping for {website_url}...")
                print(f"StealthScrapeTool: Navigating to {website_url}")
                await page.goto(website_url, timeout=120000)
                await asyncio.sleep(5)
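                # Fixed delay lets client-side rendering settle. An alternative is
                # page.wait_for_load_state("networkidle"), but a flat sleep is more
                # forgiving on pages that keep background requests open.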

                # Scroll to the bottom of the page repeatedly to load all dynamic content
                print("StealthScrapeTool: Scrolling through the page to load dynamic content...")
                print("StealthScrapeTool: Getting initial scrollHeight...")
                last_height = await page.evaluate("document.body.scrollHeight")
                print(f"StealthScrapeTool: Initial scrollHeight: {last_height}")
                scroll_attempts = 0
                max_scroll_attempts = 10

                while scroll_attempts < max_scroll_attempts:
                    print(f"StealthScrapeTool: Scroll attempt {scroll_attempts + 1}")
                    print("StealthScrapeTool: Scrolling to bottom...")
                    await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
                    print("StealthScrapeTool: Scrolled. Waiting for content to load...")

                    await asyncio.sleep(5)

                    print("StealthScrapeTool: Getting new scrollHeight...")
                    new_height = await page.evaluate("document.body.scrollHeight")
                    print(f"StealthScrapeTool: New scrollHeight: {new_height}")
                    if new_height == last_height:
                        print("StealthScrapeTool: ScrollHeight unchanged. Breaking scroll loop.")
                        break
                    last_height = new_height
                    scroll_attempts += 1
                print("StealthScrapeTool: Finished scrolling.")
                    
                print(f"StealthScrapeTool: Page loaded. Attempting to find element with selector '{css_element}'")
                
                # Element waiting logic
                selectors_to_wait_for = []
                if wait_for_selectors:
                    print("StealthScrapeTool: Additional selectors to wait for provided.")
                    selectors_to_wait_for.extend(wait_for_selectors)
                
                # Always include css_element in the list of selectors to wait for
                selectors_to_wait_for.append(css_element)

                combined_selector = ", ".join(selectors_to_wait_for)
                print(f"StealthScrapeTool: Waiting for selectors: {combined_selector}")
                # A comma-joined selector list matches as soon as ANY selector appears,
                # so wait for each required selector individually to ensure all are present.
                for selector in selectors_to_wait_for:
                    await page.wait_for_selector(selector, timeout=60000, state='attached')

                print("StealthScrapeTool: Required elements found. Extracting content...")
                html_content = await page.content()
                soup = BeautifulSoup(html_content, 'html.parser')
               
                # Debug print to confirm the waited-for elements made it into the scraped content
                for selector in selectors_to_wait_for:
                    if soup.select_one(selector):
                        print(f"StealthScrapeTool: '{selector}' found in scraped content.")
                    else:
                        print(f"StealthScrapeTool: '{selector}' NOT found in scraped content.")

                target_element = soup.select_one(css_element)
                if target_element:
                    # Clean the HTML: strip tags that add noise or bulk without useful text
                    print(f"Successfully found element with selector '{css_element}'. Cleaning content...")
                    for tag in target_element.find_all(["script", "style", "img", "svg", "iframe", "source"]):
                        tag.decompose()

                    # Remove style attributes from all tags
                    for tag in target_element.find_all(True):
                        if 'style' in tag.attrs:
                            del tag['style']

                    cleaned_html = target_element.prettify()
                    await browser.close()
                    return cleaned_html
                else:
                    await browser.close()
                    return f"Error: Could not find element with selector '{css_element}' on the page."
        except Exception as e:
            return f"Error during stealth web scraping: {e}"

    def _run(self, website_url: str, css_element: str = "body", wait_for_selectors: list[str] | None = None) -> str:
        # Synchronous entry point. Playwright is async-first, so this simply drives
        # the async implementation; CrewAI typically calls _arun for async tools.
        # Note: asyncio.run() raises if an event loop is already running in this thread.
        return asyncio.run(self._arun(website_url, css_element, wait_for_selectors))
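
# --- Usage sketch (illustrative only) ---
# A minimal way to exercise the tool outside of a CrewAI agent, assuming this
# module is executed directly. The URL and selector below are placeholders,
# not values taken from the project.
if __name__ == "__main__":
    tool = StealthScrapeTool()
    html = tool._run("https://example.com", css_element="body")
    print(html[:500])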