import os, sys

# Make the crawl4ai package importable when running this script from the tests directory
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(parent_dir)

import asyncio
from crawl4ai import AsyncWebCrawler, CacheMode
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig      
from crawl4ai.content_filter_strategy import PruningContentFilter
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
from crawl4ai.chunking_strategy import RegexChunking
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator

# Category 1: Browser Configuration Tests
async def test_browser_config_object():
    """Test the new BrowserConfig object with various browser settings"""
    browser_config = BrowserConfig(
        browser_type="chromium",
        headless=False,
        viewport_width=1920,
        viewport_height=1080,
        use_managed_browser=True,
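        # user_agent_mode="random" asks the library's UA generator to pick a
        # user agent matching the generator config below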
        user_agent_mode="random",
        user_agent_generator_config={"device_type": "desktop", "os_type": "windows"}
    )
    
    async with AsyncWebCrawler(config=browser_config, verbose=True) as crawler:
        result = await crawler.arun('https://example.com', cache_mode=CacheMode.BYPASS)
        assert result.success, "Browser config crawl failed"
        assert len(result.html) > 0, "No HTML content retrieved"

async def test_browser_performance_config():
    """Test browser configurations focused on performance"""
    browser_config = BrowserConfig(
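        # text_mode skips loading images to speed up text-only crawls;
        # light_mode disables some background browser features for performance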
        text_mode=True,
        light_mode=True,
        extra_args=['--disable-gpu', '--disable-software-rasterizer'],
        ignore_https_errors=True,
        java_script_enabled=False
    )
    
    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun('https://example.com')
        assert result.success, "Performance optimized crawl failed"
        assert result.status_code == 200, "Unexpected status code"

# Category 2: Content Processing Tests
async def test_content_extraction_config():
    """Test content extraction with various strategies"""
    crawler_config = CrawlerRunConfig(
        word_count_threshold=300,
        extraction_strategy=JsonCssExtractionStrategy(
            schema={
                "name": "article",
                "baseSelector": "div",
                "fields": [{
                    "name": "title",
                    "selector": "h1",
                    "type": "text"
                }]
            }
        ),
        chunking_strategy=RegexChunking(),
        content_filter=PruningContentFilter()
    )
    
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            'https://example.com/article',
            config=crawler_config
        )
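        # extracted_content is typically a JSON string, so the membership check
        # below is a substring test; json.loads() would give structured access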
        assert result.extracted_content is not None, "Content extraction failed"
        assert 'title' in result.extracted_content, "Missing expected content field"

# Category 3: Cache and Session Management Tests
async def test_cache_and_session_management():
    """Test different cache modes and session handling"""
    browser_config = BrowserConfig(use_persistent_context=True)
    crawler_config = CrawlerRunConfig(
        cache_mode=CacheMode.WRITE_ONLY,
        process_iframes=True,
        remove_overlay_elements=True
    )
    
    async with AsyncWebCrawler(config=browser_config) as crawler:
        # First request - should write to cache
        result1 = await crawler.arun(
            'https://example.com',
            config=crawler_config
        )
        
        # Second request - also fetches fresh, because WRITE_ONLY skips cache reads
        result2 = await crawler.arun(
            'https://example.com',
            config=crawler_config
        )
        
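        # The html equality check below assumes example.com serves identical
        # content on back-to-back fetches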
        assert result1.success and result2.success, "Cache mode crawl failed"
        assert result1.html == result2.html, "Inconsistent results between requests"

# Category 4: Media Handling Tests
async def test_media_handling_config():
    """Test configurations related to media handling"""
    # Ensure the downloads directory ~/.crawl4ai/downloads exists before enabling downloads
    os.makedirs(os.path.expanduser("~/.crawl4ai/downloads"), exist_ok=True)
    browser_config = BrowserConfig(
        viewport_width=1920,
        viewport_height=1080,
        accept_downloads=True,
        downloads_path=os.path.expanduser("~/.crawl4ai/downloads")
    )
    crawler_config = CrawlerRunConfig(
        screenshot=True,
        pdf=True,
        adjust_viewport_to_content=True,
        wait_for_images=True,
        screenshot_height_threshold=20000
    )
    
    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(
            'https://example.com',
            config=crawler_config
        )
        assert result.screenshot is not None, "Screenshot capture failed"
        assert result.pdf is not None, "PDF generation failed"

# Category 5: Anti-Bot and Site Interaction Tests
async def test_antibot_config():
    """Test configurations for handling anti-bot measures"""
    crawler_config = CrawlerRunConfig(
        simulate_user=True,
        override_navigator=True,
        magic=True,
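        # the "js:" prefix makes wait_for poll a JS predicate until it returns true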
        wait_for="js:()=>document.querySelector('body')",
        delay_before_return_html=1.0,
        log_console=True,
        cache_mode=CacheMode.BYPASS
    )
    
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            'https://example.com',
            config=crawler_config
        )
        assert result.success, "Anti-bot measure handling failed"

# Category 6: Parallel Processing Tests
async def test_parallel_processing():
    """Test parallel processing capabilities"""
    crawler_config = CrawlerRunConfig(
        mean_delay=0.5,
        max_range=1.0,
        semaphore_count=5
    )
    
    urls = [
        'https://example.com/1',
        'https://example.com/2',
        'https://example.com/3'
    ]
    
    async with AsyncWebCrawler() as crawler:
        results = await crawler.arun_many(
            urls,
            config=crawler_config
        )
        assert len(results) == len(urls), "Not all URLs were processed"
        assert all(r.success for r in results), "Some parallel requests failed"

# Category 7: Backwards Compatibility Tests
async def test_legacy_parameter_support():
    """Test that legacy parameters still work"""
    async with AsyncWebCrawler(
        headless=True,
        browser_type="chromium",
        viewport_width=1024,
        viewport_height=768
    ) as crawler:
        result = await crawler.arun(
            'https://example.com',
            screenshot=True,
            word_count_threshold=200,
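            # legacy flag, equivalent to cache_mode=CacheMode.BYPASS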
            bypass_cache=True,
            css_selector=".main-content"
        )
        assert result.success, "Legacy parameter support failed"

# Category 8: Mixed Configuration Tests
async def test_mixed_config_usage():
    """Test mixing new config objects with legacy parameters"""
    browser_config = BrowserConfig(headless=True)
    crawler_config = CrawlerRunConfig(screenshot=True)
    
    async with AsyncWebCrawler(
        config=browser_config,
        verbose=True  # legacy parameter
    ) as crawler:
        result = await crawler.arun(
            'https://example.com',
            config=crawler_config,
            cache_mode=CacheMode.BYPASS,  # legacy parameter
            css_selector="body"  # legacy parameter
        )
        assert result.success, "Mixed configuration usage failed"

if __name__ == "__main__":
    async def run_tests():
        test_functions = [
            test_browser_config_object,
            test_browser_performance_config,
            test_content_extraction_config,
            test_cache_and_session_management,
            test_media_handling_config,
            test_antibot_config,
            test_parallel_processing,
            test_legacy_parameter_support,
            test_mixed_config_usage
        ]
        
        for test in test_functions:
            print(f"\nRunning {test.__name__}...")
            try:
                await test()
                print(f"✓ {test.__name__} passed")
            except AssertionError as e:
                print(f"✗ {test.__name__} failed: {str(e)}")
            except Exception as e:
                print(f"✗ {test.__name__} error: {str(e)}")
    
    asyncio.run(run_tests())