"""Manual checks for crawl4ai's ``BrowserConfig`` / ``CrawlerRunConfig`` objects.

Each ``test_*`` coroutine builds a configuration, crawls an example.com URL
with ``AsyncWebCrawler`` and asserts on the returned result.  Running the file
as a script executes the tests enabled in ``run_tests`` and exits with a
non-zero status if any of them failed or raised.
"""
import asyncio
import os
import sys

# Append the directory two levels above this file to sys.path before the
# crawl4ai imports below (presumably so an in-repo checkout is importable).
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(parent_dir)
__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))

from crawl4ai import AsyncWebCrawler, CacheMode
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
from crawl4ai.content_filter_strategy import PruningContentFilter
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
from crawl4ai.chunking_strategy import RegexChunking
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator


# Category 1: Browser Configuration Tests
async def test_browser_config_object():
    """Crawl with a BrowserConfig carrying explicit browser settings.

    Asserts that the crawl succeeds and that some HTML was returned.
    """
    browser_config = BrowserConfig(
        browser_type="chromium",
        headless=False,
        viewport_width=1920,
        viewport_height=1080,
        use_managed_browser=True,
        user_agent_mode="random",
        user_agent_generator_config={"device_type": "desktop", "os_type": "windows"},
    )

    async with AsyncWebCrawler(config=browser_config, verbose=True) as crawler:
        result = await crawler.arun("https://example.com", cache_mode=CacheMode.BYPASS)
        assert result.success, "Browser config crawl failed"
        assert len(result.html) > 0, "No HTML content retrieved"


async def test_browser_performance_config():
    """Crawl with a BrowserConfig focused on performance options.

    Asserts that the crawl succeeds with HTTP status 200.
    """
    browser_config = BrowserConfig(
        text_mode=True,
        light_mode=True,
        extra_args=["--disable-gpu", "--disable-software-rasterizer"],
        ignore_https_errors=True,
        java_script_enabled=False,
    )

    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun("https://example.com")
        assert result.success, "Performance optimized crawl failed"
        assert result.status_code == 200, "Unexpected status code"


# Category 2: Content Processing Tests
async def test_content_extraction_config():
    """Crawl with extraction, chunking and content-filter strategies set.

    Asserts that extracted content is present and mentions the ``title``
    field declared in the extraction schema.
    """
    crawler_config = CrawlerRunConfig(
        word_count_threshold=300,
        extraction_strategy=JsonCssExtractionStrategy(
            schema={
                "name": "article",
                "baseSelector": "div",
                "fields": [
                    {
                        "name": "title",
                        "selector": "h1",
                        "type": "text",
                    }
                ],
            }
        ),
        chunking_strategy=RegexChunking(),
        content_filter=PruningContentFilter(),
    )

    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            "https://example.com/article",
            config=crawler_config,
        )
        assert result.extracted_content is not None, "Content extraction failed"
        assert "title" in result.extracted_content, "Missing expected content field"


# Category 3: Cache and Session Management Tests
async def test_cache_and_session_management():
    """Crawl the same URL twice with a persistent context and WRITE_ONLY cache.

    Asserts that both crawls succeed and return identical HTML.
    """
    browser_config = BrowserConfig(use_persistent_context=True)
    crawler_config = CrawlerRunConfig(
        cache_mode=CacheMode.WRITE_ONLY,
        process_iframes=True,
        remove_overlay_elements=True,
    )

    async with AsyncWebCrawler(config=browser_config) as crawler:
        # First request - should write to cache
        result1 = await crawler.arun(
            "https://example.com",
            config=crawler_config,
        )

        # Second request - should use fresh fetch due to WRITE_ONLY mode
        result2 = await crawler.arun(
            "https://example.com",
            config=crawler_config,
        )

        assert result1.success and result2.success, "Cache mode crawl failed"
        assert result1.html == result2.html, "Inconsistent results between requests"


# Category 4: Media Handling Tests
async def test_media_handling_config():
    """Crawl with screenshot and PDF capture enabled.

    Creates ``~/.crawl4ai/downloads`` if needed and passes it as the browser
    downloads path.  Asserts that both a screenshot and a PDF were returned.
    """
    # Resolve the downloads directory once and make sure it exists.
    downloads_dir = os.path.expanduser("~/.crawl4ai/downloads")
    os.makedirs(downloads_dir, exist_ok=True)

    browser_config = BrowserConfig(
        viewport_width=1920,
        viewport_height=1080,
        accept_downloads=True,
        downloads_path=downloads_dir,
    )
    crawler_config = CrawlerRunConfig(
        screenshot=True,
        pdf=True,
        adjust_viewport_to_content=True,
        wait_for_images=True,
        screenshot_height_threshold=20000,
    )

    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(
            "https://example.com",
            config=crawler_config,
        )
        assert result.screenshot is not None, "Screenshot capture failed"
        assert result.pdf is not None, "PDF generation failed"


# Category 5: Anti-Bot and Site Interaction Tests
async def test_antibot_config():
    """Crawl with the anti-bot related run options switched on.

    Asserts only that the crawl succeeds.
    """
    crawler_config = CrawlerRunConfig(
        simulate_user=True,
        override_navigator=True,
        magic=True,
        wait_for="js:()=>document.querySelector('body')",
        delay_before_return_html=1.0,
        log_console=True,
        cache_mode=CacheMode.BYPASS,
    )

    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            "https://example.com",
            config=crawler_config,
        )
        assert result.success, "Anti-bot measure handling failed"


# Category 6: Parallel Processing Tests
async def test_parallel_processing():
    """Crawl several URLs in one ``arun_many`` call.

    Asserts that one result per URL is returned and that all succeeded.
    """
    crawler_config = CrawlerRunConfig(
        mean_delay=0.5,
        max_range=1.0,
        semaphore_count=5,
    )

    urls = [
        "https://example.com/1",
        "https://example.com/2",
        "https://example.com/3",
    ]

    async with AsyncWebCrawler() as crawler:
        results = await crawler.arun_many(
            urls,
            config=crawler_config,
        )
        assert len(results) == len(urls), "Not all URLs were processed"
        assert all(r.success for r in results), "Some parallel requests failed"


# Category 7: Backwards Compatibility Tests
async def test_legacy_parameter_support():
    """Crawl using only keyword arguments, with no config objects.

    Asserts that the crawl succeeds.
    """
    async with AsyncWebCrawler(
        headless=True,
        browser_type="chromium",
        viewport_width=1024,
        viewport_height=768,
    ) as crawler:
        result = await crawler.arun(
            "https://example.com",
            screenshot=True,
            word_count_threshold=200,
            bypass_cache=True,
            css_selector=".main-content",
        )
        assert result.success, "Legacy parameter support failed"


# Category 8: Mixed Configuration Tests
async def test_mixed_config_usage():
    """Crawl while mixing config objects with legacy keyword arguments.

    Asserts that the crawl succeeds.
    """
    browser_config = BrowserConfig(headless=True)
    crawler_config = CrawlerRunConfig(screenshot=True)

    async with AsyncWebCrawler(
        config=browser_config,
        verbose=True,  # legacy parameter
    ) as crawler:
        result = await crawler.arun(
            "https://example.com",
            config=crawler_config,
            cache_mode=CacheMode.BYPASS,  # legacy parameter
            css_selector="body",  # legacy parameter
        )
        assert result.success, "Mixed configuration usage failed"


if __name__ == "__main__":

    async def run_tests():
        """Run the enabled tests one by one, printing a line per outcome.

        Returns:
            The number of tests that failed an assertion or raised.
        """
        # Only the uncommented entries run; uncomment others to enable them.
        test_functions = [
            test_browser_config_object,
            # test_browser_performance_config,
            # test_content_extraction_config,
            # test_cache_and_session_management,
            # test_media_handling_config,
            # test_antibot_config,
            # test_parallel_processing,
            # test_legacy_parameter_support,
            # test_mixed_config_usage
        ]

        failures = 0
        for test in test_functions:
            print(f"\nRunning {test.__name__}...")
            try:
                await test()
            except AssertionError as e:
                failures += 1
                print(f"✗ {test.__name__} failed: {e}")
            except Exception as e:
                # Top-level boundary: report and keep going so the remaining
                # tests still run.
                failures += 1
                print(f"✗ {test.__name__} error: {e}")
            else:
                print(f"✓ {test.__name__} passed")
        return failures

    # Propagate failures through the exit status so shells/CI can detect them.
    sys.exit(1 if asyncio.run(run_tests()) else 0)