|
import os |
|
import sys |
|
import asyncio |
|
import shutil |
|
from typing import List |
|
import tempfile |
|
import time |
|
|
|
|
|
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) |
|
sys.path.append(parent_dir) |
|
|
|
from crawl4ai.async_webcrawler import AsyncWebCrawler |
|
|
|
class TestDownloads: |
|
def __init__(self): |
|
self.temp_dir = tempfile.mkdtemp(prefix="crawl4ai_test_") |
|
self.download_dir = os.path.join(self.temp_dir, "downloads") |
|
os.makedirs(self.download_dir, exist_ok=True) |
|
self.results: List[str] = [] |
|
|
|
def cleanup(self): |
|
shutil.rmtree(self.temp_dir) |
|
|
|
def log_result(self, test_name: str, success: bool, message: str = ""): |
|
result = f"{'β
' if success else 'β'} {test_name}: {message}" |
|
self.results.append(result) |
|
print(result) |
|
|
|
async def test_basic_download(self): |
|
"""Test basic file download functionality""" |
|
try: |
|
async with AsyncWebCrawler( |
|
accept_downloads=True, |
|
downloads_path=self.download_dir, |
|
verbose=True |
|
) as crawler: |
|
|
|
result = await crawler.arun( |
|
url="https://www.python.org/downloads/", |
|
js_code=""" |
|
// Click first download link |
|
const downloadLink = document.querySelector('a[href$=".exe"]'); |
|
if (downloadLink) downloadLink.click(); |
|
""" |
|
) |
|
|
|
success = result.downloaded_files is not None and len(result.downloaded_files) > 0 |
|
self.log_result( |
|
"Basic Download", |
|
success, |
|
f"Downloaded {len(result.downloaded_files or [])} files" if success else "No files downloaded" |
|
) |
|
except Exception as e: |
|
self.log_result("Basic Download", False, str(e)) |
|
|
|
async def test_persistent_context_download(self): |
|
"""Test downloads with persistent context""" |
|
try: |
|
user_data_dir = os.path.join(self.temp_dir, "user_data") |
|
os.makedirs(user_data_dir, exist_ok=True) |
|
|
|
async with AsyncWebCrawler( |
|
accept_downloads=True, |
|
downloads_path=self.download_dir, |
|
use_persistent_context=True, |
|
user_data_dir=user_data_dir, |
|
verbose=True |
|
) as crawler: |
|
result = await crawler.arun( |
|
url="https://www.python.org/downloads/", |
|
js_code=""" |
|
const downloadLink = document.querySelector('a[href$=".exe"]'); |
|
if (downloadLink) downloadLink.click(); |
|
""" |
|
) |
|
|
|
success = result.downloaded_files is not None and len(result.downloaded_files) > 0 |
|
self.log_result( |
|
"Persistent Context Download", |
|
success, |
|
f"Downloaded {len(result.downloaded_files or [])} files" if success else "No files downloaded" |
|
) |
|
except Exception as e: |
|
self.log_result("Persistent Context Download", False, str(e)) |
|
|
|
async def test_multiple_downloads(self): |
|
"""Test multiple simultaneous downloads""" |
|
try: |
|
async with AsyncWebCrawler( |
|
accept_downloads=True, |
|
downloads_path=self.download_dir, |
|
verbose=True |
|
) as crawler: |
|
result = await crawler.arun( |
|
url="https://www.python.org/downloads/", |
|
js_code=""" |
|
// Click multiple download links |
|
const downloadLinks = document.querySelectorAll('a[href$=".exe"]'); |
|
downloadLinks.forEach(link => link.click()); |
|
""" |
|
) |
|
|
|
success = result.downloaded_files is not None and len(result.downloaded_files) > 1 |
|
self.log_result( |
|
"Multiple Downloads", |
|
success, |
|
f"Downloaded {len(result.downloaded_files or [])} files" if success else "Not enough files downloaded" |
|
) |
|
except Exception as e: |
|
self.log_result("Multiple Downloads", False, str(e)) |
|
|
|
async def test_different_browsers(self): |
|
"""Test downloads across different browser types""" |
|
browsers = ["chromium", "firefox", "webkit"] |
|
|
|
for browser_type in browsers: |
|
try: |
|
async with AsyncWebCrawler( |
|
accept_downloads=True, |
|
downloads_path=self.download_dir, |
|
browser_type=browser_type, |
|
verbose=True |
|
) as crawler: |
|
result = await crawler.arun( |
|
url="https://www.python.org/downloads/", |
|
js_code=""" |
|
const downloadLink = document.querySelector('a[href$=".exe"]'); |
|
if (downloadLink) downloadLink.click(); |
|
""" |
|
) |
|
|
|
success = result.downloaded_files is not None and len(result.downloaded_files) > 0 |
|
self.log_result( |
|
f"{browser_type.title()} Download", |
|
success, |
|
f"Downloaded {len(result.downloaded_files or [])} files" if success else "No files downloaded" |
|
) |
|
except Exception as e: |
|
self.log_result(f"{browser_type.title()} Download", False, str(e)) |
|
|
|
async def test_edge_cases(self): |
|
"""Test various edge cases""" |
|
|
|
|
|
try: |
|
async with AsyncWebCrawler( |
|
accept_downloads=True, |
|
verbose=True |
|
) as crawler: |
|
result = await crawler.arun( |
|
url="https://www.python.org/downloads/", |
|
js_code="document.querySelector('a[href$=\".exe\"]').click()" |
|
) |
|
self.log_result( |
|
"Default Download Path", |
|
True, |
|
f"Downloaded to default path: {result.downloaded_files[0] if result.downloaded_files else 'None'}" |
|
) |
|
except Exception as e: |
|
self.log_result("Default Download Path", False, str(e)) |
|
|
|
|
|
try: |
|
async with AsyncWebCrawler( |
|
accept_downloads=True, |
|
downloads_path="/invalid/path/that/doesnt/exist", |
|
verbose=True |
|
) as crawler: |
|
result = await crawler.arun( |
|
url="https://www.python.org/downloads/", |
|
js_code="document.querySelector('a[href$=\".exe\"]').click()" |
|
) |
|
self.log_result("Invalid Download Path", False, "Should have raised an error") |
|
except Exception as e: |
|
self.log_result("Invalid Download Path", True, "Correctly handled invalid path") |
|
|
|
|
|
try: |
|
async with AsyncWebCrawler( |
|
accept_downloads=False, |
|
verbose=True |
|
) as crawler: |
|
result = await crawler.arun( |
|
url="https://www.python.org/downloads/", |
|
js_code="document.querySelector('a[href$=\".exe\"]').click()" |
|
) |
|
success = result.downloaded_files is None |
|
self.log_result( |
|
"Disabled Downloads", |
|
success, |
|
"Correctly ignored downloads" if success else "Unexpectedly downloaded files" |
|
) |
|
except Exception as e: |
|
self.log_result("Disabled Downloads", False, str(e)) |
|
|
|
async def run_all_tests(self): |
|
"""Run all test cases""" |
|
print("\nπ§ͺ Running Download Tests...\n") |
|
|
|
test_methods = [ |
|
self.test_basic_download, |
|
self.test_persistent_context_download, |
|
self.test_multiple_downloads, |
|
self.test_different_browsers, |
|
self.test_edge_cases |
|
] |
|
|
|
for test in test_methods: |
|
print(f"\nπ Running {test.__doc__}...") |
|
await test() |
|
await asyncio.sleep(2) |
|
|
|
print("\nπ Test Results Summary:") |
|
for result in self.results: |
|
print(result) |
|
|
|
successes = len([r for r in self.results if 'β
' in r]) |
|
total = len(self.results) |
|
print(f"\nTotal: {successes}/{total} tests passed") |
|
|
|
self.cleanup() |
|
|
|
async def main(): |
|
tester = TestDownloads() |
|
await tester.run_all_tests() |
|
|
|
if __name__ == "__main__": |
|
asyncio.run(main()) |