File size: 2,780 Bytes
03c0888 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 |
import os
import sys
import pytest
import asyncio
import json
# Make the directory one level above this file's folder importable.
# NOTE(review): presumably this lets the crawl4ai import below resolve from a
# source checkout rather than an installed package — confirm.
_this_file = os.path.abspath(__file__)
parent_dir = os.path.dirname(os.path.dirname(_this_file))
sys.path.append(parent_dir)
from crawl4ai.async_webcrawler import AsyncWebCrawler
@pytest.mark.asyncio
async def test_cache_url():
    """Crawl one URL twice: first bypassing the cache, then allowing it.

    Both crawls must report success, and the HTML returned by the second
    crawl must equal the HTML returned by the first.
    """
    target = "https://www.example.com"
    async with AsyncWebCrawler(verbose=True) as crawler:
        # Initial crawl with the cache bypassed; intended to store the page.
        fresh = await crawler.arun(url=target, bypass_cache=True)
        assert fresh.success

        # Follow-up crawl with the cache allowed; expected to match the first.
        cached = await crawler.arun(url=target, bypass_cache=False)
        assert cached.success
        assert cached.html == fresh.html
@pytest.mark.asyncio
async def test_bypass_cache():
    """Crawl the same URL twice with ``bypass_cache=True``.

    Both crawls must report success. The HTML of the two results is
    deliberately NOT compared: two fetches of the same page may or may not
    return identical markup, so asserting that they differ (as this test
    previously did) makes the test fail spuriously whenever the page happens
    to be served unchanged.
    """
    async with AsyncWebCrawler(verbose=True) as crawler:
        url = "https://www.python.org"

        # First run, cache bypassed.
        result1 = await crawler.arun(url=url, bypass_cache=True)
        assert result1.success

        # Second run, cache bypassed again.
        result2 = await crawler.arun(url=url, bypass_cache=True)
        assert result2.success
@pytest.mark.asyncio
async def test_cache_size():
    """Check that crawling one URL grows the reported cache size by one.

    The cache is cleared first so the result does not depend on entries left
    behind by earlier runs or other tests.
    """
    async with AsyncWebCrawler(verbose=True) as crawler:
        # Start from a known-empty cache.
        # NOTE(review): presumably re-crawling a URL that is already cached
        # replaces its entry instead of adding one, which would break the
        # "+ 1" expectation below — confirm against the cache implementation.
        await crawler.aclear_cache()
        initial_size = await crawler.aget_cache_size()

        url = "https://www.nbcnews.com/business"
        await crawler.arun(url=url, bypass_cache=True)

        new_size = await crawler.aget_cache_size()
        assert new_size == initial_size + 1
@pytest.mark.asyncio
async def test_clear_cache():
    """Check that ``aclear_cache`` leaves the cache with a reported size of 0.

    A URL is crawled first so that the cache is non-empty before clearing.
    """
    target = "https://www.example.org"
    async with AsyncWebCrawler(verbose=True) as crawler:
        await crawler.arun(url=target, bypass_cache=True)

        size_before = await crawler.aget_cache_size()
        assert size_before > 0

        await crawler.aclear_cache()

        size_after = await crawler.aget_cache_size()
        assert size_after == 0
@pytest.mark.asyncio
async def test_flush_cache():
    """Check that ``aflush_cache`` empties the cache and crawling still works.

    A URL is crawled so the cache is non-empty, the cache is flushed, the
    reported size must then be 0, and a later crawl of the same URL with the
    cache allowed must still report success.
    """
    target = "https://www.example.net"
    async with AsyncWebCrawler(verbose=True) as crawler:
        await crawler.arun(url=target, bypass_cache=True)

        size_before = await crawler.aget_cache_size()
        assert size_before > 0

        await crawler.aflush_cache()

        size_after = await crawler.aget_cache_size()
        assert size_after == 0

        # Request the previously cached URL again with the cache allowed.
        # The crawl is expected to succeed by fetching the content anew.
        refetched = await crawler.arun(url=target, bypass_cache=False)
        assert refetched.success
# Entry point for debugging: run this file's tests directly with
# `python <this file>` instead of going through the pytest CLI.
if __name__ == "__main__":
    # Hand pytest's return code to the interpreter so a failing run exits
    # non-zero when the file is executed as a script.
    sys.exit(pytest.main([__file__, "-v"]))