import os
import sys
import asyncio

import pytest
import pytest_asyncio

# Add the parent directory to the Python path so the local crawl4ai package is importable
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(parent_dir)

from crawl4ai.async_webcrawler import AsyncWebCrawler
from crawl4ai.utils import InvalidCSSSelectorError


class AsyncCrawlerWrapper:
    """Holds a shared AsyncWebCrawler instance plus its async setup/teardown."""

    def __init__(self):
        self.crawler = None

    async def setup(self):
        self.crawler = AsyncWebCrawler(verbose=True)
        await self.crawler.awarmup()

    async def cleanup(self):
        if self.crawler:
            await self.crawler.aclear_cache()
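
# A minimal standalone usage sketch (kept commented out so it does not affect
# test collection), assuming the same AsyncWebCrawler methods exercised above;
# _demo and the example URL are illustrative only:
#
#     async def _demo():
#         wrapper = AsyncCrawlerWrapper()
#         await wrapper.setup()
#         result = await wrapper.crawler.arun(url="https://example.com", bypass_cache=True)
#         print(result.success)
#         await wrapper.cleanup()
#
#     asyncio.run(_demo())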

# Function-scoped async fixture: setup and cleanup run on the event loop that
# pytest-asyncio provides for the test itself, so the crawler is never bound
# to a different loop than the one the tests run on.
@pytest_asyncio.fixture
async def crawler_wrapper():
    wrapper = AsyncCrawlerWrapper()
    await wrapper.setup()
    yield wrapper
    await wrapper.cleanup()

@pytest.mark.asyncio
async def test_network_error(crawler_wrapper):
    url = "https://www.nonexistentwebsite123456789.com"
    result = await crawler_wrapper.crawler.arun(url=url, bypass_cache=True)
    assert not result.success
    assert "Failed to crawl" in result.error_message

# @pytest.mark.asyncio
# async def test_timeout_error(crawler_wrapper):
#     # Simulating a timeout by using a very short timeout value
#     url = "https://www.nbcnews.com/business"
#     result = await crawler_wrapper.crawler.arun(url=url, bypass_cache=True, timeout=0.001)
#     assert not result.success
#     assert "timeout" in result.error_message.lower()

# @pytest.mark.asyncio
# async def test_invalid_css_selector(crawler_wrapper):
#     url = "https://www.nbcnews.com/business"
#     with pytest.raises(InvalidCSSSelectorError):
#         await crawler_wrapper.crawler.arun(url=url, bypass_cache=True, css_selector="invalid>>selector")

# @pytest.mark.asyncio
# async def test_js_execution_error(crawler_wrapper):
#     url = "https://www.nbcnews.com/business"
#     invalid_js = "This is not valid JavaScript code;"
#     result = await crawler_wrapper.crawler.arun(url=url, bypass_cache=True, js=invalid_js)
#     assert not result.success
#     assert "JavaScript" in result.error_message

# @pytest.mark.asyncio
# async def test_empty_page(crawler_wrapper):
#     # Use a URL that typically returns an empty page
#     url = "http://example.com/empty"
#     result = await crawler_wrapper.crawler.arun(url=url, bypass_cache=True)
#     assert result.success  # The crawl itself should succeed
#     assert not result.markdown.strip()  # The markdown content should be empty or just whitespace

# @pytest.mark.asyncio
# async def test_rate_limiting(crawler_wrapper):
#     # Simulate rate limiting by making multiple rapid requests
#     url = "https://www.nbcnews.com/business"
#     results = await asyncio.gather(*[crawler_wrapper.crawler.arun(url=url, bypass_cache=True) for _ in range(10)])
#     assert any(not result.success and "rate limit" in result.error_message.lower() for result in results)

# Entry point for debugging
if __name__ == "__main__":
    pytest.main([__file__, "-v"])