import asyncio
import json
import os
import time
from typing import Any, Dict, Optional

import aiohttp


class NBCNewsAPITest:
    """Thin async client for exercising the crawl API endpoints under test."""

    def __init__(self, base_url: str = "http://localhost:8000"):
        self.base_url = base_url
        self.session: Optional[aiohttp.ClientSession] = None

    async def __aenter__(self):
        self.session = aiohttp.ClientSession()
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        if self.session:
            await self.session.close()

    async def submit_crawl(self, request_data: Dict[str, Any]) -> str:
        """Submit a crawl request and return the server-assigned task ID."""
        async with self.session.post(f"{self.base_url}/crawl", json=request_data) as response:
            result = await response.json()
            return result["task_id"]

    async def get_task_status(self, task_id: str) -> Dict[str, Any]:
        async with self.session.get(f"{self.base_url}/task/{task_id}") as response:
            return await response.json()

    async def wait_for_task(self, task_id: str, timeout: int = 300,
                            poll_interval: int = 2) -> Dict[str, Any]:
        """Poll the task endpoint until the task completes or fails, or raise on timeout."""
        start_time = time.time()
        while True:
            if time.time() - start_time > timeout:
                raise TimeoutError(f"Task {task_id} did not complete within {timeout} seconds")
            status = await self.get_task_status(task_id)
            if status["status"] in ["completed", "failed"]:
                return status
            await asyncio.sleep(poll_interval)

    async def check_health(self) -> Dict[str, Any]:
        async with self.session.get(f"{self.base_url}/health") as response:
            return await response.json()
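
# A fixed poll_interval is fine for these short tests, but for long-running
# crawls an exponential backoff puts less load on the server. The helper below
# is a sketch, not part of the original suite; it assumes only the same
# /task endpoint and "status" field that NBCNewsAPITest.wait_for_task uses.
async def wait_for_task_with_backoff(api: NBCNewsAPITest, task_id: str,
                                     timeout: int = 300,
                                     max_interval: float = 30.0) -> Dict[str, Any]:
    start_time = time.time()
    interval = 1.0  # poll quickly at first, then back off
    while time.time() - start_time <= timeout:
        status = await api.get_task_status(task_id)
        if status["status"] in ["completed", "failed"]:
            return status
        await asyncio.sleep(interval)
        interval = min(interval * 2, max_interval)  # double, capped at max_interval
    raise TimeoutError(f"Task {task_id} did not complete within {timeout} seconds")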

async def test_basic_crawl():
    print("\n=== Testing Basic Crawl ===")
    async with NBCNewsAPITest() as api:
        request = {
            "urls": "https://www.nbcnews.com/business",
            "priority": 10
        }
        task_id = await api.submit_crawl(request)
        result = await api.wait_for_task(task_id)
        print(f"Basic crawl result length: {len(result['result']['markdown'])}")
        assert result["status"] == "completed"
        assert "result" in result
        assert result["result"]["success"]


async def test_js_execution():
    print("\n=== Testing JS Execution ===")
    async with NBCNewsAPITest() as api:
        request = {
            "urls": "https://www.nbcnews.com/business",
            "priority": 8,
            "js_code": [
                # Click the "Load More" button, if present, to load extra articles.
                "const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"
            ],
            "wait_for": "article.tease-card:nth-child(10)",
            "crawler_params": {
                "headless": True
            }
        }
        task_id = await api.submit_crawl(request)
        result = await api.wait_for_task(task_id)
        print(f"JS execution result length: {len(result['result']['markdown'])}")
        assert result["status"] == "completed"
        assert result["result"]["success"]


async def test_css_selector():
    print("\n=== Testing CSS Selector ===")
    async with NBCNewsAPITest() as api:
        request = {
            "urls": "https://www.nbcnews.com/business",
            "priority": 7,
            "css_selector": ".wide-tease-item__description"
        }
        task_id = await api.submit_crawl(request)
        result = await api.wait_for_task(task_id)
        print(f"CSS selector result length: {len(result['result']['markdown'])}")
        assert result["status"] == "completed"
        assert result["result"]["success"]


async def test_structured_extraction():
    print("\n=== Testing Structured Extraction ===")
    async with NBCNewsAPITest() as api:
        # JSON-CSS schema: one record per article card, with per-field selectors.
        schema = {
            "name": "NBC News Articles",
            "baseSelector": "article.tease-card",
            "fields": [
                {
                    "name": "title",
                    "selector": "h2",
                    "type": "text"
                },
                {
                    "name": "description",
                    "selector": ".tease-card__description",
                    "type": "text"
                },
                {
                    "name": "link",
                    "selector": "a",
                    "type": "attribute",
                    "attribute": "href"
                }
            ]
        }
        request = {
            "urls": "https://www.nbcnews.com/business",
            "priority": 9,
            "extraction_config": {
                "type": "json_css",
                "params": {
                    "schema": schema
                }
            }
        }
        task_id = await api.submit_crawl(request)
        result = await api.wait_for_task(task_id)
        extracted = json.loads(result["result"]["extracted_content"])
        print(f"Extracted {len(extracted)} articles")
        assert result["status"] == "completed"
        assert result["result"]["success"]
        assert len(extracted) > 0


async def test_batch_crawl():
    print("\n=== Testing Batch Crawl ===")
    async with NBCNewsAPITest() as api:
        request = {
            "urls": [
                "https://www.nbcnews.com/business",
                "https://www.nbcnews.com/business/consumer",
                "https://www.nbcnews.com/business/economy"
            ],
            "priority": 6,
            "crawler_params": {
                "headless": True
            }
        }
        task_id = await api.submit_crawl(request)
        result = await api.wait_for_task(task_id)
        print(f"Batch crawl completed, got {len(result['results'])} results")
        assert result["status"] == "completed"
        assert "results" in result
        assert len(result["results"]) == 3


async def test_llm_extraction():
    print("\n=== Testing LLM Extraction ===")
    async with NBCNewsAPITest() as api:
        # JSON Schema describing the structure the LLM should return.
        schema = {
            "type": "object",
            "properties": {
                "article_title": {
                    "type": "string",
                    "description": "The main title of the news article"
                },
                "summary": {
                    "type": "string",
                    "description": "A brief summary of the article content"
                },
                "main_topics": {
                    "type": "array",
                    "items": {"type": "string"},
                    "description": "Main topics or themes discussed in the article"
                }
            },
            "required": ["article_title", "summary", "main_topics"]
        }
        request = {
            "urls": "https://www.nbcnews.com/business",
            "priority": 8,
            "extraction_config": {
                "type": "llm",
                "params": {
                    "provider": "openai/gpt-4o-mini",
                    # Key matching the configured provider (openai/gpt-4o-mini).
                    "api_key": os.getenv("OPENAI_API_KEY"),
                    "schema": schema,
                    "extraction_type": "schema",
                    "instruction": (
                        "Extract the main article information including title, a brief "
                        "summary, and main topics discussed. Focus on the primary "
                        "business news article on the page."
                    )
                }
            },
            "crawler_params": {
                "headless": True,
                "word_count_threshold": 1
            }
        }
        task_id = await api.submit_crawl(request)
        result = await api.wait_for_task(task_id)
        if result["status"] == "completed":
            extracted = json.loads(result["result"]["extracted_content"])
            print("Extracted article analysis:")
            print(json.dumps(extracted, indent=2))
        assert result["status"] == "completed"
        assert result["result"]["success"]


async def test_screenshot():
    print("\n=== Testing Screenshot ===")
    async with NBCNewsAPITest() as api:
        request = {
            "urls": "https://www.nbcnews.com/business",
            "priority": 5,
            "screenshot": True,
            "crawler_params": {
                "headless": True
            }
        }
        task_id = await api.submit_crawl(request)
        result = await api.wait_for_task(task_id)
        print("Screenshot captured:", bool(result["result"]["screenshot"]))
        assert result["status"] == "completed"
        assert result["result"]["success"]
        assert result["result"]["screenshot"] is not None


async def test_priority_handling():
    print("\n=== Testing Priority Handling ===")
    async with NBCNewsAPITest() as api:
        # Submit the low-priority task first...
        low_priority = {
            "urls": "https://www.nbcnews.com/business",
            "priority": 1,
            "crawler_params": {"headless": True}
        }
        low_task_id = await api.submit_crawl(low_priority)

        # ...then the high-priority task, which should be scheduled ahead of it.
        high_priority = {
            "urls": "https://www.nbcnews.com/business/consumer",
            "priority": 10,
            "crawler_params": {"headless": True}
        }
        high_task_id = await api.submit_crawl(high_priority)

        # Both should complete regardless of scheduling order.
        high_result = await api.wait_for_task(high_task_id)
        low_result = await api.wait_for_task(low_task_id)
        print("Both tasks completed")
        assert high_result["status"] == "completed"
        assert low_result["status"] == "completed"
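
# Optional convenience: rather than commenting tests in and out of main(), a
# small name-to-test table lets individual tests be selected at runtime, e.g.
# `asyncio.run(run_selected(sys.argv[1:]))`. This is a sketch, not part of the
# original suite; the short names below are arbitrary labels chosen here.
ALL_TESTS = {
    "basic": test_basic_crawl,
    "js": test_js_execution,
    "css": test_css_selector,
    "structured": test_structured_extraction,
    "batch": test_batch_crawl,
    "llm": test_llm_extraction,
    "screenshot": test_screenshot,
    "priority": test_priority_handling,
}


async def run_selected(names):
    """Run the named tests in order; an unknown name raises KeyError."""
    for name in names:
        await ALL_TESTS[name]()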

async def main():
    try:
        # Start with a health check before running any tests.
        async with NBCNewsAPITest() as api:
            health = await api.check_health()
            print("Server health:", health)

        # Run all tests
        # await test_basic_crawl()
        # await test_js_execution()
        # await test_css_selector()
        # await test_structured_extraction()
        await test_llm_extraction()
        # await test_batch_crawl()
        # await test_screenshot()
        # await test_priority_handling()
    except Exception as e:
        print(f"Test failed: {str(e)}")
        raise


if __name__ == "__main__":
    asyncio.run(main())