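"""Example: crawl several URLs concurrently with crawl4ai's AsyncWebCrawler
and print a short summary of each result."""
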
import os
import sys
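
# Add the directory three levels above this file (the project root in the
# original example layout) to sys.path so the import below can find a local
# crawl4ai checkout when the script is run directly.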
parent_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
sys.path.append(parent_dir)

import asyncio

from crawl4ai import AsyncWebCrawler


async def main():
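    """Crawl a batch of URLs concurrently and summarize each result."""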
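    # AsyncWebCrawler is an async context manager: its resources are set up
    # on entry and released automatically on exit.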
    async with AsyncWebCrawler(verbose=True) as crawler:
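        # The batch of pages to crawl in a single call.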
        urls = [
            "https://example.com",
            "https://python.org",
            "https://github.com",
            "https://stackoverflow.com",
            "https://news.ycombinator.com",
        ]
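
        # Minimum words a block of extracted text needs before it is kept.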
        word_count_threshold = 100
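
        # Crawl all URLs concurrently; arun_many() returns one result per URL.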
        results = await crawler.arun_many(
            urls=urls,
            word_count_threshold=word_count_threshold,
            bypass_cache=True,  # fetch fresh pages rather than cached copies
            verbose=True,
        )
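
        # Print a short report for each page: title, word count, links, images.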
        for result in results:
            if result.success:
                print(f"Successfully crawled: {result.url}")
                print(f"Title: {result.metadata.get('title', 'N/A')}")
                print(f"Word count: {len(result.markdown.split())}")
                print(f"Number of links: {len(result.links.get('internal', [])) + len(result.links.get('external', []))}")
                print(f"Number of images: {len(result.media.get('images', []))}")
                print("---")
            else:
                print(f"Failed to crawl: {result.url}")
                print(f"Error: {result.error_message}")
                print("---")


if __name__ == "__main__":
    asyncio.run(main())