import os
import sys

# Make the parent and grandparent directories importable so the local
# crawl4ai sources are picked up when running this script directly.
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(parent_dir)
parent_parent_dir = os.path.dirname(parent_dir)
sys.path.append(parent_parent_dir)

__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
__data__ = os.path.join(__location__, "__data")

import asyncio
import json
from pathlib import Path

import aiohttp

from crawl4ai import AsyncWebCrawler, CacheMode
from crawl4ai.content_filter_strategy import BM25ContentFilter
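

# The examples below demonstrate browser-based file downloads, crawling local
# files and raw HTML, markdown generation with BM25 content filtering, managed
# browser profiles, and the crawl4ai REST API.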
async def download_example():
    """Example of downloading files from Python.org"""
    # Keep downloads in a dedicated folder under the user's home directory.
    downloads_path = os.path.join(Path.home(), ".crawl4ai", "downloads")
    os.makedirs(downloads_path, exist_ok=True)

    print(f"Downloads will be saved to: {downloads_path}")

    async with AsyncWebCrawler(
        accept_downloads=True,
        downloads_path=downloads_path,
        verbose=True
    ) as crawler:
        result = await crawler.arun(
            url="https://www.python.org/downloads/",
            js_code="""
                // Find and click the first Windows installer link
                const downloadLink = document.querySelector('a[href$=".exe"]');
                if (downloadLink) {
                    console.log('Found download link:', downloadLink.href);
                    downloadLink.click();
                } else {
                    console.log('No .exe download link found');
                }
            """,
            delay_before_return_html=1,  # give the click a moment to start the download
            cache_mode=CacheMode.BYPASS
        )

        if result.downloaded_files:
            print("\nDownload successful!")
            print("Downloaded files:")
            for file_path in result.downloaded_files:
                print(f"- {file_path}")
                print(f"  File size: {os.path.getsize(file_path) / (1024 * 1024):.2f} MB")
        else:
            print("\nNo files were downloaded")
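

# arun() also accepts file:// URLs for local files and a "raw:" prefix for
# in-memory HTML strings, as shown below.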
async def local_and_raw_html_example():
    """Example of processing local files and raw HTML"""
    # Write a small HTML file we can crawl via a file:// URL.
    os.makedirs(__data__, exist_ok=True)
    sample_file = os.path.join(__data__, "sample.html")
    with open(sample_file, "w") as f:
        f.write("""
        <html><body>
            <h1>Test Content</h1>
            <p>This is a test paragraph.</p>
        </body></html>
        """)

    async with AsyncWebCrawler(verbose=True) as crawler:
        # Crawl the local file.
        local_result = await crawler.arun(
            url=f"file://{os.path.abspath(sample_file)}"
        )

        # Crawl an in-memory HTML string.
        raw_html = """
        <html><body>
            <h1>Raw HTML Test</h1>
            <p>This is a test of raw HTML processing.</p>
        </body></html>
        """
        raw_result = await crawler.arun(
            url=f"raw:{raw_html}"
        )

    # Clean up the temporary file before printing the results.
    os.remove(sample_file)

    print("Local file content:", local_result.markdown)
    print("\nRaw HTML content:", raw_result.markdown)
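

# result.markdown_v2 bundles several markdown variants: raw_markdown,
# markdown_with_citations, references_markdown, and (when a content filter is
# used) fit_markdown.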
async def markdown_generation_example():
    """Example of enhanced markdown generation with citations and LLM-friendly features"""
    async with AsyncWebCrawler(verbose=True) as crawler:
        # Optional content filter: BM25 scores page sections and keeps the most relevant ones.
        content_filter = BM25ContentFilter(
            bm25_threshold=1.0
        )

        result = await crawler.arun(
            url="https://en.wikipedia.org/wiki/Apple",
            css_selector="main div#bodyContent",
            content_filter=content_filter,
            cache_mode=CacheMode.BYPASS
        )

        print("\nMarkdown Generation Results:")
        print(f"1. Original markdown length: {len(result.markdown)}")
        print("2. New markdown versions (markdown_v2):")
        print(f"   - Raw markdown length: {len(result.markdown_v2.raw_markdown)}")
        print(f"   - Citations markdown length: {len(result.markdown_v2.markdown_with_citations)}")
        print(f"   - References section length: {len(result.markdown_v2.references_markdown)}")
        if result.markdown_v2.fit_markdown:
            print(f"   - Filtered markdown length: {len(result.markdown_v2.fit_markdown)}")

        # Save each markdown variant for inspection.
        output_dir = os.path.join(__data__, "markdown_examples")
        os.makedirs(output_dir, exist_ok=True)

        with open(os.path.join(output_dir, "1_raw_markdown.md"), "w") as f:
            f.write(result.markdown_v2.raw_markdown)

        with open(os.path.join(output_dir, "2_citations_markdown.md"), "w") as f:
            f.write(result.markdown_v2.markdown_with_citations)

        with open(os.path.join(output_dir, "3_references.md"), "w") as f:
            f.write(result.markdown_v2.references_markdown)

        if result.markdown_v2.fit_markdown:
            with open(os.path.join(output_dir, "4_filtered_markdown.md"), "w") as f:
                f.write(result.markdown_v2.fit_markdown)

        print(f"\nMarkdown examples saved to: {output_dir}")

        print("\nSample of markdown with citations:")
        print(result.markdown_v2.markdown_with_citations[:500] + "...\n")
        print("Sample of references:")
        print("\n".join(result.markdown_v2.references_markdown.split("\n")[:10]) + "...")
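

# The managed browser below runs non-headless and stores its profile in
# user_data_dir, so session state such as cookies and logins persists between
# crawls and between runs.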
async def browser_management_example():
    """Example of using enhanced browser management features"""
    user_data_dir = os.path.join(Path.home(), ".crawl4ai", "browser_profile")
    os.makedirs(user_data_dir, exist_ok=True)

    print(f"Browser profile will be saved to: {user_data_dir}")

    async with AsyncWebCrawler(
        use_managed_browser=True,
        user_data_dir=user_data_dir,
        headless=False,
        verbose=True
    ) as crawler:
        # First crawl in the managed browser session.
        result = await crawler.arun(
            url="https://crawl4ai.com",
            cache_mode=CacheMode.BYPASS
        )

        # Second crawl reuses the same browser profile.
        result = await crawler.arun(
            url="https://github.com/trending",
            cache_mode=CacheMode.BYPASS
        )

        print("\nBrowser session result:", result.success)
        if result.success:
            print("Page title:", result.metadata.get("title", "No title found"))
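

# NOTE: api_example() below assumes a crawl4ai API server is reachable on
# localhost:11235 (for example, the project's Docker deployment). Set the
# CRAWL4AI_API_TOKEN environment variable if your server requires a real token;
# otherwise the placeholder value is used and the requests may be rejected.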
async def api_example():
    """Example of using the new API endpoints"""
    api_token = os.getenv("CRAWL4AI_API_TOKEN") or "test_api_code"
    headers = {"Authorization": f"Bearer {api_token}"}
    async with aiohttp.ClientSession() as session:
        # Crawl request with a CSS-based extraction schema for Hacker News articles.
        crawl_request = {
            "urls": ["https://news.ycombinator.com"],
            "extraction_config": {
                "type": "json_css",
                "params": {
                    "schema": {
                        "name": "Hacker News Articles",
                        "baseSelector": ".athing",
                        "fields": [
                            {
                                "name": "title",
                                "selector": ".title a",
                                "type": "text"
                            },
                            {
                                "name": "score",
                                "selector": ".score",
                                "type": "text"
                            },
                            {
                                "name": "url",
                                "selector": ".title a",
                                "type": "attribute",
                                "attribute": "href"
                            }
                        ]
                    }
                }
            },
            "crawler_params": {
                "headless": True
            },
            "cache_mode": "bypass"
        }

        # Submit the crawl task.
        async with session.post(
            "http://localhost:11235/crawl",
            json=crawl_request,
            headers=headers
        ) as response:
            task_data = await response.json()
            task_id = task_data["task_id"]

        # Poll the task endpoint until the crawl finishes.
        while True:
            async with session.get(
                f"http://localhost:11235/task/{task_id}",
                headers=headers
            ) as status_response:
                result = await status_response.json()
                print(f"Task status: {result['status']}")

                if result["status"] == "completed":
                    print("Task completed!")
                    print("Results:")
                    news = json.loads(result["results"][0]["extracted_content"])
                    print(json.dumps(news[:4], indent=2))
                    break
                else:
                    await asyncio.sleep(1)


async def main():
    # Uncomment any of the additional examples to try them out:
    # await download_example()
    # await local_and_raw_html_example()
    # await markdown_generation_example()
    await browser_management_example()
    await api_example()


if __name__ == "__main__":
    asyncio.run(main())