#!/usr/bin/env python3
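"""Asynchronous crawler for the Shopify dev docs.

Starting from a seed URL, pages are fetched concurrently with aiohttp and
links that stay under the start URL are followed. Raw responses are saved to
data/scraped/raw, a sitemap is written to data/scraped/_sitemap.json, and the
raw files are then cleaned of HTML wrapper tags into data/scraped/clean.
"""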
import asyncio
import logging
from pathlib import Path
from typing import Set, Dict, Optional
import aiohttp
from bs4 import BeautifulSoup
from yarl import URL
import json
import re
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class AsyncCrawler:
def __init__(self, start_url: str, max_concurrent: int = 100):
self.start_url = URL(start_url)
self.base_domain = self.start_url.host
self.base_path = str(self.start_url).split(self.base_domain)[1]
self.visited_urls: Set[str] = set()
self.url_queue: asyncio.Queue = asyncio.Queue()
self.semaphore = asyncio.Semaphore(max_concurrent)
        self.session: Optional[aiohttp.ClientSession] = None  # created in init_session()
self.data_dir = Path("data/scraped")
self.sitemap: Dict[str, list] = {}
    async def init_session(self):
        """Initialize the shared aiohttp session: 10s total timeout, capped connection pool, DNS caching, and a custom User-Agent."""
timeout = aiohttp.ClientTimeout(total=10)
connector = aiohttp.TCPConnector(limit=100, ttl_dns_cache=300)
self.session = aiohttp.ClientSession(
timeout=timeout,
connector=connector,
headers={"User-Agent": "ShopBot/1.0"}
)
def is_valid_url(self, url: URL) -> bool:
"""Check if URL should be crawled."""
return (
str(url).startswith(str(self.start_url))
and url.scheme in ("http", "https")
and not url.fragment
)
async def process_page(self, url: str, html: str) -> Set[str]:
"""Extract links and save raw HTML."""
        # Regex that captures Markdown link targets: absolute URLs, root-relative paths, or other relative paths
pattern = r'\[.*?\]\((https?://[^\)]+|/[^)]+|[^\)]+)\)'
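        # e.g. '[Flow docs](/docs/apps/build/flow)' captures '/docs/apps/build/flow'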
# Find all matches
markdown_links = re.findall(pattern, html)
soup = BeautifulSoup(html, 'html.parser')
anchor_links = [a['href'] for a in soup.find_all('a', href=True)]
links = markdown_links + anchor_links
absolute_links = [
str(URL(link)) if URL(link).host else str(self.start_url.join(URL(link)))
for link in links
]
        # Keep only the links that fall within the crawl scope (see is_valid_url)
valid_links = {
link for link in absolute_links
            if self.is_valid_url(URL(link))
}
# Save raw HTML
# extract just the path from the url
path = url.split(self.base_domain)[1]
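        # Flatten the path into a filename and drop the common '/docs/apps/build/' prefix,
        # e.g. '/docs/apps/build/flow.txt' -> 'flow.txt'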
raw_filepath = self.data_dir / 'raw' / path.replace("/", "_").replace("_docs_apps_build_", "")
raw_filepath.parent.mkdir(parents=True, exist_ok=True)
raw_filepath.write_text(html)
# raw_filepath.write_text(self.strip_all_html_tags_from_markdown(html))
# Update sitemap
self.sitemap[url] = list(valid_links)
return valid_links
async def fetch_page(self, url: str) -> None:
"""Fetch and process a single page."""
if url in self.visited_urls:
return
self.visited_urls.add(url)
try:
async with self.semaphore:
async with self.session.get(url) as response:
if response.status == 200:
html = await response.text()
new_urls = await self.process_page(url, html)
for new_url in new_urls:
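                            # Note: visited_urls holds the fetched URLs (with the '.txt' suffix added in run()),
                            # so this check is only a best-effort filter; run() and fetch_page() dedupe again.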
if new_url not in self.visited_urls:
await self.url_queue.put(new_url)
logger.info(f"Successfully processed: {url}")
else:
logger.warning(f"Failed to fetch {url}: {response.status}")
except Exception as e:
logger.error(f"Error processing {url}: {str(e)}")
    def strip_all_html_tags_from_markdown(self, markdown: str) -> str:
        """Strip known HTML wrapper tags from the raw Markdown and convert <script type="text/plain"> blocks into fenced code blocks."""
# Define regex patterns to remove specific HTML tags
patterns = [
r'<div class="react-code-block" data-preset="file">\n',
r'<div class="react-code-block" data-preset="basic">\n',
r'<div class="react-code-block" data-preset="terminal">\n',
r'<div class="react-code-block-preload ThemeMode-dim">\n',
r'<div class="react-code-block-preload-bar "></div>\n',
r'<div class="react-code-block-preload-bar basic-codeblock">',
r'<div class="react-code-block-preload-placeholder-container">\n',
r'<div class="react-code-block-preload-code-container">\n',
r'<div class="react-code-block-preload-codeline-number"></div>\n',
r'<div class="react-code-block-preload-codeline"></div>\n',
r'<script data-option=[^>]+ data-value=[^>]+></script>\n',
r'<div>\n',
r'</div>\n',
r'<br>\n',
r'<p>\n',
r'</p>\n',
# r'<(?!script\b)[^>]+>',
# r'</(?!script\b)[^>]+>',
r'END_RAW_MD_CONTENT',
r'RAW_MD_CONTENT',
]
# Remove all matched patterns from the markdown
for pattern in patterns:
markdown = re.sub(pattern, '', markdown)
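        # Convert <script type="text/plain" ... language="xyz"> wrappers into fenced code blocks:
        # e.g. '<script type="text/plain" language="bash">' becomes '```bash' and '</script>' becomes '```'.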
markdown = re.sub(r'<script type="text/plain"[^>]+language="([^"]+)"[^>]*>', r'```\1', markdown)
markdown = re.sub(r'</script>', '```', markdown)
        # Collapse runs of three or more newlines into a single blank line
markdown = re.sub(r'\n{3,}', '\n\n', markdown)
return markdown
def clean_raw_markdown(self):
"""Clean raw markdown files by stripping HTML tags."""
raw_dir = self.data_dir / 'raw'
for raw_file in raw_dir.glob('*.txt'):
content = raw_file.read_text()
cleaned_content = self.strip_all_html_tags_from_markdown(content)
            clean_filepath = self.data_dir / 'clean' / raw_file.name
            clean_filepath.parent.mkdir(parents=True, exist_ok=True)
            clean_filepath.write_text(cleaned_content)
async def run(self):
"""Main crawler execution."""
# Create data directory
self.data_dir.mkdir(parents=True, exist_ok=True)
await self.init_session()
await self.url_queue.put(str(self.start_url))
try:
workers = []
while True:
if self.url_queue.empty() and not workers:
break
while not self.url_queue.empty():
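                    # Fetch the '.txt' variant of each docs page; for shopify.dev this appears to return
                    # the page's raw Markdown (wrapped in RAW_MD_CONTENT markers that are stripped later).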
url = await self.url_queue.get() + '.txt'
if url not in self.visited_urls:
worker = asyncio.create_task(self.fetch_page(url))
workers.append(worker)
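                # Wait for at least one in-flight fetch to finish; any new links it queued
                # are picked up on the next pass through the outer loop.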
if workers:
done, pending = await asyncio.wait(
workers,
return_when=asyncio.FIRST_COMPLETED
)
workers = list(pending)
for task in done:
await task
finally:
# Save sitemap
sitemap_path = self.data_dir / "_sitemap.json"
sitemap_path.write_text(json.dumps(self.sitemap, indent=2))
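            # Strip HTML wrappers from every raw file on disk (not just this run's pages)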
self.clean_raw_markdown()
await self.session.close()
logger.info(f"Crawl completed. Processed {len(self.visited_urls)} pages.")
async def main():
start_url = "https://shopify.dev/docs/apps/build/flow"
crawler = AsyncCrawler(start_url)
await crawler.run()
if __name__ == "__main__":
asyncio.run(main())