#!/usr/bin/env python3
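"""Async crawler for the Shopify developer docs.

Starting from a single docs URL, it fetches pages concurrently with aiohttp,
follows links that stay under the start URL, saves each raw page body under
data/scraped/raw/, strips the HTML scaffolding down to markdown files under
data/scraped/clean/, and writes the discovered link graph to _sitemap.json.
"""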
import asyncio
import json
import logging
import re
from pathlib import Path
from typing import Dict, Optional, Set

import aiohttp
from bs4 import BeautifulSoup
from yarl import URL

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class AsyncCrawler:
    def __init__(self, start_url: str, max_concurrent: int = 100):
        self.start_url = URL(start_url)
        self.base_domain = self.start_url.host
        self.base_path = str(self.start_url).split(self.base_domain)[1]
        self.visited_urls: Set[str] = set()
        self.url_queue: asyncio.Queue = asyncio.Queue()
        self.semaphore = asyncio.Semaphore(max_concurrent)
        self.session: Optional[aiohttp.ClientSession] = None
        self.data_dir = Path("data/scraped")
        self.sitemap: Dict[str, list] = {}

    async def init_session(self):
        """Initialize the aiohttp session with a timeout, pooled connections, and DNS caching."""
        timeout = aiohttp.ClientTimeout(total=10)
        connector = aiohttp.TCPConnector(limit=100, ttl_dns_cache=300)
        self.session = aiohttp.ClientSession(
            timeout=timeout,
            connector=connector,
            headers={"User-Agent": "ShopBot/1.0"},
        )

    def is_valid_url(self, url: URL) -> bool:
        """Check if URL should be crawled."""
        return (
            str(url).startswith(str(self.start_url))
            and url.scheme in ("http", "https")
            and not url.fragment
        )

    async def process_page(self, url: str, html: str) -> Set[str]:
        """Extract links from the page and save its raw body."""
        # Markdown-style links: [text](target)
        pattern = r'\[.*?\]\((https?://[^\)]+|/[^)]+|[^\)]+)\)'
        markdown_links = re.findall(pattern, html)
        # HTML anchor links
        soup = BeautifulSoup(html, 'html.parser')
        anchor_links = [a['href'] for a in soup.find_all('a', href=True)]
        links = markdown_links + anchor_links
        # Resolve relative links against the start URL
        absolute_links = [
            str(URL(link)) if URL(link).host else str(self.start_url.join(URL(link)))
            for link in links
        ]
        # Keep only URLs inside the docs section being crawled
        valid_links = {
            link for link in absolute_links
            if self.is_valid_url(URL(link))
        }
        # Save the raw body, using the flattened URL path as the file name
        path = url.split(self.base_domain)[1]
        raw_filepath = self.data_dir / 'raw' / path.replace("/", "_").replace("_docs_apps_build_", "")
        raw_filepath.parent.mkdir(parents=True, exist_ok=True)
        raw_filepath.write_text(html)
        # raw_filepath.write_text(self.strip_all_html_tags_from_markdown(html))
        # Update sitemap
        self.sitemap[url] = list(valid_links)
        return valid_links

    async def fetch_page(self, url: str) -> None:
        """Fetch and process a single page."""
        if url in self.visited_urls:
            return
        self.visited_urls.add(url)
        try:
            async with self.semaphore:
                async with self.session.get(url) as response:
                    if response.status == 200:
                        html = await response.text()
                        new_urls = await self.process_page(url, html)
                        for new_url in new_urls:
                            if new_url not in self.visited_urls:
                                await self.url_queue.put(new_url)
                        logger.info(f"Successfully processed: {url}")
                    else:
                        logger.warning(f"Failed to fetch {url}: {response.status}")
        except Exception as e:
            logger.error(f"Error processing {url}: {str(e)}")

    def strip_all_html_tags_from_markdown(self, markdown: str) -> str:
        """Strip the known HTML wrapper tags from a scraped page and convert
        <script> code containers into fenced markdown code blocks."""
        # Specific tags and markers to drop outright
        patterns = [
            r'<div class="react-code-block" data-preset="file">\n',
            r'<div class="react-code-block" data-preset="basic">\n',
            r'<div class="react-code-block" data-preset="terminal">\n',
            r'<div class="react-code-block-preload ThemeMode-dim">\n',
            r'<div class="react-code-block-preload-bar "></div>\n',
            r'<div class="react-code-block-preload-bar basic-codeblock">',
            r'<div class="react-code-block-preload-placeholder-container">\n',
            r'<div class="react-code-block-preload-code-container">\n',
            r'<div class="react-code-block-preload-codeline-number"></div>\n',
            r'<div class="react-code-block-preload-codeline"></div>\n',
            r'<script data-option=[^>]+ data-value=[^>]+></script>\n',
            r'<div>\n',
            r'</div>\n',
            r'<br>\n',
            r'<p>\n',
            r'</p>\n',
            # r'<(?!script\b)[^>]+>',
            # r'</(?!script\b)[^>]+>',
            r'END_RAW_MD_CONTENT',
            r'RAW_MD_CONTENT',
        ]
        # Remove all matched patterns from the markdown
        for pattern in patterns:
            markdown = re.sub(pattern, '', markdown)
        # Turn <script type="text/plain" ... language="xyz"> ... </script>
        # into fenced code blocks tagged with the language
        markdown = re.sub(r'<script type="text/plain"[^>]+language="([^"]+)"[^>]*>', r'```\1', markdown)
        markdown = re.sub(r'</script>', '```', markdown)
        # Collapse runs of 3+ newlines down to 2
        markdown = re.sub(r'\n{3,}', '\n\n', markdown)
        return markdown

    def clean_raw_markdown(self):
        """Clean raw markdown files by stripping HTML tags."""
        raw_dir = self.data_dir / 'raw'
        for raw_file in raw_dir.glob('*.txt'):
            content = raw_file.read_text()
            cleaned_content = self.strip_all_html_tags_from_markdown(content)
            clean_filepath = self.data_dir / 'clean' / raw_file.name
            clean_filepath.parent.mkdir(parents=True, exist_ok=True)
            clean_filepath.write_text(cleaned_content)

    async def run(self):
        """Main crawler execution."""
        # Create data directory
        self.data_dir.mkdir(parents=True, exist_ok=True)
        await self.init_session()
        await self.url_queue.put(str(self.start_url))
        try:
            workers = []
            while True:
                if self.url_queue.empty() and not workers:
                    break
                while not self.url_queue.empty():
                    # Fetch the '.txt' variant of each queued page, which returns the raw
                    # markdown source (see the RAW_MD_CONTENT markers handled in
                    # strip_all_html_tags_from_markdown).
                    url = await self.url_queue.get() + '.txt'
                    if url not in self.visited_urls:
                        worker = asyncio.create_task(self.fetch_page(url))
                        workers.append(worker)
                if workers:
                    # Wait for at least one in-flight fetch to finish, then loop back
                    # to drain any URLs it discovered.
                    done, pending = await asyncio.wait(
                        workers,
                        return_when=asyncio.FIRST_COMPLETED
                    )
                    workers = list(pending)
                    for task in done:
                        await task
        finally:
            # Save sitemap
            sitemap_path = self.data_dir / "_sitemap.json"
            sitemap_path.write_text(json.dumps(self.sitemap, indent=2))
            self.clean_raw_markdown()
            await self.session.close()
            logger.info(f"Crawl completed. Processed {len(self.visited_urls)} pages.")


async def main():
    start_url = "https://shopify.dev/docs/apps/build/flow"
    crawler = AsyncCrawler(start_url)
    await crawler.run()


if __name__ == "__main__":
    asyncio.run(main())
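
# Expected output layout after a run (paths taken from the code above):
#   data/scraped/raw/<flattened-page-path>.txt    raw page bodies as fetched
#   data/scraped/clean/<flattened-page-path>.txt  markdown with HTML scaffolding stripped
#   data/scraped/_sitemap.json                    mapping of page URL -> outbound docs links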