#!/usr/bin/env python3
import asyncio
import logging
from pathlib import Path
from typing import Dict, Optional, Set
import aiohttp
from bs4 import BeautifulSoup
from yarl import URL
import json
import re

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class AsyncCrawler:
    def __init__(self, start_url: str, max_concurrent: int = 100):
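        # The crawl is scoped to URLs under the start URL's host and path
        # prefix (enforced by is_valid_url below).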
        self.start_url = URL(start_url)
        self.base_domain = self.start_url.host
        self.base_path = str(self.start_url).split(self.base_domain)[1]
        self.visited_urls: Set[str] = set()
        self.url_queue: asyncio.Queue = asyncio.Queue()
        self.semaphore = asyncio.Semaphore(max_concurrent)
        self.session: Optional[aiohttp.ClientSession] = None
        self.data_dir = Path("data/scraped")
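        # Maps each crawled URL to the in-scope links found on that page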
        self.sitemap: Dict[str, list] = {}

    async def init_session(self):
        """Initialize aiohttp session with optimal settings."""
        timeout = aiohttp.ClientTimeout(total=10)
        connector = aiohttp.TCPConnector(limit=100, ttl_dns_cache=300)
        self.session = aiohttp.ClientSession(
            timeout=timeout,
            connector=connector,
            headers={"User-Agent": "ShopBot/1.0"}
        )

    def is_valid_url(self, url: URL) -> bool:
        """Check if URL should be crawled."""
        return (
            str(url).startswith(str(self.start_url))
            and url.scheme in ("http", "https")
            and not url.fragment
        )
    
    async def process_page(self, url: str, html: str) -> Set[str]:
        """Extract links (Markdown and HTML anchors) and save the raw body."""
        # Pages are fetched as ".txt" variants that embed raw Markdown,
        # so harvest Markdown-style links in addition to <a> tags.
        pattern = r'\[.*?\]\((https?://[^\)]+|/[^)]+|[^\)]+)\)'
        markdown_links = re.findall(pattern, html)

        soup = BeautifulSoup(html, 'html.parser')
        anchor_links = [a['href'] for a in soup.find_all('a', href=True)]

        # Resolve relative links against the start URL; the docs site uses
        # root-relative paths, so the join yields the correct absolute URL.
        links = markdown_links + anchor_links
        absolute_links = [
            str(URL(link)) if URL(link).host else str(self.start_url.join(URL(link)))
            for link in links
        ]

        # Keep only in-scope URLs
        valid_links = {
            link for link in absolute_links
            if self.is_valid_url(URL(link))
        }
        
        # Save the raw page body, flattening the URL path into a filename
        # and dropping the common "/docs/apps/build/" prefix.
        path = url.split(self.base_domain)[1]

        raw_filepath = self.data_dir / 'raw' / path.replace("/", "_").replace("_docs_apps_build_", "")
        raw_filepath.parent.mkdir(parents=True, exist_ok=True)

        raw_filepath.write_text(html)
        # raw_filepath.write_text(self.strip_all_html_tags_from_markdown(html))
        
        # Update sitemap
        self.sitemap[url] = list(valid_links)
        
        return valid_links

    async def fetch_page(self, url: str) -> None:
        """Fetch and process a single page."""
        if url in self.visited_urls:
            return

        self.visited_urls.add(url)
        
        try:
            async with self.semaphore:
                async with self.session.get(url) as response:
                    if response.status == 200:
                        html = await response.text()
                        new_urls = await self.process_page(url, html)
                        
                        for new_url in new_urls:
                            if new_url not in self.visited_urls:
                                await self.url_queue.put(new_url)
                        
                        logger.info(f"Successfully processed: {url}")
                    else:
                        logger.warning(f"Failed to fetch {url}: {response.status}")
        except Exception as e:
            logger.error(f"Error processing {url}: {str(e)}")
    
    def strip_all_html_tags_from_markdown(self, markdown: str) -> str:
        """Strip site-specific HTML scaffolding and rewrite <script> code blocks as fenced Markdown."""
        # Regex patterns for the scaffolding tags to delete outright; the
        # commented-out patterns would remove every remaining non-script tag.
        patterns = [
            r'<div class="react-code-block" data-preset="file">\n',
            r'<div class="react-code-block" data-preset="basic">\n',
            r'<div class="react-code-block" data-preset="terminal">\n',
            r'<div class="react-code-block-preload ThemeMode-dim">\n',
            r'<div class="react-code-block-preload-bar "></div>\n',
            r'<div class="react-code-block-preload-bar basic-codeblock">',
            r'<div class="react-code-block-preload-placeholder-container">\n',
            r'<div class="react-code-block-preload-code-container">\n',
            r'<div class="react-code-block-preload-codeline-number"></div>\n',
            r'<div class="react-code-block-preload-codeline"></div>\n',
            r'<script data-option=[^>]+ data-value=[^>]+></script>\n',
            r'<div>\n',
            r'</div>\n',
            r'<br>\n',
            r'<p>\n',
            r'</p>\n',
            # r'<(?!script\b)[^>]+>',
            # r'</(?!script\b)[^>]+>',
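            # Delimiter markers that appear to wrap the Markdown body in the ".txt" responses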
            r'END_RAW_MD_CONTENT',
            r'RAW_MD_CONTENT',
        ]

        # Remove all matched patterns from the markdown
        for pattern in patterns:
            markdown = re.sub(pattern, '', markdown)

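        # Rewrite <script type="text/plain" language="..."> wrappers into
        # fenced Markdown code blocks, keeping the language tag.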
        markdown = re.sub(r'<script type="text/plain"[^>]+language="([^"]+)"[^>]*>', r'```\1', markdown)
        markdown = re.sub(r'</script>', '```', markdown)

        # Collapse runs of three or more newlines left by the removals above
        markdown = re.sub(r'\n{3,}', '\n\n', markdown)
        
        return markdown
        
    def clean_raw_markdown(self):
        """Clean raw Markdown files by stripping HTML scaffolding."""
        raw_dir = self.data_dir / 'raw'
        for raw_file in raw_dir.glob('*.txt'):
            content = raw_file.read_text()
            cleaned_content = self.strip_all_html_tags_from_markdown(content)

            clean_filepath = self.data_dir / 'clean' / raw_file.name
            clean_filepath.parent.mkdir(parents=True, exist_ok=True)
            clean_filepath.write_text(cleaned_content)

    async def run(self):
        """Main crawler execution."""
        # Create data directory
        self.data_dir.mkdir(parents=True, exist_ok=True)
        
        await self.init_session()
        await self.url_queue.put(str(self.start_url))
        
        try:
            workers = []
            while True:
                if self.url_queue.empty() and not workers:
                    break

                # Drain the queue, requesting the ".txt" variant of each
                # page, which serves the raw Markdown that process_page expects.
                while not self.url_queue.empty():
                    url = await self.url_queue.get() + '.txt'

                    if url not in self.visited_urls:
                        worker = asyncio.create_task(self.fetch_page(url))
                        workers.append(worker)
                
                if workers:
                    done, pending = await asyncio.wait(
                        workers, 
                        return_when=asyncio.FIRST_COMPLETED
                    )
                    workers = list(pending)
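                    # Await completed tasks to surface any unexpected exceptions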
                    for task in done:
                        await task
        finally:
            # Persist the sitemap and run the cleaning pass even if the crawl was interrupted
            sitemap_path = self.data_dir / "_sitemap.json"
            sitemap_path.write_text(json.dumps(self.sitemap, indent=2))
            self.clean_raw_markdown()
            
            await self.session.close()
            logger.info(f"Crawl completed. Processed {len(self.visited_urls)} pages.")

async def main():
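    # Crawl the "Flow" subtree of the Shopify app-build docs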
    start_url = "https://shopify.dev/docs/apps/build/flow"
    crawler = AsyncCrawler(start_url)
    await crawler.run()

if __name__ == "__main__":
    asyncio.run(main())