SoulofSukuna commited on
Commit
77c14b6
·
verified ·
1 Parent(s): 3cd63e0

Update helper/html_scraper.py

Browse files
Files changed (1) hide show
  1. helper/html_scraper.py +43 -19
helper/html_scraper.py CHANGED
@@ -1,19 +1,43 @@
1
- import os
2
- import asyncio
3
- from .asyncioPoliciesFix import decorator_asyncio_fix
4
- from constants.headers import HEADER_AIO
5
-
6
- HTTP_PROXY = os.environ.get("HTTP_PROXY", None)
7
-
8
-
9
- class Scraper:
10
- @decorator_asyncio_fix
11
- async def _get_html(self, session, url):
12
- try:
13
- async with session.get(url, headers=HEADER_AIO, proxy=HTTP_PROXY) as r:
14
- return await r.text()
15
- except:
16
- return None
17
-
18
- async def get_all_results(self, session, url):
19
- return await asyncio.gather(asyncio.create_task(self._get_html(session, url)))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import cloudscraper
2
+ from concurrent.futures import ThreadPoolExecutor
3
+ import os
4
+ import asyncio
5
+ from .asyncioPoliciesFix import decorator_asyncio_fix
6
+ from constants.headers import HEADER_AIO
7
+
8
+ HTTP_PROXY = os.environ.get("HTTP_PROXY", None)
9
+
10
+
11
class Scraper:
    """Fetches raw HTML pages through an aiohttp-style client session."""

    @decorator_asyncio_fix
    async def _get_html(self, session, url):
        """Return the response body text for *url*, or ``None`` on failure.

        Args:
            session: an aiohttp-style client session providing ``get()``.
            url: the absolute URL to fetch.

        Uses the module-level ``HEADER_AIO`` headers and optional
        ``HTTP_PROXY`` (read from the environment at import time).
        """
        try:
            async with session.get(url, headers=HEADER_AIO, proxy=HTTP_PROXY) as r:
                return await r.text()
        except Exception:
            # Narrowed from a bare `except:` so BaseExceptions such as
            # KeyboardInterrupt / asyncio.CancelledError propagate instead of
            # being silently converted into None. Any request/decoding error
            # still yields None, preserving the original best-effort contract.
            return None

    async def get_all_results(self, session, url):
        """Fetch *url* and return a one-element list ``[html_or_None]``.

        The gather-of-one shape is kept for backward compatibility with
        callers that index the result list.
        """
        return await asyncio.gather(asyncio.create_task(self._get_html(session, url)))
22
+
23
+
24
class AsyncCloudscraper:
    """Runs blocking cloudscraper requests without stalling the event loop.

    A single cloudscraper session is created once and reused; each fetch is
    handed to the loop's default thread-pool executor.
    """

    def __init__(self):
        # One shared scraper session, reused across all requests.
        self.scraper = cloudscraper.create_scraper()

    @decorator_asyncio_fix
    async def _get_html(self, url):
        """Await the blocking fetch of *url* on the default executor."""
        running_loop = asyncio.get_running_loop()
        return await running_loop.run_in_executor(None, self._sync_scrape, url)

    def _sync_scrape(self, url):
        """Blocking fetch of *url*; returns the body text or ``None`` on error."""
        try:
            return self.scraper.get(url, headers=HEADER_AIO).text
        except Exception as e:
            # Best-effort: report the failure and signal it with None.
            print(f"Error occurred while fetching {url}: {e}")
            return None

    async def get_all_results(self, urls):
        """Fetch every URL in *urls* concurrently; results keep input order."""
        pending = [asyncio.create_task(self._get_html(one_url)) for one_url in urls]
        return await asyncio.gather(*pending)