Update helper/html_scraper.py
helper/html_scraper.py  +43 -19  CHANGED
@@ -1,19 +1,43 @@
-(19 lines removed; beyond two leading import statements, the old file's contents were not captured by this page)
+import cloudscraper
+from concurrent.futures import ThreadPoolExecutor
+import os
+import asyncio
+from .asyncioPoliciesFix import decorator_asyncio_fix
+from constants.headers import HEADER_AIO
+
+HTTP_PROXY = os.environ.get("HTTP_PROXY", None)
+
+
+class Scraper:
+    @decorator_asyncio_fix
+    async def _get_html(self, session, url):
+        try:
+            async with session.get(url, headers=HEADER_AIO, proxy=HTTP_PROXY) as r:
+                return await r.text()
+        except:
+            return None
+
+    async def get_all_results(self, session, url):
+        return await asyncio.gather(asyncio.create_task(self._get_html(session, url)))
+
+
+class AsyncCloudscraper:
+    def __init__(self):
+        self.scraper = cloudscraper.create_scraper()
+
+    @decorator_asyncio_fix
+    async def _get_html(self, url):
+        loop = asyncio.get_running_loop()
+        return await loop.run_in_executor(None, self._sync_scrape, url)
+
+    def _sync_scrape(self, url):
+        try:
+            response = self.scraper.get(url, headers=HEADER_AIO)
+            return response.text
+        except Exception as e:
+            print(f"Error occurred while fetching {url}: {e}")
+            return None
+
+    async def get_all_results(self, urls):
+        tasks = [asyncio.create_task(self._get_html(url)) for url in urls]
+        return await asyncio.gather(*tasks)
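
Review context, not part of the diff: Scraper._get_html uses session.get(...) as an async context manager, which matches aiohttp's ClientSession API, while AsyncCloudscraper hands each blocking cloudscraper call to the event loop's default thread pool via run_in_executor(None, ...); that default executor is also why the ThreadPoolExecutor import is otherwise unused. Note that the bare except: in Scraper._get_html swallows asyncio.CancelledError as well; except Exception: would be the safer idiom. A minimal usage sketch, assuming aiohttp supplies the session, the helper package and its constants.headers dependency are importable, and the URLs are placeholders:

    import asyncio
    import aiohttp  # assumption: the session passed to Scraper is an aiohttp.ClientSession

    from helper.html_scraper import Scraper, AsyncCloudscraper

    async def main():
        # One page through the aiohttp-backed Scraper; gather() returns a one-item list.
        async with aiohttp.ClientSession() as session:
            (html,) = await Scraper().get_all_results(session, "https://example.com")

        # Several Cloudflare-protected pages via cloudscraper, kept off the event loop.
        pages = await AsyncCloudscraper().get_all_results(
            ["https://example.com/a", "https://example.com/b"]
        )
        print(html is not None, [p is not None for p in pages])

    asyncio.run(main())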