limitedonly41 committed on
Commit
4429406
·
verified ·
1 Parent(s): 84e17b0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +19 -2
app.py CHANGED
@@ -12,7 +12,7 @@ import asyncio
12
  from curl_cffi.requests import AsyncSession
13
  from tqdm.asyncio import tqdm
14
  from fake_headers import Headers
15
-
16
 
17
  # Limit the number of concurrent workers
18
  CONCURRENT_WORKERS = 5
@@ -35,6 +35,21 @@ tokenizer = None
35
 
36
 
37
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
 
39
  async def get_page_bs4(url: str, headers):
40
 
@@ -187,7 +202,9 @@ def classify_website(url):
187
  FastLanguageModel.for_inference(model) # Enable native 2x faster inference
188
 
189
 
190
- urls = [url]
 
 
191
 
192
  final_ans_dict = {}
193
  print('before scrape_websites')
 
12
  from curl_cffi.requests import AsyncSession
13
  from tqdm.asyncio import tqdm
14
  from fake_headers import Headers
15
+ from urllib.parse import urlparse, urlunparse
16
 
17
  # Limit the number of concurrent workers
18
  CONCURRENT_WORKERS = 5
 
35
 
36
 
37
 
38
def get_main_page_url(url):
    """Return the site root (``scheme://netloc``) for *url*.

    Path, params, query string and fragment are dropped so that the
    caller always works with the site's main page.

    Args:
        url: Absolute URL, or a bare ``host/path`` string without a scheme.

    Returns:
        The main page URL as a string. An empty string is returned for
        empty input. If parsing raises, a string of the form
        ``"Error processing URL: <reason>"`` is returned instead of
        propagating the exception.
        NOTE(review): that error string is not a fetchable URL; callers
        that pass the result straight to a scraper should check for it.
    """
    try:
        # Parse the given URL
        parsed_url = urlparse(url)

        # A scheme-less input such as "example.com/about" is parsed with an
        # empty scheme AND an empty netloc (the host lands in ``path``),
        # which would otherwise produce an empty main page URL. Re-parse it
        # with a default scheme so the host is recognised.
        # NOTE(review): "https" is an assumed default; confirm it suits the
        # sites being scraped.
        if (
            isinstance(url, str)
            and url.strip()
            and not parsed_url.scheme
            and not parsed_url.netloc
        ):
            parsed_url = urlparse(f"https://{url.strip().lstrip('/')}")

        # Debug output kept from the original implementation.
        print(parsed_url.netloc)

        # Construct the main page URL (scheme + netloc only)
        return urlunparse((parsed_url.scheme, parsed_url.netloc, '', '', '', ''))
    except Exception as e:
        # Best-effort contract kept from the original: report the failure
        # as a string rather than raising.
        return f"Error processing URL: {e}"
52
+
53
 
54
  async def get_page_bs4(url: str, headers):
55
 
 
202
  FastLanguageModel.for_inference(model) # Enable native 2x faster inference
203
 
204
 
205
+ main_page_url = get_main_page_url(url)
206
+
207
+ urls = [main_page_url]
208
 
209
  final_ans_dict = {}
210
  print('before scrape_websites')