Spaces:
Runtime error
Runtime error
Commit
·
0125da1
1
Parent(s):
5505694
Implement a cutoff
Browse files
- scrape_website.py +4 -3
scrape_website.py
CHANGED
@@ -1,6 +1,7 @@
|
|
1 |
import requests
|
2 |
from bs4 import BeautifulSoup
|
3 |
|
|
|
4 |
|
5 |
def process_webpage(url:str):
|
6 |
# A set to keep track of visited pages
|
@@ -40,11 +41,11 @@ def process_webpage(url:str):
|
|
40 |
|
41 |
# make main page as first item
|
42 |
text_list.reverse()
|
43 |
-
|
44 |
-
page_content = "\n".join(
|
45 |
# Print the text content of the landing page and all child pages
|
46 |
print(page_content)
|
47 |
-
return
|
48 |
|
49 |
|
50 |
if __name__ == '__main__':
|
|
|
1 |
import requests
|
2 |
from bs4 import BeautifulSoup
|
3 |
|
4 |
+
TOKEN_CUT_OFF = 2500
|
5 |
|
6 |
def process_webpage(url:str):
|
7 |
# A set to keep track of visited pages
|
|
|
41 |
|
42 |
# make main page as first item
|
43 |
text_list.reverse()
|
44 |
+
text_list_cut_off = text_list[:TOKEN_CUT_OFF]
|
45 |
+
page_content = "\n".join(text_list_cut_off)
|
46 |
# Print the text content of the landing page and all child pages
|
47 |
print(page_content)
|
48 |
+
return
|
49 |
|
50 |
|
51 |
if __name__ == '__main__':
|