Spaces:
Sleeping
Sleeping
Commit
·
2f05319
1
Parent(s):
ad98547
Fix strings caching
Browse files
add body as last fallback
beautiful_soup/beautiful_soup.py
CHANGED
@@ -15,17 +15,17 @@ import requests
|
|
15 |
'''
|
16 |
|
17 |
def get_url_content( url ):
|
18 |
-
|
19 |
-
makedirs(dirname(
|
20 |
-
if exists(
|
21 |
-
with open(
|
22 |
strings = json.load( file )
|
23 |
else:
|
24 |
try:
|
25 |
strings = extract_strings( url )
|
26 |
except Exception as exception:
|
27 |
raise exception
|
28 |
-
with open(
|
29 |
json.dump( strings, file )
|
30 |
|
31 |
return strings
|
@@ -133,6 +133,11 @@ def get_main_content( soup ):
|
|
133 |
print('Has article tag.')
|
134 |
return content
|
135 |
|
|
|
|
|
|
|
|
|
|
|
136 |
return None
|
137 |
|
138 |
def get_tags_text( soup ):
|
|
|
15 |
'''
|
16 |
|
17 |
def get_url_content( url ):
|
18 |
+
file_path = 'page-content/' + uuid.uuid5( uuid.NAMESPACE_URL, url ).hex + '.json'
|
19 |
+
makedirs(dirname(file_path), exist_ok=True)
|
20 |
+
if exists( file_path ):
|
21 |
+
with open( file_path, 'r' ) as file:
|
22 |
strings = json.load( file )
|
23 |
else:
|
24 |
try:
|
25 |
strings = extract_strings( url )
|
26 |
except Exception as exception:
|
27 |
raise exception
|
28 |
+
with open( file_path, 'w' ) as file:
|
29 |
json.dump( strings, file )
|
30 |
|
31 |
return strings
|
|
|
133 |
print('Has article tag.')
|
134 |
return content
|
135 |
|
136 |
+
content = soup.find( "body" )
|
137 |
+
if content is not None:
|
138 |
+
print('Has body tag.')
|
139 |
+
return content
|
140 |
+
|
141 |
return None
|
142 |
|
143 |
def get_tags_text( soup ):
|