Spaces:
Sleeping
Sleeping
grapplerulrich
committed on
Commit
·
526644d
1
Parent(s):
37ee6a5
Cache page content
Browse files
- .gitignore +1 -0
- beautiful_soup/app.py +15 -0
- main.py +2 -2
.gitignore
CHANGED
@@ -3,3 +3,4 @@
|
|
3 |
__pycache__
|
4 |
/search-results
|
5 |
/web-pages
|
|
|
|
3 |
__pycache__
|
4 |
/search-results
|
5 |
/web-pages
|
6 |
+
/page-content
|
beautiful_soup/app.py
CHANGED
@@ -130,6 +130,21 @@ def extract_content( url ):
|
|
130 |
return get_tags_text( content )
|
131 |
# return ''.join([' ' + tag.get_text().strip() for tag in main.find_all( find_direct_text )])
|
132 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
133 |
if __name__ == '__main__':
|
134 |
url = 'https://www.cms.gov/Medicare/Billing/ElectronicBillingEDITrans'
|
135 |
print(extract_content(url))
|
|
|
130 |
return get_tags_text( content )
|
131 |
# return ''.join([' ' + tag.get_text().strip() for tag in main.find_all( find_direct_text )])
|
132 |
|
133 |
+
def get_url_content( url ):
    """Return the extracted content of a web page, cached on disk.

    The cache file is 'page-content/<hex>.txt', where <hex> is the UUIDv5
    (URL namespace) of ``url``, so the same URL always maps to the same file.

    Args:
        url: Address of the page; also used to derive the cache file name.

    Returns:
        The content read from the cache file when it exists, otherwise the
        value returned by ``extract_content( url )``, which is then written
        to the cache file.

    Raises:
        Any exception raised by ``extract_content`` propagates unchanged;
        nothing is written to the cache in that case.
    """
    # Local import: the module's import block is not visible in this view.
    import os

    cache_dir = 'page-content'
    file_path = cache_dir + '/' + uuid.uuid5( uuid.NAMESPACE_URL, url ).hex + '.txt'

    # Cache hit: return the stored text without fetching the page again.
    if exists( file_path ):
        # Explicit UTF-8 so reading does not depend on the platform default.
        with open( file_path, 'r', encoding='utf-8' ) as cache_file:
            return cache_file.read()

    # Cache miss: errors are left to propagate to the caller as they are.
    content = extract_content( url )

    # The cache directory is git-ignored, so it is missing on a fresh checkout.
    os.makedirs( cache_dir, exist_ok=True )
    with open( file_path, 'w', encoding='utf-8' ) as cache_file:
        cache_file.write( content )

    return content
|
147 |
+
|
148 |
if __name__ == '__main__':
|
149 |
url = 'https://www.cms.gov/Medicare/Billing/ElectronicBillingEDITrans'
|
150 |
print(extract_content(url))
|
main.py
CHANGED
@@ -8,7 +8,7 @@ from dotenv import load_dotenv
|
|
8 |
from googleapiclient.discovery import build
|
9 |
from slugify import slugify
|
10 |
|
11 |
-
from beautiful_soup.app import
|
12 |
|
13 |
@cache
|
14 |
def google_search_api_request( query ):
|
@@ -60,7 +60,7 @@ def main():
|
|
60 |
for result in results:
|
61 |
st.write(result['link'])
|
62 |
try:
|
63 |
-
st.write(
|
64 |
except Exception as exception:
|
65 |
st.exception(exception)
|
66 |
|
|
|
8 |
from googleapiclient.discovery import build
|
9 |
from slugify import slugify
|
10 |
|
11 |
+
from beautiful_soup.app import get_url_content
|
12 |
|
13 |
@cache
|
14 |
def google_search_api_request( query ):
|
|
|
60 |
for result in results:
|
61 |
st.write(result['link'])
|
62 |
try:
|
63 |
+
st.write( get_url_content( result['link'] ) )
|
64 |
except Exception as exception:
|
65 |
st.exception(exception)
|
66 |
|