Spaces: Sleeping
grapplerulrich committed
Commit · 151c2dd
1 Parent(s): 152531a

Add caching and save search results url and HTML

Files changed:
- .gitignore +2 -0
- beautiful_soup/app.py +46 -37
- beautiful_soup/test.py +104 -8
- main.py +59 -0
- requirements.txt +5 -90
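The commit introduces two flat-file caches: fetched HTML pages are stored under web-pages/, keyed by a UUIDv5 hash of the page URL, and Google search responses are stored under search-urls/, keyed by the slugified query (both directories are git-ignored below). A minimal sketch of the key scheme using the same libraries as the diff; the cache_paths helper is illustrative and not part of the commit:

    import uuid
    from slugify import slugify  # python-slugify, added to requirements.txt in this commit

    def cache_paths( url, query ):
        # Page cache: deterministic file name derived from the URL (same scheme as get_soup()).
        page_file = 'web-pages/' + uuid.uuid5( uuid.NAMESPACE_URL, url ).hex + '.html'
        # Search cache: one JSON file per query (same scheme as main.py).
        query_file = 'search-urls/' + slugify( query ) + '.json'
        return page_file, query_file

    print( cache_paths( 'https://example.com', 'electronic billing' ) )
    # -> ('web-pages/<32 hex chars>.html', 'search-urls/electronic-billing.json')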
.gitignore CHANGED
@@ -1,3 +1,5 @@
 /.venv
 .env
 __pycache__
+/search-urls
+/web-pages
beautiful_soup/app.py CHANGED
@@ -1,5 +1,7 @@
 from bs4 import BeautifulSoup
 import requests
+import uuid
+from os.path import exists
 
 '''
 - Error handing
@@ -12,56 +14,63 @@ import requests
 
 # Make request and get html content.
 def get_soup( url ):
-    # try:
-    #     request = requests.get(url)
-    # except:
-    #     print('Unable to retrieve content, skipping URL')
-    #     return
-
-    # if not request.ok:
-    #     print("Unable to retrieve content, skipping URL. Status code: {}".format( request.status_code ))
-    #     return
+    file_path = 'web-pages/' + uuid.uuid5( uuid.NAMESPACE_URL, url ).hex + '.html'
+    if ( exists( file_path ) ):
+        with open( file_path, 'r' ) as web_page:
+            html = web_page.read()
+    else:
+        try:
+            request = requests.get(url)
+        except:
+            print('Unable to retrieve content, skipping URL')
+            return
+        if not request.ok:
+            print( "Unable to retrieve content, skipping URL. Status code: {}".format( request.status_code ) )
+            return
+        if not request.content:
+            print(request.content)
+            return
+        html = request.content
+        with open( file_path, 'wb' ) as file:
+            file.write( html )
+
+    return BeautifulSoup(html, 'html.parser')
 
 # Extract content from main tag.
 def get_main( soup ):
     return soup.main
 
-def
+def get_deepest_divs( tag ):
+    # Get all the divs from within a tag.
+    return [div for div in tag.findAll('div') if not div.find('div')]
+
+def get_tag_text( tags ):
+    text = ''
+    for tag in tags:
+        print(tag.find_all('li'))
+        # text += [p.get_text() for p in tag.find_all('p)]
+    return text
 
-def
+def get_list_text( tags ):
+    list_items = []
+    for tag in tags:
+        list_items = tag.find_all(find_direct_text)
+    return list_items
+
+def find_direct_text( tag ):
+    return tag.name == 'li' or tag.name == 'p' or tag.name == 'h2' or tag.name == 'h3'
 
 def extract_content( url ):
     soup = get_soup( url )
+    if ( soup == None ):
+        return None
     main = get_main( soup )
-    # # Get all the divs from within the main tag.
-    # divs = soup.main.find_all('div')
-    # for div in divs:
-    #     # Get all of the divs that do not have further divs within.
-    #     no_child_div = len(div.find_all('div')) == 0
-    #     if no_child_div:
-    #         # Find all p tags in the div.
-    #         content += [p.get_text() for p in div.find_all('p')]
-    #         # Find all li in the div.
-    #         for li in div.find_all('li'):
-    #             #
-    #             content += ''.join(li.find_all(text=True, recursive=False))
-    #     content += ''.join(div.find_all(text=True, recursive=False))
-    # return content
+    if ( main == None ):
+        return 'No main tag found.'
+    return ''.join([' ' + tag.get_text().strip() for tag in main.find_all( find_direct_text )])
 
 if __name__ == '__main__':
     url = 'https://www.cms.gov/Medicare/Billing/ElectronicBillingEDITrans'
     print(extract_content(url))
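As a usage note, get_soup() writes its cache with open( file_path, 'wb' ) but never creates the web-pages/ directory, so the (git-ignored) directory must exist before the first uncached fetch. A minimal sketch of calling the extractor on its own, assuming the package layout of this repo:

    from pathlib import Path
    from beautiful_soup.app import extract_content  # same import main.py uses

    # The cache directory is git-ignored and not created by the code itself.
    Path('web-pages').mkdir(exist_ok=True)

    text = extract_content('https://www.cms.gov/Medicare/Billing/ElectronicBillingEDITrans')
    print(text if text else 'Request failed, nothing extracted.')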
beautiful_soup/test.py CHANGED
@@ -1,15 +1,13 @@
 import unittest
 from bs4 import BeautifulSoup
-import
+import beautiful_soup
 
 class BeautifulSoupTest(unittest.TestCase):
-    def test_beautiful_soup(self):
-        self.assertTrue(True)
 
-    def
-        html = '''
+    def setUp(self):
+        self.html = '''
         <html>
-        <head
+        <head></head>
         <body>
         <main>
         <div>
@@ -31,8 +29,106 @@ class BeautifulSoupTest(unittest.TestCase):
         </body>
         </html>
         '''
 
+    def test_main_tag(self):
+        soup = BeautifulSoup( self.html, 'html.parser' )
+        self.assertEqual( beautiful_soup.get_main( soup ).name, 'main' )
+
+        soup = BeautifulSoup( "", 'html.parser' )
+        self.assertEqual( beautiful_soup.get_main( soup ).name, 'main' )
+
+    def test_has_no_div_childre(self):
+        childless = '''
+            <html>
+            <body>
+                <div><p>Text in div.</p></div>
+            </body>
+            </html>
+        '''
+        soup = BeautifulSoup( childless, 'html.parser' )
+        # self.assertFalse( beautiful_soup.has_no_div_children( soup.body ) )
+        # self.assertTrue( beautiful_soup.has_no_div_children( soup.body.div ) )
+
+        nested_div = '''
+            <html>
+            <body>
+                <div>
+                    <div>Text in paragraph.</div>
+                </div>
+            </body>
+            </html>
+        '''
+        soup = BeautifulSoup( nested_div, 'html.parser' )
+        # self.assertFalse( beautiful_soup.has_no_div_children( soup.body.div ) )
+
+    def test_get_deepest_divs(self):
+        nested_div = '''
+            <html>
+            <body>
+                <div>
+                    <div><p>Text in paragraph.</p></div>
+                </div>
+            </body>
+            </html>
+        '''
+        soup = BeautifulSoup( nested_div, 'html.parser' )
+        self.assertEqual( beautiful_soup.get_deepest_divs( soup.body )[0].text, 'Text in paragraph.' )
+
+    def test_list(self):
+        nested_div = '''
+            <html>
+            <body>
+                <div>
+                    <ul>
+                        <li>Text in list.</li>
+                        <li><a href"">Link in list.</a></li>
+                        <li>Text with <a href"">Link</a> in list.</li>
+                    </ul>
+                </div>
+            </body>
+            </html>
+        '''
+        soup = BeautifulSoup( nested_div, 'html.parser' )
+        divs = beautiful_soup.get_deepest_divs( soup.body )
+        # self.assertEqual( beautiful_soup.get_list_text( divs )[0], 'Text in list.' )
+
+    def test_exlcude_links(self):
+        nested_div = '''
+            <li><a href='somelink'>I DONT WANT THIS</a></li>
+            <li>blablalba <a href='both'>I WANT THIS</a> blalba</li>
+            <li><a href='right'>I WANT THIS</a> blalba</li>
+            <li>blablalba <a href='left'>I WANT THIS</a></li>
+
+            <p><a href='somelink'>I WANT THIS</a></p>
+            <p>blablalba <a href='both'>I WANT THIS</a> blalba</p>
+            <p><a href='right'>I WANT THIS</a> blalba</p>
+            <p>blablalba <a href='left'>I WANT THIS</a></p>
+        '''
+        soup = BeautifulSoup( nested_div, 'html.parser' )
+
+        list_items = soup.find_all(beautiful_soup.find_direct_text)
+        results = [
+            'blablalba I WANT THIS blalba',
+            'I WANT THIS blalba',
+            'blablalba I WANT THIS',
+            'I WANT THIS',
+            'blablalba I WANT THIS blalba',
+            'I WANT THIS blalba',
+            'blablalba I WANT THIS'
+        ]
+
+        print(list_items)
+        # for item in list_items:
+        #     print('item.get_text(): ' + item.get_text())
+
+        # help(list_items)
+        for i, item in enumerate(list_items):
+            self.assertEqual( item.get_text(), results[i] )
+
+        # self.assertEqual( list_items[0].get_text(), 'blablalba I WANT THIS blalba' )
+        # self.assertEqual( list_items[1].get_text(), 'I WANT THI Sblalba' )
+        # self.assertEqual( list_items[2].get_text(), 'blablalba I WANT THIS' )
 
 if __name__ == '__main__':
     unittest.main()
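The test module relies on the standard unittest runner, so it can be driven programmatically or from the command line; note that it does `import beautiful_soup` and calls e.g. beautiful_soup.get_main, which assumes those helpers are exposed at package level rather than only in beautiful_soup.app. A minimal invocation sketch, run from the repository root:

    import unittest

    # Discover test.py inside the beautiful_soup package and run it verbosely.
    suite = unittest.defaultTestLoader.discover('beautiful_soup', pattern='test.py')
    unittest.TextTestRunner(verbosity=2).run(suite)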
main.py ADDED
@@ -0,0 +1,59 @@
+import streamlit as st
+from dotenv import load_dotenv
+from googleapiclient.discovery import build
+from functools import cache
+from slugify import slugify
+from os import getenv
+from os.path import exists
+import json
+
+from beautiful_soup.app import extract_content
+
+@cache
+def google_search( query ):
+    api_key = getenv('GOOGLE_SEARCH_API_KEY')
+    # cx = os.getenv('GOOGLE_SEARCH_ENGIN_ID')
+    service = build(
+        "customsearch",
+        "v1",
+        developerKey=api_key,
+        cache_discovery=False
+    )
+
+    return service.cse().list(
+        q=query,
+        cx='05048cc2df6134a06',
+    ).execute()
+
+def main():
+    load_dotenv()
+    st.title('Google Search')
+    query = st.text_input('Search query')
+
+    if ( query ):
+        file_path = 'search-urls/' + slugify( query ) + '.json'
+
+        if ( exists( file_path ) ):
+            with open( file_path, 'r' ) as results_file:
+                results = json.load(results_file)
+        else:
+            search_result = google_search( query )
+            if( int( search_result['searchInformation']['totalResults'] ) > 0 ):
+                results = search_result['items']
+                with open( file_path, 'w' ) as results_file:
+                    json.dump( results, results_file )
+            else:
+                results = []
+
+        if ( len( results ) == 0 ) :
+            st.write( 'No results found.' )
+
+        try:
+            for item in results:
+                st.write(item['link'])
+                st.write(extract_content( item['link'] ))
+        except Exception as e:
+            st.exception(e)
+
+if __name__ == '__main__':
+    main()
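To run the app, main() expects GOOGLE_SEARCH_API_KEY to be loadable via python-dotenv (the .env file is git-ignored) and writes search results into search-urls/, which it does not create itself; the UI is then started with `streamlit run main.py`. A small pre-flight sketch, assuming a .env file in the project root:

    from dotenv import load_dotenv
    from os import getenv
    from pathlib import Path

    load_dotenv()  # reads .env from the working directory
    assert getenv('GOOGLE_SEARCH_API_KEY'), 'GOOGLE_SEARCH_API_KEY missing from .env'

    # Cache directories are git-ignored, so create them before the first run.
    for cache_dir in ('search-urls', 'web-pages'):
        Path(cache_dir).mkdir(exist_ok=True)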
requirements.txt CHANGED
@@ -1,90 +1,5 @@
-attrs==21.4.0
-backcall==0.2.0
-beautifulsoup4==4.10.0
-bleach==4.1.0
-blinker==1.4
-cachetools==5.0.0
-certifi==2021.10.8
-cffi==1.15.0
-charset-normalizer==2.0.12
-click==8.0.4
-debugpy==1.6.0
-decorator==5.1.1
-defusedxml==0.7.1
-entrypoints==0.4
-executing==0.8.3
-gitdb==4.0.9
-GitPython==3.1.27
-idna==3.3
-importlib-metadata==4.11.3
-ipykernel==6.11.0
-ipython==8.2.0
-ipython-genutils==0.2.0
-ipywidgets==7.7.0
-jedi==0.18.1
-Jinja2==3.1.1
-jsonschema==4.4.0
-jupyter-client==7.2.1
-jupyter-core==4.9.2
-jupyterlab-pygments==0.1.2
-jupyterlab-widgets==1.1.0
-MarkupSafe==2.1.1
-matplotlib-inline==0.1.3
-mistune==0.8.4
-nbclient==0.5.13
-nbconvert==6.4.5
-nbformat==5.2.0
-nest-asyncio==1.5.4
-notebook==6.4.10
-numpy==1.22.3
-packaging==21.3
-pandas==1.4.1
-pandocfilters==1.5.0
-parso==0.8.3
-pexpect==4.8.0
-pickleshare==0.7.5
-Pillow==9.0.1
-prometheus-client==0.13.1
-prompt-toolkit==3.0.28
-protobuf==3.19.4
-psutil==5.9.0
-ptyprocess==0.7.0
-pure-eval==0.2.2
-pyarrow==7.0.0
-pycparser==2.21
-pydeck==0.7.1
-Pygments==2.11.2
-Pympler==1.0.1
-pyparsing==3.0.7
-pyrsistent==0.18.1
-python-dateutil==2.8.2
-pytz==2022.1
-pytz-deprecation-shim==0.1.0.post0
-pyzmq==22.3.0
-requests==2.27.1
-semver==2.13.0
-Send2Trash==1.8.0
-six==1.16.0
-smmap==5.0.0
-soupsieve==2.3.1
-stack-data==0.2.0
-streamlit==1.8.1
-terminado==0.13.3
-testpath==0.6.0
-toml==0.10.2
-toolz==0.11.2
-tornado==6.1
-traitlets==5.1.1
-tzdata==2022.1
-tzlocal==4.1
-urllib3==1.26.9
-validators==0.18.2
-wcwidth==0.2.5
-webencodings==0.5.1
-widgetsnbextension==3.6.0
-zipp==3.7.0
+streamlit
+google
+python-dotenv
+beautifulsoup4
+python-slugify