Spaces:
Sleeping
Sleeping
grapplerulrich
committed on
Commit
·
8b32433
1
Parent(s):
35d7624
initial version with small test for Beautiful Soup
Browse files- .gitignore +1 -0
- beautiful-soup/app.py +67 -0
- beautiful-soup/requirements.txt +1 -0
- beautiful-soup/test.py +38 -0
- google-search/requirements.txt +1 -0
.gitignore
CHANGED
@@ -1,2 +1,3 @@
|
|
1 |
/.venv
|
2 |
.env
|
|
|
|
1 |
/.venv
|
2 |
.env
|
3 |
+
__pycache__
|
beautiful-soup/app.py
ADDED
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from bs4 import BeautifulSoup
|
2 |
+
import requests
|
3 |
+
|
4 |
+
'''
|
5 |
+
- Error handling
|
6 |
+
- Look if alternative to main tag is needed. Provide error message if main tag is not found.
|
7 |
+
- Menus are li tags with a tags within.
|
8 |
+
- li tags with text and tags should be exported
|
9 |
+
- Find divs that have text or p tags maybe other tags like divs
|
10 |
+
- Export the text
|
11 |
+
'''
|
12 |
+
|
13 |
+
# Make request and get html content.
|
14 |
+
def get_soup( url, timeout=10 ):
    """Download ``url`` and return its HTML parsed as a BeautifulSoup document.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled
            connection.

    Returns:
        A ``BeautifulSoup`` object built with the stdlib ``html.parser``.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with a 4xx/5xx status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on error statuses instead of parsing an error page as if
    # it were the requested content.
    response.raise_for_status()
    return BeautifulSoup(response.content, 'html.parser')
|
29 |
+
|
30 |
+
# Extract content from main tag.
|
31 |
+
def get_main( soup ):
    """Return the first ``<main>`` element of a parsed document.

    Args:
        soup: Parsed document (anything exposing a ``main`` attribute, such
            as a ``BeautifulSoup`` object), or ``None`` when no document
            could be fetched.

    Returns:
        The value of ``soup.main``, or ``None`` when ``soup`` is ``None``.
        For a BeautifulSoup document ``soup.main`` is ``None`` when the page
        has no ``<main>`` tag, so callers must handle a ``None`` result.
    """
    if soup is None:
        return None
    return soup.main
|
33 |
+
|
34 |
+
def is_childless( tag ):
    """Return True when ``tag`` contains no nested ``<div>`` at any depth.

    Args:
        tag: Element exposing ``find(name)``, e.g. a BeautifulSoup ``Tag``.

    Returns:
        ``True`` if no descendant ``div`` exists, ``False`` otherwise.
    """
    # find() stops at the first match, so no full list of descendants is
    # built just to check whether it is empty.
    return tag.find('div') is None
|
36 |
+
|
37 |
+
def get_divs( tag ):
    """Return the innermost ``<div>`` elements found inside ``tag``.

    Args:
        tag: Element to search (typically the ``<main>`` tag), or ``None``
            when the page had no such element.

    Returns:
        An iterator over every descendant ``div`` for which
        ``is_childless`` is true. The iterator is empty when ``tag`` is
        ``None``. It is lazy and can be consumed only once.
    """
    if tag is None:
        # Nothing to search; avoid an AttributeError on None.find_all.
        return iter(())
    # Collect all divs within the tag, then keep only the leaf ones.
    return filter( is_childless, tag.find_all('div') )
|
41 |
+
|
42 |
+
|
43 |
+
def extract_content( url ):
    """Fetch ``url`` and return the paragraph text found in its main content.

    The page is downloaded with ``get_soup``, narrowed to its ``<main>``
    element with ``get_main``, and the text of every ``<p>`` inside each
    innermost ``<div>`` (see ``get_divs``) is collected in document order.

    Args:
        url: Address of the page to scrape.

    Returns:
        A list of strings, one per ``<p>`` element. The list is empty when
        the page has no ``<main>`` tag or no matching paragraphs.
    """
    soup = get_soup( url )
    main = get_main( soup )
    if main is None:
        # No <main> element: there is nothing to extract from this page.
        return []

    content = []
    for div in get_divs( main ):
        content.extend(p.get_text() for p in div.find_all('p'))
    return content
|
48 |
+
|
49 |
+
|
50 |
+
# # Get all the divs from within the main tag.
|
51 |
+
# divs = soup.main.find_all('div')
|
52 |
+
# for div in divs:
|
53 |
+
# # Get all of the divs that do not have further divs within.
|
54 |
+
# no_child_div = len(div.find_all('div')) == 0
|
55 |
+
# if no_child_div:
|
56 |
+
# # Find all p tags in the div.
|
57 |
+
# content += [p.get_text() for p in div.find_all('p')]
|
58 |
+
# # Find all li in the div.
|
59 |
+
# for li in div.find_all('li'):
|
60 |
+
# #
|
61 |
+
# content += ''.join(li.find_all(text=True, recursive=False))
|
62 |
+
# content += ''.join(div.find_all(text=True, recursive=False))
|
63 |
+
# return content
|
64 |
+
|
65 |
+
# Manual smoke test: scrape one known page when the module is run directly.
if __name__ == '__main__':
    url = 'https://www.cms.gov/Medicare/Billing/ElectronicBillingEDITrans'
    print(extract_content(url))
|
beautiful-soup/requirements.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
beautifulsoup4
|
beautiful-soup/test.py
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import unittest
|
2 |
+
from bs4 import BeautifulSoup
|
3 |
+
import app
|
4 |
+
|
5 |
+
class BeautifulSoupTest(unittest.TestCase):
    """Small checks for the helpers in ``app``."""

    def test_beautiful_soup(self):
        """Placeholder confirming that the test runner itself works."""
        self.assertTrue(True)

    def test_main_tag(self):
        """``app.get_main`` returns the ``<main>`` element of a document."""
        # The fixture lines are kept flush-left so the string is exactly the
        # markup shown in the source.
        markup = '''
<html>
<head> </head>
<body>
<main>
<div>
<ul>
<li><a href="https://www.cms.gov/Medicare/Billing/ElectronicBillingEDITrans">Electronic Billing</a></li>
<li><a href="https://www.cms.gov/Medicare/Billing/BillingFAQs">Billing FAQs</a></li>
</ul>
</div>
<div>
<div>
<p>Paragraph</p>
<ul>
<li>List Item</li>
</ul>
Text within div
</div>
</div>
</main>
</body>
</html>
'''
        document = BeautifulSoup(markup, 'html.parser')
        main_tag = app.get_main( document )
        self.assertEqual( main_tag.name, 'main' )
|
36 |
+
|
37 |
+
# Run the test suite when this file is executed directly.
if __name__ == '__main__':
    unittest.main()
|
google-search/requirements.txt
CHANGED
@@ -1,3 +1,4 @@
|
|
1 |
streamlit
|
2 |
google
|
3 |
python-dotenv
|
|
|
|
1 |
streamlit
|
2 |
google
|
3 |
python-dotenv
|
4 |
+
beautifulsoup4
|