Spaces:
Sleeping
Sleeping
File size: 1,984 Bytes
8b32433 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 |
from bs4 import BeautifulSoup
import requests
'''
- Error handling.
- Check whether an alternative to the main tag is needed. Provide an error message if the main tag is not found.
- Menus are li tags with a tags inside them.
- li tags that contain text and tags should be exported.
- Find divs that contain text or p tags, and possibly other tags such as nested divs.
- Export the text.
'''
# Make request and get html content.
def get_soup(url, timeout=10):
    """Download *url* and parse the response body into a BeautifulSoup tree.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up; passed
            straight through to ``requests.get``.

    Returns:
        A ``BeautifulSoup`` object built with the ``html.parser`` backend.

    Raises:
        requests.exceptions.RequestException: If the request fails, times
            out, or the server answers with a 4xx/5xx status code.
    """
    # Without an explicit timeout a stalled server would block forever.
    response = requests.get(url, timeout=timeout)
    # Fail loudly on HTTP errors instead of scraping an error page.
    response.raise_for_status()
    return BeautifulSoup(response.content, 'html.parser')
# Extract content from main tag.
def get_main(soup):
    """Return the ``main`` attribute of *soup*.

    For a BeautifulSoup tree this is the first ``<main>`` tag in the
    document, or ``None`` when the document has no such tag.
    """
    main_tag = soup.main
    return main_tag
def is_childless(tag):
    """Report whether *tag* has no ``div`` elements anywhere beneath it.

    Only ``div`` descendants are considered; other kinds of child tags do
    not affect the result.
    """
    nested_divs = tag.find_all('div')
    return not nested_divs
def get_divs(tag):
    """Return a lazy iterator over the innermost ``div`` elements in *tag*.

    Every ``div`` found beneath *tag* is considered, and only those for
    which ``is_childless`` is true (no further ``div`` inside) are kept.
    """
    return filter(is_childless, tag.find_all('div'))
def extract_content(url):
    """Collect the paragraph text found in the main section of a web page.

    The page at *url* is downloaded and parsed, its ``<main>`` element is
    located, and the text of every ``<p>`` tag inside each innermost
    ``div`` (a ``div`` containing no further ``div``) is gathered.

    Args:
        url: Address of the page to scrape.

    Returns:
        A list of strings, one per ``<p>`` tag, in document order. The list
        is empty when the page has no ``<main>`` element.
    """
    soup = get_soup(url)
    main = get_main(soup)
    if main is None:
        # No <main> element on the page, so there is nothing to search.
        return []

    content = []
    # Walk every innermost div; the previous code referenced an undefined
    # name here instead of iterating over the divs it had just collected.
    for div in get_divs(main):
        content.extend(p.get_text() for p in div.find_all('p'))
    return content
# # Get all the divs from within the main tag.
# divs = soup.main.find_all('div')
# for div in divs:
# # Get all of the divs that do not have further divs within.
# no_child_div = len(div.find_all('div')) == 0
# if no_child_div:
# # Find all p tags in the div.
# content += [p.get_text() for p in div.find_all('p')]
# # Find all li in the div.
# for li in div.find_all('li'):
# #
# content += ''.join(li.find_all(text=True, recursive=False))
# content += ''.join(div.find_all(text=True, recursive=False))
# return content
# Script entry point: scrape a sample page and print the extracted text.
# The guard must compare against '__main__' (with trailing underscores);
# otherwise the body never runs when the file is executed directly.
if __name__ == '__main__':
    url = 'https://www.cms.gov/Medicare/Billing/ElectronicBillingEDITrans'
    print(extract_content(url))
|