Spaces:
Sleeping
Sleeping
from bs4 import BeautifulSoup | |
import requests | |
'''
- Error handling.
- Check whether an alternative to the main tag is needed. Provide an error message if the main tag is not found.
- Menus are li tags with a tags inside them.
- li tags that contain text and tags should be exported.
- Find divs that contain text or p tags (maybe other tags, such as nested divs).
- Export the text.
'''
# Make request and get html content. | |
def get_soup(url, timeout=10):
    """Fetch *url* and parse the response body as HTML.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely.

    Returns:
        A ``BeautifulSoup`` tree built from the response body with the
        ``html.parser`` backend.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with a 4xx/5xx status code.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error status instead of parsing an error page as
    # though it were the requested content.
    response.raise_for_status()
    return BeautifulSoup(response.content, 'html.parser')
# Extract content from main tag. | |
def get_main(soup):
    """Return whatever ``soup.main`` resolves to.

    With a BeautifulSoup tree this is expected to be the document's
    ``<main>`` element, or ``None`` when the page has none -- callers
    should check for ``None`` before using the result.
    """
    main_tag = soup.main
    return main_tag
def is_childless(tag):
    """Tell whether ``tag.find_all('div')`` comes back empty.

    Returns ``True`` when *tag* reports no ``div`` elements beneath it and
    ``False`` otherwise.
    """
    nested_divs = tag.find_all('div')
    return not nested_divs
def get_divs(tag):
    """Return the ``div`` elements under *tag* that contain no further ``div``.

    All divs reported by ``tag.find_all('div')`` are collected up front and
    then filtered lazily through ``is_childless``. The result is a ``filter``
    iterator, so it can be consumed only once.
    """
    return filter(is_childless, tag.find_all('div'))
def extract_content(url):
    """Download *url* and return the paragraph text found in its leaf divs.

    Args:
        url: Address of the page to scrape.

    Returns:
        A list holding the text of every ``<p>`` found inside the divs that
        ``get_divs`` yields for the page's ``<main>`` element.

    Raises:
        ValueError: If the page has no ``<main>`` element.
    """
    soup = get_soup(url)
    main = get_main(soup)
    if main is None:
        # Without this guard the failure shows up later as an opaque
        # AttributeError on None.
        raise ValueError('No <main> tag found in {}'.format(url))

    # TODO: also export the direct (non-recursive) text of <li> tags and of
    # the divs themselves, as sketched in the earlier draft of this function.
    content = []
    for div in get_divs(main):
        content.extend(p.get_text() for p in div.find_all('p'))
    return content
# Script entry point: scrape a sample page and print the extracted text.
# The guard must compare against '__main__' (with trailing underscores),
# otherwise this body never runs.
if __name__ == '__main__':
    url = 'https://www.cms.gov/Medicare/Billing/ElectronicBillingEDITrans'
    print(extract_content(url))