Spaces:
Sleeping
Sleeping
import os
import uuid
from os.path import exists

import requests
from bs4 import BeautifulSoup
'''
- Error handling
- Check whether an alternative to the main tag is needed. Provide an error message if the main tag is not found.
- Menus are li tags with a tags within.
- li tags with text and tags should be exported.
- Find divs that have text or p tags, and maybe other tags such as divs.
- Export the text.
'''
# Make request and get html content. | |
def get_soup( url ): | |
file_path = 'web-pages/' + uuid.uuid5( uuid.NAMESPACE_URL, url ).hex + '.html' | |
if ( exists( file_path ) ): | |
with open( file_path, 'r' ) as web_page: | |
html = web_page.read() | |
else: | |
try: | |
request = requests.get(url) | |
except: | |
print('Unable to retrieve content, skipping URL') | |
return | |
if not request.ok: | |
print( "Unable to retrieve content, skipping URL. Status code: {}".format( request.status_code ) ) | |
return | |
if not request.content: | |
print(request.content) | |
return | |
html = request.content | |
with open( file_path, 'wb' ) as file: | |
file.write( html ) | |
return BeautifulSoup(html, 'html.parser') | |
# Look up the page's main tag.
def get_main(soup):
    """Return the ``main`` attribute of *soup*.

    For a BeautifulSoup document this is the first ``<main>`` tag, or
    ``None`` when the page has none.
    """
    main_tag = soup.main
    return main_tag
def get_deepest_divs(tag):
    """Return the innermost ``div`` elements found inside *tag*.

    Args:
        tag: A BeautifulSoup tag (or document) to search.

    Returns:
        A list of every ``div`` descendant of *tag* that does not itself
        contain another ``div``.
    """
    # find_all is the current spelling; findAll is a deprecated alias and the
    # rest of this file already uses find_all.
    return [div for div in tag.find_all('div') if not div.find('div')]
def get_tag_text(tags):
    """Return the concatenated text of every ``<p>`` inside *tags*.

    Args:
        tags: Iterable of BeautifulSoup tags to search.

    Returns:
        The text of all ``p`` descendants, in order, joined without a
        separator. An empty string when *tags* is empty or holds no
        paragraphs.
    """
    # The previous version only printed each tag's <li> list for debugging and
    # always returned ''. This follows the intent of its commented-out line.
    return ''.join(
        paragraph.get_text()
        for tag in tags
        for paragraph in tag.find_all('p')
    )
def get_list_text(tags):
    """Collect the text-bearing elements found inside each of *tags*.

    Args:
        tags: Iterable of BeautifulSoup tags to search.

    Returns:
        A list of every descendant accepted by ``find_direct_text``, across
        all of *tags*, in order. Empty when *tags* is empty.
    """
    list_items = []
    for tag in tags:
        # Accumulate; plain assignment here used to discard the matches from
        # every tag except the last one.
        list_items.extend(tag.find_all(find_direct_text))
    return list_items
def find_direct_text(tag):
    """Return True when *tag* is an ``li``, ``p``, ``h2`` or ``h3`` element.

    Intended as a filter function for ``find_all``; only ``tag.name`` is
    inspected.
    """
    return tag.name in ('li', 'p', 'h2', 'h3')
def extract_content(url):
    """Return the readable text inside the ``<main>`` tag of the page at *url*.

    Args:
        url: Address of the page to extract text from.

    Returns:
        ``None`` when the page could not be loaded, the string
        ``'No main tag found.'`` when the page has no ``<main>`` tag, and
        otherwise the stripped text of every element accepted by
        ``find_direct_text``, each prefixed with a single space.
    """
    soup = get_soup(url)
    # Identity checks: `== None` would go through BeautifulSoup's own __eq__.
    if soup is None:
        return None

    main = get_main(soup)
    if main is None:
        return 'No main tag found.'

    # NOTE(review): find_all searches all descendants, so a matching element
    # nested inside another match (e.g. a <p> within an <li>) contributes its
    # text twice. Confirm whether that is wanted.
    return ''.join(
        ' ' + tag.get_text().strip()
        for tag in main.find_all(find_direct_text)
    )
if __name__ == '__main__':
    # Script entry point: extract and print the text of one sample page.
    url = 'https://www.cms.gov/Medicare/Billing/ElectronicBillingEDITrans'
    extracted = extract_content(url)
    print(extracted)