Spaces:

grapplerulrich
/

raccoon

Sleeping

App Files Files

raccoon / beautiful-soup /app.py

grapplerulrich

inital version with small test for beautiful soup

8b32433 almost 3 years ago

raw

history blame

1.98 kB

	from bs4 import BeautifulSoup
	import requests

	'''
	- Error handling
	- Look if alternative to main tag is needed. Provide error message if main tag is not found.
	- Menus are li tags with a tags within.
	- li tags with text and tags should be exported
	- Find divs that have text or p tags maybe other tags like divs
	- Export the text
	'''

	# Make request and get html content.
	def get_soup( url, timeout=30 ):
	    """Download ``url`` and return its HTML parsed into a BeautifulSoup tree.

	    Args:
	        url: Address of the page to fetch.
	        timeout: Seconds to wait for the server before giving up. Without
	            a timeout ``requests.get`` may block indefinitely.

	    Returns:
	        A ``BeautifulSoup`` object built with the ``html.parser`` backend.

	    Raises:
	        requests.exceptions.RequestException: If the request fails, times
	            out, or the server answers with a 4xx/5xx status code.
	    """
	    response = requests.get(url, timeout=timeout)
	    # Fail loudly on error statuses instead of parsing an error page as if
	    # it were the requested content.
	    response.raise_for_status()
	    return BeautifulSoup(response.content, 'html.parser')

	# Extract content from main tag.
	def get_main( soup ):
	    """Return the ``main`` attribute of ``soup``.

	    On a BeautifulSoup document this attribute lookup yields the first
	    ``<main>`` tag, or ``None`` when the document has no such tag.
	    """
	    main_tag = soup.main
	    return main_tag

	def is_childless( tag ):
	    """Return True when ``tag`` contains no ``div`` descendants."""
	    nested_divs = tag.find_all('div')
	    # An empty result list is falsy, so this matches a zero-length check.
	    return not nested_divs

	def get_divs( tag ):
	    """Return a lazy iterator over the leaf ``div`` tags inside ``tag``.

	    Every ``div`` found within ``tag`` is a candidate; only those for which
	    ``is_childless`` is true (no further ``div`` inside) are yielded.
	    """
	    return filter( is_childless, tag.find_all('div') )


	def extract_content( url ):
	    """Return the text of every ``<p>`` inside the leaf divs of the page's ``<main>``.

	    Args:
	        url: Address of the page to scrape.

	    Returns:
	        A list of strings, one per ``<p>`` tag found in a ``div`` that
	        contains no further ``div``. Empty when there are no such tags.

	    Raises:
	        ValueError: If the page has no ``<main>`` tag.
	    """
	    soup = get_soup( url )
	    main = get_main( soup )
	    if main is None:
	        # Without this guard the failure would surface as an opaque
	        # AttributeError on None further down.
	        raise ValueError('No <main> tag found at {}'.format( url ))
	    content = []
	    # The previous version referenced an undefined name ``div`` and never
	    # used the filtered divs; walk each leaf div explicitly.
	    for div in get_divs( main ):
	        content.extend( p.get_text() for p in div.find_all('p') )
	    return content


	# # Get all the divs from within the main tag.
	# divs = soup.main.find_all('div')
	# for div in divs:
	# # Get all of the divs that do not have further divs within.
	# no_child_div = len(div.find_all('div')) == 0
	# if no_child_div:
	# # Find all p tags in the div.
	# content += [p.get_text() for p in div.find_all('p')]
	# # Find all li in the div.
	# for li in div.find_all('li'):
	# #
	# content += ''.join(li.find_all(text=True, recursive=False))
	# content += ''.join(div.find_all(text=True, recursive=False))
	# return content

	# Script entry point. The guard must compare against '__main__' (with the
	# trailing underscores); the previous '__main' never matched, so nothing ran.
	if __name__ == '__main__':
	    url = 'https://www.cms.gov/Medicare/Billing/ElectronicBillingEDITrans'
	    print(extract_content(url))