Spaces:

grapplerulrich
/

raccoon

Sleeping

App Files Files

raccoon / beautiful_soup /app.py

grapplerulrich

Add caching and save search results url and HTML

151c2dd almost 3 years ago

raw

history blame

2.26 kB

	from bs4 import BeautifulSoup
	import requests
	import uuid
	from os.path import exists

	'''
	- Error handling
	- Look if alternative to main tag is needed. Provide error message if main tag is not found.
	- Menus are li tags with a tags within.
	- li tags with text and tags should be exported
	- Find divs that have text or p tags maybe other tags like divs
	- Export the text
	'''

	# Make request and get html content.
	def get_soup( url ):
	    """Return the page at *url* parsed with BeautifulSoup, using an on-disk cache.

	    The raw HTML is cached under ``web-pages/`` in a file named after the
	    hex UUIDv5 (URL namespace) of *url*. A cached copy is used when it
	    exists; otherwise the page is downloaded and written to the cache.

	    Returns None, after printing a message, when the request raises, the
	    response status is not OK, or the response body is empty.
	    """
	    import os  # Local import: only needed to create the cache directory.
	    cache_dir = 'web-pages'
	    file_path = cache_dir + '/' + uuid.uuid5( uuid.NAMESPACE_URL, url ).hex + '.html'
	    if exists( file_path ):
	        # Read bytes, matching what is written below, so the parser detects the
	        # encoding itself instead of relying on the locale's default codec.
	        with open( file_path, 'rb' ) as web_page:
	            return BeautifulSoup( web_page.read(), 'html.parser' )
	    try:
	        # A timeout keeps one unresponsive host from hanging the whole run.
	        response = requests.get( url, timeout=30 )
	    except requests.RequestException:
	        # Only request failures are skipped; Ctrl-C and programming errors propagate.
	        print('Unable to retrieve content, skipping URL')
	        return None
	    if not response.ok:
	        print( "Unable to retrieve content, skipping URL. Status code: {}".format( response.status_code ) )
	        return None
	    html = response.content
	    if not html:
	        print('Unable to retrieve content, skipping URL. Response body is empty.')
	        return None
	    # The cache directory may not exist yet on a first run.
	    os.makedirs( cache_dir, exist_ok=True )
	    with open( file_path, 'wb' ) as cache_file:
	        cache_file.write( html )
	    return BeautifulSoup( html, 'html.parser' )

	# Extract content from main tag.
	def get_main( soup ):
	    """Return the ``main`` attribute of *soup* (the parsed document's main tag)."""
	    main_tag = soup.main
	    return main_tag

	def get_deepest_divs( tag ):
	    """Return the div descendants of *tag* that contain no nested div.

	    Every div found inside *tag* is considered; a div is kept only when
	    looking for another div inside it finds nothing.
	    """
	    # find_all is the current Beautiful Soup spelling (findAll is a deprecated
	    # alias) and matches the other helpers in this file.
	    return [div for div in tag.find_all('div') if not div.find('div')]

	def get_tag_text( tags ):
	    """Print the li elements found in each of *tags* and return an empty string.

	    NOTE: text collection is not implemented; the result is always ''.
	    """
	    for node in tags:
	        print( node.find_all('li') )
	    # TODO: gather the text of the p tags inside each tag into the result.
	    return ''

	def get_list_text( tags ):
	    """Return the elements accepted by find_direct_text from every tag in *tags*.

	    The matches of all tags are combined into one list, in iteration
	    order. An empty *tags* yields an empty list.
	    """
	    list_items = []
	    for tag in tags:
	        # Accumulate: assigning here would keep only the last tag's matches.
	        list_items.extend( tag.find_all( find_direct_text ) )
	    return list_items

	def find_direct_text( tag ):
	    """Return True when *tag*.name is one of 'li', 'p', 'h2' or 'h3'."""
	    return tag.name in ( 'li', 'p', 'h2', 'h3' )

	def extract_content( url ):
	    """Return the text found inside the main tag of the page at *url*.

	    Returns None when get_soup yields no document, and the string
	    'No main tag found.' when get_main finds nothing. Otherwise the
	    stripped text of each element accepted by find_direct_text is
	    prefixed with a single space and the pieces are concatenated.
	    """
	    soup = get_soup( url )
	    # Identity checks: `== None` would dispatch to the tag's own __eq__.
	    if soup is None:
	        return None
	    main = get_main( soup )
	    if main is None:
	        return 'No main tag found.'
	    return ''.join( ' ' + tag.get_text().strip() for tag in main.find_all( find_direct_text ) )

	if __name__ == '__main__':
	    # Manual smoke test: extract and print the content of one sample page.
	    url = 'https://www.cms.gov/Medicare/Billing/ElectronicBillingEDITrans'
	    content = extract_content( url )
	    print( content )