raccoon / beautiful_soup /beautiful_soup.py
grapplerulrich's picture
Save strings as json instead of text
history blame
5.59 kB
import uuid
import json
from os import makedirs, remove
from os.path import exists, dirname
from bs4 import BeautifulSoup
import requests
- Error handling
- Look if alternative to main tag is needed. Provide error message if main tag is not found.
- Menus are li tags with a tags within.
- li tags with text and tags should be exported
- Find divs that have text or p tags maybe other tags like divs
- Export the text
def get_url_content( url ):
    """Return the list of text strings for ``url``, cached on disk as JSON.

    The cache file lives in ``page-content/`` and is named after the UUIDv5
    (URL namespace) of ``url``. When the cache file exists its content is
    returned as-is; otherwise the strings are extracted with
    ``extract_strings`` and written to the cache before being returned.

    Any exception raised by ``extract_strings`` propagates to the caller and
    no cache file is written in that case.
    """
    cache_path = 'page-content/' + uuid.uuid5( uuid.NAMESPACE_URL, url ).hex + '.json'
    makedirs( dirname( cache_path ), exist_ok=True )

    if exists( cache_path ):
        with open( cache_path, 'r' ) as cache_file:
            # Load from the opened handle; the path string has no read().
            return json.load( cache_file )

    strings = extract_strings( url )
    with open( cache_path, 'w' ) as cache_file:
        json.dump( strings, cache_file )
    return strings
def extract_strings( url ):
    """Fetch ``url`` and return the text strings found in its main content.

    The page is loaded with ``get_soup``, ``<script>`` and ``<style>``
    elements are removed, the main content container is located with
    ``get_main_content`` and its text is collected with ``get_tags_text``.

    Raises:
        Exception: 'No HTML content found.' when ``get_soup`` returns None,
            'No main content found.' when no content container matches, and
            'No text found.' when the container yields no strings.
        Exceptions raised by ``get_soup`` propagate unchanged.
    """
    soup = get_soup( url )
    if soup is None:
        raise Exception('No HTML content found.')

    # Remove scripts and styles so their source code is not exported as text.
    for script in soup(["script", "style"]):
        script.decompose()

    content = get_main_content( soup )
    if content is None:
        raise Exception('No main content found.')

    strings = get_tags_text( content )
    # get_tags_text returns a list, so test for emptiness rather than None.
    if not strings:
        raise Exception('No text found.')
    return strings
# Make request and get html content.
def get_soup( url ):
    """Return a BeautifulSoup document for ``url``, caching the raw HTML on disk.

    The HTML is stored in ``web-pages/`` under the UUIDv5 (URL namespace) of
    ``url``. When that file exists it is parsed directly; otherwise the page
    is downloaded, saved, and then parsed with the ``html.parser`` backend.

    Raises:
        requests.RequestException: when the request fails, times out, or the
            server answers with an HTTP error status.
        Exception: 'HTML empty.' when the response has no body text.
    """
    file_path = 'web-pages/' + uuid.uuid5( uuid.NAMESPACE_URL, url ).hex + '.html'
    makedirs( dirname( file_path ), exist_ok=True )

    if exists( file_path ):
        with open( file_path, 'r', encoding='utf-8' ) as web_page:
            html = web_page.read()
    else:
        headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.74 Safari/537.36'}
        # Without a timeout a stalled server would block the caller forever.
        response = requests.get( url, headers=headers, timeout=30 )
        # Do not cache error pages (404, 500, ...) as if they were content.
        response.raise_for_status()
        html = response.text
        if not html:
            raise Exception('HTML empty.')
        # Explicit encoding: the platform default may not cover the page text.
        with open( file_path, 'w', encoding='utf-8' ) as web_page:
            web_page.write( html )

    return BeautifulSoup( html, 'html.parser' )
def get_main_content( soup ):
    """Return the element that most likely holds the page's main content.

    Candidates are tried in a fixed order and the first match wins:
    ``<div>`` elements with a known content class, then ``<div>`` elements
    with a known id, then the ``<main>`` tag, then the first ``<article>``
    tag. A short message naming the match is printed.

    Returns:
        The matching element, or None when nothing matches.
    """
    # (attribute, value) pairs looked up on <div> elements, in priority order.
    div_candidates = (
        ( 'class', 'post-body' ),
        ( 'class', 'article-content' ),
        ( 'class', 'blog-post-content' ),
        ( 'class', 'region-content' ),
        ( 'class', 'entry-content' ),
        ( 'class', 'region--content' ),
        ( 'class', 'article' ),
        ( 'class', 'article-inner_html' ),
        ( 'id', 'bmdDetail-Content' ),
        ( 'id', 'main' ),
    )
    for attribute, value in div_candidates:
        content = soup.find( "div", { attribute: value } )
        if content is not None:
            # The message names the selector that actually matched.
            print( f'Has .{value} {attribute}.' )
            return content

    content = soup.main
    if content is not None:
        print('Has main tag.')
        return content

    content = soup.find( "article" )
    if content is not None:
        print('Has article tag.')
        return content

    return None
def get_tags_text( soup ):
    """Collect the non-empty text of every allowed tag below ``soup``.

    Tags are selected with the ``allowed_tags`` filter. For a ``div`` only
    its direct text nodes are used; for every other tag the full text is
    used, with the pieces joined by single spaces. Text is stripped and
    empty results are skipped for all tags.

    Returns:
        A list of strings in the order the tags are found; the list is empty
        when no tag contains text.
    """
    text = []
    for tag in soup.find_all( allowed_tags ):
        if tag.name == 'div':
            # Direct text nodes only; text of nested tags is picked up when
            # those tags are visited themselves.
            # NOTE(review): newer Beautiful Soup releases call this keyword
            # `string`; `text` is kept to match the rest of the file.
            pieces = [
                node.get_text( ' ', strip=True )
                for node in tag.find_all( text=True, recursive=False )
            ]
        else:
            pieces = [ tag.get_text( ' ', strip=True ) ]
        # Skip empty strings for every tag type, not just for divs.
        text.extend( piece for piece in pieces if piece )
    return text
def allowed_tags( tag ):
    """Return True when ``tag`` is an element type whose text gets exported.

    Intended as a filter function for ``find_all``: list items, paragraphs,
    first- to third-level headings, spans and divs are accepted.
    """
    return tag.name in ( 'li', 'p', 'h1', 'h2', 'h3', 'span', 'div' )
# -------------------------------------- #
# Extract content from main tag.
def get_main( soup ):
    """Return the ``main`` attribute of ``soup`` (its ``<main>`` element)."""
    main_tag = soup.main
    return main_tag
def get_deepest_divs( tag ):
    """Return the innermost ``div`` elements below ``tag``.

    A div is kept only when it contains no further div, so wrapper divs are
    dropped and the leaf divs are returned in document order.
    """
    # find_all is the current name of the method; findAll is a legacy alias.
    return [ div for div in tag.find_all( 'div' ) if not div.find( 'div' ) ]
def get_tag_text( tags ):
    """Return the text of every tag in ``tags`` concatenated into one string.

    An empty ``tags`` iterable yields an empty string.
    """
    # NOTE(review): the loop body was not visible in the reviewed source (only
    # a malformed, commented-out line about <p> tags). Concatenating each
    # tag's own text is the assumed intent - confirm against the original.
    return ''.join( tag.get_text() for tag in tags )
def get_list_text( tags ):
    """Return the elements matched by ``find_direct_text`` below all ``tags``.

    The matches of every tag are accumulated in order, so the result covers
    all of ``tags`` rather than only the last one. An empty ``tags``
    iterable yields an empty list.
    """
    # NOTE(review): find_direct_text is not defined in the visible part of
    # this file - confirm it exists (find_div_text may have been intended).
    list_items = []
    for tag in tags:
        # Extend instead of rebinding, which discarded earlier tags' matches.
        list_items.extend( tag.find_all( find_direct_text ) )
    return list_items
def find_div_text( tag ):
    """Filter for ``div`` elements that directly contain non-blank text.

    The result is meant to be used for its truthiness: it is False for
    non-div tags, the falsy lookup result when the div has no direct text
    node, and otherwise the stripped first direct text node (an empty string
    when that node is only whitespace).
    """
    if tag.name != 'div':
        return False
    direct_text = tag.find( text=True, recursive=False )
    if not direct_text:
        return direct_text
    return direct_text.strip()
if __name__ == '__main__':
url = 'https://www.cms.gov/Medicare/Billing/ElectronicBillingEDITrans'