import os
import uuid
from os.path import exists

import requests
from bs4 import BeautifulSoup

'''
TODO:
- Error handling.
- Check whether an alternative to the main tag is needed; report an error when no main tag is found.
- Menus are li tags with a tags nested inside.
- li tags that contain both text and tags should be exported.
- Find divs that contain text or p tags, and possibly other tags such as nested divs.
- Export the text.
'''
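
# Overall flow, for orientation: get_url_content(url) caches the result of
# extract_content(url), which fetches HTML via get_soup(url), picks a
# container with get_main_content(soup), and flattens it with get_tags_text().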


def get_url_content(url):
    # Return cached page text; on a cache miss, extract and store it.
    file_path = 'page-content/' + uuid.uuid5(uuid.NAMESPACE_URL, url).hex + '.txt'
    if exists(file_path):
        with open(file_path, 'r') as file_content:
            content = file_content.read()
    else:
        content = extract_content(url)
        os.makedirs('page-content', exist_ok=True)
        with open(file_path, 'w') as file:
            file.write(content.strip())
    return content
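
# A quick usage sketch (hypothetical URL): the first call downloads and
# extracts; later calls read page-content/<uuid5-hex>.txt from disk.
#   text = get_url_content('https://example.com/article')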


def extract_content(url):
    # Fetch a page and reduce it to plain text, failing loudly at each step.
    soup = get_soup(url)
    if soup is None:
        raise Exception('No HTML content found.')
    content = get_main_content(soup)
    if content is None:
        raise Exception('No main content found.')
    text = get_tags_text(content)
    if text is None:
        raise Exception('No text found.')
    return text


# Make the request and cache the raw HTML content.
def get_soup(url):
    file_path = 'web-pages/' + uuid.uuid5(uuid.NAMESPACE_URL, url).hex + '.html'
    print(file_path)
    if exists(file_path):
        with open(file_path, 'r') as web_page:
            html = web_page.read()
    else:
        headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.74 Safari/537.36'}
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        if not response.text:
            raise Exception('HTML empty.')
        html = response.text
        os.makedirs('web-pages', exist_ok=True)
        with open(file_path, 'w') as file:
            file.write(html)
    return BeautifulSoup(html, 'html.parser')
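
# Note: uuid5 is name-based (SHA-1 over a namespace plus the URL), so the
# same URL always hashes to the same file name; that determinism is what
# makes both the HTML cache above and the text cache in get_url_content work.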


def get_main_content(soup):
    # Try a list of known content containers, most specific first.
    for class_name in ['post-body', 'article-content', 'entry-content',
                       'region--content', 'article', 'article-inner_html']:
        content = soup.find('div', {'class': class_name})
        if content is not None:
            print('Has .' + class_name + ' class.')
            return content
    for id_name in ['bmdDetail-Content', 'main']:
        content = soup.find('div', {'id': id_name})
        if content is not None:
            print('Has #' + id_name + ' id.')
            return content
    content = soup.main
    if content is not None:
        print('Has main tag.')
        return content
    content = soup.find('article')
    if content is not None:
        print('Has article tag.')
        return content
    return None
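
# Supporting a new site is then a one-line change: append its container
# class or id (e.g. a hypothetical 'page-content') to the lists above.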


def get_tags_text(soup):
    # Concatenate the text of every matched tag. For divs, keep only their
    # direct strings so text inside nested matched tags is not emitted twice.
    text = ''
    tags = soup.find_all(find_direct_text)
    for tag in tags:
        if tag.name == 'div' and tag.find(text=True, recursive=False):
            for direct_string in tag.find_all(text=True, recursive=False):
                text += direct_string.get_text().strip() + ' '
        else:
            text += tag.get_text().strip() + ' '
    return text
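
# Illustration of the direct-text rule above: given <div>intro <p>body</p></div>,
# find_all(text=True, recursive=False) on the div yields only 'intro ', while
# the nested <p> is matched separately by find_direct_text, so 'body' appears once.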


# -------------------------------------- #
# Experimental helpers for extracting content from the main tag.


def get_main(soup):
    return soup.main


def get_deepest_divs(tag):
    # Get the divs that have no further divs nested inside them.
    return [div for div in tag.find_all('div') if not div.find('div')]


def get_tag_text(tags):
    text = ''
    for tag in tags:
        print(tag.find_all('li'))
        # text += ' '.join(p.get_text() for p in tag.find_all('p'))
    return text


def get_list_text(tags):
    list_items = []
    for tag in tags:
        # Collect matches across all tags instead of overwriting each pass.
        list_items.extend(tag.find_all(find_direct_text))
    return list_items


def find_direct_text(tag):
    # Tags whose text should be exported directly.
    return tag.name in ('li', 'p', 'h2', 'h3', 'span') or find_div_text(tag)


def find_div_text(tag):
    # A div qualifies only when it carries direct, non-whitespace text of its
    # own ('and', not 'or': otherwise every div on the page would match).
    direct_text = tag.find(text=True, recursive=False)
    return tag.name == 'div' and direct_text is not None and bool(direct_text.strip())
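

# A minimal sketch for the menu TODO at the top of the file (hypothetical
# helper, not wired in anywhere yet): treat an li as a menu item when it
# links somewhere but carries no direct text of its own.
def is_menu_item(tag):
    direct_text = ''.join(tag.find_all(text=True, recursive=False)).strip()
    return tag.name == 'li' and tag.find('a') is not None and not direct_text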


if __name__ == '__main__':
    url = 'https://www.cms.gov/Medicare/Billing/ElectronicBillingEDITrans'
    print(extract_content(url))
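    # To exercise the disk caches end to end, one could call
    # get_url_content(url) here instead of extract_content(url).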