import uuid
import json
from os import makedirs
from os.path import exists, dirname
from bs4 import BeautifulSoup
import requests

'''
TODO:
- Error handling.
- Check whether an alternative to the main tag is needed; report an error when no main tag is found.
- Menus are li tags with a tags nested inside.
- li tags that contain both text and tags should be exported.
- Find divs that contain text or p tags, and possibly other tags such as nested divs.
- Export the text.
'''

# Get an array of strings from a page, based on its URL.
def get_url_content( url ):
    # Derive a stable cache filename from the URL.
    file_path = 'page-content/' + uuid.uuid5( uuid.NAMESPACE_URL, url ).hex + '.json'
    # Create the directory if it doesn't exist.
    makedirs( dirname( file_path ), exist_ok=True )
    # If a cache file exists, get the content from the cache.
    if exists( file_path ):
        with open( file_path, 'r' ) as file:
            strings = json.load( file )
    else:
        strings = extract_strings( url )
        # Write the strings to the cache.
        with open( file_path, 'w' ) as file:
            json.dump( strings, file )
    return strings
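
# Example (hypothetical URL): uuid5 is deterministic, so a second call with the
# same URL is served from the JSON cache instead of being fetched again:
#   get_url_content( 'https://example.com/blog/post' )  # extracts and caches
#   get_url_content( 'https://example.com/blog/post' )  # reads page-content/<uuid5>.json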

# Extract text from a page, based on its URL.
def extract_strings( url ):
    # Parse the HTML content using BeautifulSoup.
    soup = get_soup( url )
    if soup is None:
        raise Exception('No HTML content found.')
    # Remove scripts and styles.
    for script in soup( ["script", "style"] ):
        script.decompose()
    # Get the main content of the HTML page.
    content = get_main_content( soup )
    if content is None:
        raise Exception('No main content found.')
    # Extract strings from the main content, based on the allowed tags.
    strings = get_tags_text( content )
    if strings is None:
        raise Exception('No text found.')
    return strings

# Make the request and get the HTML content.
def get_soup( url ):
    file_path = 'web-pages/' + uuid.uuid5( uuid.NAMESPACE_URL, url ).hex + '.html'
    makedirs( dirname( file_path ), exist_ok=True )
    # If a cache file exists, get the content from the cache.
    if exists( file_path ):
        with open( file_path, 'r' ) as web_page:
            html = web_page.read()
    else:
        # Add a user-agent header to make the request look more like a browser.
        headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.74 Safari/537.36'}
        response = requests.get( url, headers=headers )
        # Raise an exception if the response is not 200.
        response.raise_for_status()
        if not response.text:
            raise Exception('HTML empty.')
        html = response.text
        # Save the HTML to the cache.
        with open( file_path, 'w' ) as file:
            file.write( html )
    return BeautifulSoup( html, 'html.parser' )

# Find the main content of the HTML page, based on a list of rules.
def get_main_content( soup ):
    # Known content containers, checked in order; the first match wins.
    selectors = [
        ( "div", { "class": "post-body" } ),
        ( "div", { "class": "article-content" } ),
        ( "div", { "class": "blog-post-content" } ),
        ( "div", { "class": "region-content" } ),
        ( "div", { "class": "entry-content" } ),
        ( "div", { "class": "region--content" } ),
        ( "div", { "class": "article" } ),
        ( "div", { "class": "article-inner_html" } ),
        ( "div", { "id": "bmdDetail-Content" } ),
        ( "div", { "id": "main" } ),
    ]
    for name, attrs in selectors:
        content = soup.find( name, attrs )
        if content is not None:
            attribute, value = next( iter( attrs.items() ) )
            print( 'Has .' + value + ' ' + attribute + '.' )
            return content
    # Fall back to semantic tags, then the whole body.
    for name in ( "main", "article", "body" ):
        content = soup.find( name )
        if content is not None:
            print( 'Has ' + name + ' tag.' )
            return content
    return None

# Extract text from the allowed tags.
def get_tags_text( soup ):
    text = []
    # Find all tags that are allowed.
    tags = soup.find_all( allowed_tags )
    # Loop through the tags and extract their text.
    for tag in tags:
        # For div tags, only take text nodes that sit directly inside the div,
        # so that text within nested tags isn't collected twice.
        if tag.name == 'div':
            for div in tag.find_all( string=True, recursive=False ):
                found_text = div.get_text( ' ', strip=True )
                if found_text != '':
                    found_text = found_text.replace( '\n', ' ' )
                    found_text = found_text.replace( '\r', ' ' )
                    text.append( found_text )
        else:
            found_text = tag.get_text( ' ', strip=True )
            if found_text != '':
                found_text = found_text.replace( '\n', ' ' )
                found_text = found_text.replace( '\r', ' ' )
                text.append( found_text )
    return text
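
# Quick sanity check for get_tags_text (hypothetical snippet, not a real page):
#   soup = BeautifulSoup( '<div><p>Hello</p><span>world</span></div>', 'html.parser' )
#   get_tags_text( soup )  # ['Hello', 'world'] - the div has no direct text
#                          # nodes, so only the p and span contribute.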

# Predicate for the tags that are allowed.
def allowed_tags( tag ):
    return tag.name in ( 'li', 'p', 'h1', 'h2', 'h3', 'span', 'div' )

## To be deleted.
# -------------------------------------- #
# Extract content from the main tag.
def get_main( soup ):
    return soup.main

# Get all the deepest divs (divs that contain no other divs) from within a tag.
def get_deepest_divs( tag ):
    return [ div for div in tag.find_all( 'div' ) if not div.find( 'div' ) ]

def get_tag_text( tags ):
    text = ''
    for tag in tags:
        print( tag.find_all( 'li' ) )
        # text += [p.get_text() for p in tag.find_all('p')]
    return text
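
# Minimal usage sketch, assuming network access; the URL below is a placeholder,
# not a tested endpoint.
if __name__ == '__main__':
    test_url = 'https://example.com/blog/post'  # hypothetical URL
    try:
        for string in get_url_content( test_url ):
            print( string )
    except Exception as exception:
        print( 'Extraction failed: ' + str( exception ) )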