grapplerulrich committed: Add inline comments and fix batch summaries

Files changed:
- app.py (+85 -41)
- beautiful_soup/beautiful_soup.py (+21 -13)

app.py CHANGED
@@ -19,12 +19,10 @@ def google_search_api_request( query ):
     Request Google Search API with query and return results.
     """
 
-    api_key = st.secrets["google_search_api_key"]
-    cx = st.secrets["google_search_engine_id"]
     service = build(
         "customsearch",
         "v1",
-        developerKey=api_key,
+        developerKey=st.secrets["google_search_api_key"],
         cache_discovery=False
     )
 
@@ -33,7 +31,7 @@ def google_search_api_request( query ):
 
     return service.cse().list(
         q=query,
-        cx=cx,
+        cx=st.secrets["google_search_engine_id"],
         num=5,
         lr='lang_en', # lang_de
         fields='items(title,link),searchInformation(totalResults)'
@@ -46,15 +44,20 @@ def search_results( query ):
     """
     file_path = 'search-results/' + slugify( query ) + '.json'
 
-
+    # Create cache directory if it doesn't exist.
     makedirs(dirname(file_path), exist_ok=True)
+
+    results = []
+    # Check if cache file exists.
     if exists( file_path ):
         with open( file_path, 'r' ) as results_file:
             results = json.load( results_file )
     else:
         search_result = google_search_api_request( query )
+        # Check if search contains results.
         if int( search_result['searchInformation']['totalResults'] ) > 0:
             results = search_result['items']
+        # Save results to cache file.
         with open( file_path, 'w' ) as results_file:
             json.dump( results, results_file )
 
@@ -63,15 +66,21 @@ def search_results( query ):
 
     return results
 
-def get_summary( url_id, content ):
-    file_path = 'summaries/' + url_id + '.json'
+def get_summary( url, keywords ):
+    file_path = 'summaries/' + uuid.uuid5( uuid.NAMESPACE_URL, url ).hex + '.json'
+
+    # Create cache directory if it doesn't exist.
     makedirs(dirname(file_path), exist_ok=True)
+
+    # Check if cache file exists.
     if exists( file_path ):
         with open( file_path, 'r' ) as file:
             summary = json.load( file )
     else:
+        strings = get_url_content( url )
+        content = prep_chunks_summary( strings, keywords )
         summary = generate_summary( content )
-
+        # Save results to cache file.
         with open( file_path, 'w' ) as file:
             json.dump( summary, file )
 
@@ -95,11 +104,13 @@ def exception_notice( exception ):
     Helper function for exception notices.
     """
     query_params = st.experimental_get_query_params()
+    # If debug mode is enabled, show exception else show warning.
     if 'debug' in query_params.keys() and query_params['debug'][0] == 'true':
         st.exception(exception)
     else:
         st.warning(str(exception))
 
+# Unused function.
 def is_keyword_in_string( keywords, string ):
     """
     Checks if string contains keyword.
@@ -110,22 +121,29 @@ def is_keyword_in_string( keywords, string ):
     return False
 
 def filter_sentences_by_keywords( strings, keywords ):
+    """
+    Filter sentences by keywords using spacy.
+    """
     nlp = spacy.load("en_core_web_sm")
     matcher = PhraseMatcher(nlp.vocab)
-
-
+
+    # Add keywords to matcher.
+    patterns = [nlp(keyword) for keyword in keywords]
     matcher.add("QueryList", patterns)
 
     sentences = []
     for string in strings:
-        # Exclude
+        # Exclude sentences shorten than 5 words.
         string_length = len( string.split(' ') )
         if string_length < 5:
             continue
+
+        # Loop through sentences and check if any of the keywords are in the sentence.
         doc = nlp(string)
         for sentence in doc.sents:
             matches = matcher(nlp(sentence.text))
             for match_id, start, end in matches:
+                # If keyword is in sentence, add sentence to list.
                 if nlp.vocab.strings[match_id] in ["QueryList"]:
                     sentences.append(sentence.text)
 
@@ -138,15 +156,19 @@ def split_content_into_chunks( sentences ):
     chunk = ''
     word_count = 0
     chunks = []
+    # Loop through sentences and split into chunks.
     for sentence in sentences:
-
-
+        # Count words in sentence.
+        sentence_word_count = len(sentence.split(' '))
+        # If the word count plus the current sentence is larger then 512, start a new chunk.
+        if word_count + sentence_word_count > 512:
            st.write("Number of words(tokens): {}".format(word_count))
            chunks.append(chunk)
-            chunk = ''
-            word_count = 0
+            chunk = '' # Reset chunk.
+            word_count = 0 # Reset word count.
 
-
+        # Add sentence to chunk.
+        word_count += sentence_word_count
        chunk += sentence + ' '
 
    st.write("Number of words(tokens): {}".format(word_count))
@@ -154,6 +176,41 @@ def split_content_into_chunks( sentences ):
 
    return chunks
 
+def prep_chunks_summary( strings, keywords ):
+    """
+    Chunk summary.
+    """
+    try:
+        sentences = filter_sentences_by_keywords( strings, keywords )
+        chunks = split_content_into_chunks( sentences )
+
+        number_of_chunks = len( chunks )
+        # Loop through chunks if there are more than one.
+        if number_of_chunks > 1:
+            # Calculate the max summary length based on the number of chunks.
+            max_length = int( 512 / number_of_chunks )
+            st.write("Max length: {}".format(max_length))
+
+            content = ''
+            # Loop through chunks and generate summary.
+            for chunk in chunks:
+                chunk_length = len( chunk.split(' ') )
+                # If chunk is shorter than max length, divide chunk length by 2.
+                if chunk_length < max_length:
+                    max_length = int( chunk_length / 2 )
+
+                # Generate summary for chunk.
+                chunk_summary = generate_summary( chunk, max_length )
+                for summary in chunk_summary:
+                    content += summary['summary_text'] + ' '
+        else:
+            content = chunks[0]
+
+        return content
+
+    except Exception as exception:
+        exception_notice(exception)
+
 def main():
     st.title('Racoon Search')
     query = st.text_input('Search query')
@@ -167,9 +224,11 @@ def main():
        exception_notice(exception)
        return
 
+    # Count results.
    number_of_results = len( results )
    st.success( 'Found {} results for "{}".'.format( number_of_results, query ) )
 
+    # If debug mode is enabled, show search results in JSON.
    if 'debug' in query_params.keys() and query_params['debug'][0] == 'true':
        with st.expander("Search results JSON"):
            if st.button('Delete search result cache', key=query + 'cache'):
@@ -185,37 +244,22 @@ def main():
    for index, result in enumerate(results):
        with st.container():
            st.markdown('### ' + result['title'])
+            # Create a unique id for the result.
            url_id = uuid.uuid5( uuid.NAMESPACE_URL, result['link'] ).hex
-            try:
-                strings = get_url_content( result['link'] )
-                keywords = query.split(' ')
-                sentences = filter_sentences_by_keywords( strings, keywords )
-                chunks = split_content_into_chunks( sentences )
-
-                number_of_chunks = len( chunks )
-                if number_of_chunks > 1:
-                    max_length = int( 512 / len( chunks ) )
-                    st.write("Max length: {}".format(max_length))
-
-                    content = ''
-                    for chunk in chunks:
-                        chunk_length = len( chunk.split(' ') )
-                        chunk_max_length = 200
-                        if chunk_length < max_length:
-                            chunk_max_length = int( chunk_length / 2 )
-                        chunk_summary = generate_summary( chunk, min( max_length, chunk_max_length ) )
-                        for summary in chunk_summary:
-                            content += summary['summary_text'] + ' '
-                else:
-                    content = chunks[0]
-
-                summary = get_summary( url_id, content )
 
+            # List of query keywords.
+            keywords = query.split(' ')
+            try :
+                # Create summary of summarized content.
+                summary = get_summary( result['link'], keywords )
+                st.markdown(summary[0]['summary_text'])
            except Exception as exception:
                exception_notice(exception)
+                return
 
            progress_bar.progress( ( index + 1 ) / number_of_results )
 
+            # Show links and buttons.
            col1, col2, col3 = st.columns(3)
            with col1:
                st.markdown('[Website Link]({})'.format(result['link']))
@@ -229,7 +273,7 @@ def main():
                    remove( 'summaries/' + url_id + '.json' )
 
            st.markdown('---')
-
+
 
 if __name__ == '__main__':
     main()
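Note on the "fix batch summaries" part of this commit: the per-chunk length budgeting now lives in the new prep_chunks_summary() helper. Below is a minimal standalone sketch of that budgeting logic only, not code from the commit; it leaves out Streamlit and the summarisation model so it can run on its own, and the helper name chunk_summary_lengths is hypothetical.

# Standalone sketch (not from the commit): mirrors the max_length budgeting
# that prep_chunks_summary() applies before calling generate_summary().
def chunk_summary_lengths( chunks, token_budget=512 ):
    # Split the overall budget evenly across chunks.
    max_length = int( token_budget / len( chunks ) )
    lengths = []
    for chunk in chunks:
        chunk_length = len( chunk.split(' ') )
        # Short chunks get at most half their own length; as in the commit,
        # the reduced max_length carries over to later chunks.
        if chunk_length < max_length:
            max_length = int( chunk_length / 2 )
        lengths.append( max_length )
    return lengths

if __name__ == '__main__':
    chunks = [ 'short example chunk', 'a longer chunk of text ' * 40 ]
    print( chunk_summary_lengths( chunks ) )  # -> [1, 1]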
beautiful_soup/beautiful_soup.py CHANGED
@@ -14,9 +14,14 @@ import requests
 - Export the text
 '''
 
+# Get array of strings from page based off URL.
 def get_url_content( url ):
     file_path = 'page-content/' + uuid.uuid5( uuid.NAMESPACE_URL, url ).hex + '.json'
+
+    # Create directory if it doesn't exist.
     makedirs(dirname(file_path), exist_ok=True)
+
+    # If cache file exists get content from cache.
     if exists( file_path ):
         with open( file_path, 'r' ) as file:
             strings = json.load( file )
@@ -26,13 +31,16 @@ def get_url_content( url ):
     except Exception as exception:
         raise exception
 
+    # Write strings to cache.
     with open( file_path, 'w' ) as file:
         json.dump( strings, file )
 
     return strings
 
+# Extract text from page based off URL.
 def extract_strings( url ):
     try :
+        # Parse html content using BeautifulSoup.
         soup = get_soup( url )
     except Exception as exception:
         raise exception
@@ -44,10 +52,12 @@ def extract_strings( url ):
     for script in soup(["script", "style"]):
         script.decompose()
 
+    # Get main content of html page.
     content = get_main_content( soup )
     if content is None :
         raise Exception('No main content found.')
 
+    # Extract strings from main content based on allowed tags.
     strings = get_tags_text( content )
     if strings is None :
         raise Exception('No text found.')
@@ -57,21 +67,26 @@ def extract_strings( url ):
 def get_soup( url ):
     file_path = 'web-pages/' + uuid.uuid5( uuid.NAMESPACE_URL, url ).hex + '.html'
     makedirs(dirname(file_path), exist_ok=True)
+    # If cache file exists get content from cache.
     if exists( file_path ):
         with open( file_path, 'r' ) as web_page:
             html = web_page.read()
     else:
+        # Add user agent header to request to make request more realistic.
         headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.74 Safari/537.36'}
         response = requests.get( url, headers=headers )
+        # Raise exception if response is not 200.
         response.raise_for_status()
         if not response.text:
             raise Exception('HTML empty.')
         html = response.text
+        # Save html to cache.
         with open( file_path, 'w' ) as file:
             file.write( html )
 
     return BeautifulSoup(html, 'html.parser')
 
+# Find main content of html page based rules.
 def get_main_content( soup ):
 
     content = soup.find( "div", { "class": "post-body" } )
@@ -141,10 +156,14 @@ def get_main_content( soup ):
 
     return None
 
+# Extract text from allowed tags.
 def get_tags_text( soup ):
     text = []
+    # Find all tags that are allowed.
     tags = soup.find_all( allowed_tags )
+    # Loop through tags and extract text.
     for tag in tags:
+        # If div tag extract text from sub tags.
         if tag.name == 'div' :
             for div in tag.find_all(text=True, recursive=False):
                 found_text = div.get_text( ' ', strip=True )
@@ -156,9 +175,11 @@ def get_tags_text( soup ):
                 text.append( found_text )
     return text
 
+# List of allowed tags.
 def allowed_tags( tag ):
     return tag.name == 'li' or tag.name == 'p' or tag.name == 'h1' or tag.name == 'h2' or tag.name == 'h3' or tag.name == 'span' or tag.name == 'div'
 
+## To be deleted.
 # -------------------------------------- #
 
 # Extract content from main tag.
@@ -175,16 +196,3 @@ def get_tag_text( tags ):
         print(tag.find_all('li'))
         # text += [p.get_text() for p in tag.find_all('p)]
     return text
-
-def get_list_text( tags ):
-    list_items = []
-    for tag in tags:
-        list_items = tag.find_all(find_direct_text)
-    return list_items
-
-def find_div_text( tag ):
-    return tag.name == 'div' and tag.find( text=True, recursive=False ) and tag.find( text=True, recursive=False ).strip()
-
-if __name__ == '__main__':
-    url = 'https://www.cms.gov/Medicare/Billing/ElectronicBillingEDITrans'
-    print(extract_content(url))
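For context, get_soup() and get_url_content() both follow the same cache-then-fetch pattern: hash the URL into a file name with uuid.uuid5, read from disk if the file exists, otherwise fetch and write the result back. A minimal self-contained sketch of that pattern follows; it is not part of the commit, the function name fetch_html_cached is hypothetical, and the cache directory and User-Agent header simply mirror the committed code.

from os import makedirs
from os.path import dirname, exists
import uuid
import requests

# Minimal sketch of the fetch-and-cache pattern used by get_soup(); not part of the commit.
def fetch_html_cached( url, cache_dir='web-pages/' ):
    file_path = cache_dir + uuid.uuid5( uuid.NAMESPACE_URL, url ).hex + '.html'
    makedirs( dirname( file_path ), exist_ok=True )

    # Serve from cache when the page was fetched before.
    if exists( file_path ):
        with open( file_path, 'r' ) as cached:
            return cached.read()

    # Otherwise fetch, validate, and cache the HTML.
    headers = { 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.74 Safari/537.36' }
    response = requests.get( url, headers=headers )
    response.raise_for_status()
    if not response.text:
        raise Exception( 'HTML empty.' )

    with open( file_path, 'w' ) as cached:
        cached.write( response.text )
    return response.text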