Spaces:
Sleeping
Sleeping
Commit
·
1f86974
1
Parent(s):
b72fb4c
Save strings as json instead of text
Browse files
Allows for better processing of sentences
- README.md +1 -0
- beautiful_soup/beautiful_soup.py +17 -17
README.md
CHANGED
@@ -39,6 +39,7 @@ google_search_engine_id = "search-engine-id"
|
|
39 |
- [ ] Improve fetched content.
|
40 |
- [ ] Get some content from every search result.
|
41 |
- [ ] Find sentences that contain the search keywords.
|
|
|
42 |
- [ ] Summarization requires truncation. Find solution where not needed.
|
43 |
- [ ] Support German content.
|
44 |
- [ ] Improve queries to include more keywords (Expand abrivations & define context)
|
|
|
39 |
- [ ] Improve fetched content.
|
40 |
- [ ] Get some content from every search result.
|
41 |
- [ ] Find sentences that contain the search keywords.
|
42 |
+
- [ ] Div's with text & tags. Extract text from tags and then decompose the tags.
|
43 |
- [ ] Summarization requires truncation. Find solution where not needed.
|
44 |
- [ ] Support German content.
|
45 |
- [ ] Improve queries to include more keywords (Expand abrivations & define context)
|
beautiful_soup/beautiful_soup.py
CHANGED
@@ -1,4 +1,5 @@
|
|
1 |
import uuid
|
|
|
2 |
from os import makedirs, remove
|
3 |
from os.path import exists, dirname
|
4 |
from bs4 import BeautifulSoup
|
@@ -14,22 +15,22 @@ import requests
|
|
14 |
'''
|
15 |
|
16 |
def get_url_content( url ):
|
17 |
-
|
18 |
-
makedirs(dirname(
|
19 |
-
if exists(
|
20 |
-
with open(
|
21 |
-
|
22 |
else:
|
23 |
try:
|
24 |
-
|
25 |
except Exception as exception:
|
26 |
raise exception
|
27 |
-
with open(
|
28 |
-
|
29 |
|
30 |
-
return
|
31 |
|
32 |
-
def
|
33 |
try :
|
34 |
soup = get_soup( url )
|
35 |
except Exception as exception:
|
@@ -46,11 +47,10 @@ def extract_content( url ):
|
|
46 |
if content is None :
|
47 |
raise Exception('No main content found.')
|
48 |
|
49 |
-
|
50 |
-
if
|
51 |
raise Exception('No text found.')
|
52 |
-
|
53 |
-
return text
|
54 |
|
55 |
# Make request and get html content.
|
56 |
def get_soup( url ):
|
@@ -136,16 +136,16 @@ def get_main_content( soup ):
|
|
136 |
return None
|
137 |
|
138 |
def get_tags_text( soup ):
|
139 |
-
text =
|
140 |
tags = soup.find_all( allowed_tags )
|
141 |
for tag in tags:
|
142 |
if tag.name == 'div' :
|
143 |
for div in tag.find_all(text=True, recursive=False):
|
144 |
found_text = div.get_text( ' ', strip=True )
|
145 |
if found_text != '':
|
146 |
-
text
|
147 |
else :
|
148 |
-
text
|
149 |
return text
|
150 |
|
151 |
def allowed_tags( tag ):
|
|
|
1 |
import uuid
|
2 |
+
import json
|
3 |
from os import makedirs, remove
|
4 |
from os.path import exists, dirname
|
5 |
from bs4 import BeautifulSoup
|
|
|
15 |
'''
|
16 |
|
17 |
def get_url_content( url ):
|
18 |
+
file = 'page-content/' + uuid.uuid5( uuid.NAMESPACE_URL, url ).hex + '.json'
|
19 |
+
makedirs(dirname(file), exist_ok=True)
|
20 |
+
if exists( file ):
|
21 |
+
with open( file, 'r' ) as file_content:
|
22 |
+
strings = json.load( file )
|
23 |
else:
|
24 |
try:
|
25 |
+
strings = extract_strings( url )
|
26 |
except Exception as exception:
|
27 |
raise exception
|
28 |
+
with open( file, 'w' ) as file:
|
29 |
+
json.dump( strings, file )
|
30 |
|
31 |
+
return strings
|
32 |
|
33 |
+
def extract_strings( url ):
|
34 |
try :
|
35 |
soup = get_soup( url )
|
36 |
except Exception as exception:
|
|
|
47 |
if content is None :
|
48 |
raise Exception('No main content found.')
|
49 |
|
50 |
+
strings = get_tags_text( content )
|
51 |
+
if strings is None :
|
52 |
raise Exception('No text found.')
|
53 |
+
return strings
|
|
|
54 |
|
55 |
# Make request and get html content.
|
56 |
def get_soup( url ):
|
|
|
136 |
return None
|
137 |
|
138 |
def get_tags_text( soup ):
|
139 |
+
text = []
|
140 |
tags = soup.find_all( allowed_tags )
|
141 |
for tag in tags:
|
142 |
if tag.name == 'div' :
|
143 |
for div in tag.find_all(text=True, recursive=False):
|
144 |
found_text = div.get_text( ' ', strip=True )
|
145 |
if found_text != '':
|
146 |
+
text.append( found_text )
|
147 |
else :
|
148 |
+
text.append( tag.get_text( ' ', strip=True ))
|
149 |
return text
|
150 |
|
151 |
def allowed_tags( tag ):
|