Spaces:
Sleeping
Sleeping
Commit
·
1f86974
1
Parent(s):
b72fb4c
Save strings as json instead of text
Browse files
Allows for better processing of sentences
- README.md +1 -0
- beautiful_soup/beautiful_soup.py +17 -17
README.md
CHANGED
@@ -39,6 +39,7 @@ google_search_engine_id = "search-engine-id"
|
|
39 |
- [ ] Improve fetched content.
|
40 |
- [ ] Get some content from every search result.
|
41 |
- [ ] Find sentences that contain the search keywords.
|
|
|
42 |
- [ ] Summarization requires truncation. Find solution where not needed.
|
43 |
- [ ] Support German content.
|
44 |
- [ ] Improve queries to include more keywords (Expand abrivations & define context)
|
|
|
39 |
- [ ] Improve fetched content.
|
40 |
- [ ] Get some content from every search result.
|
41 |
- [ ] Find sentences that contain the search keywords.
|
42 |
+
- [ ] Div's with text & tags. Extract text from tags and then decompose the tags.
|
43 |
- [ ] Summarization requires truncation. Find solution where not needed.
|
44 |
- [ ] Support German content.
|
45 |
- [ ] Improve queries to include more keywords (Expand abrivations & define context)
|
beautiful_soup/beautiful_soup.py
CHANGED
@@ -1,4 +1,5 @@
|
|
1 |
import uuid
|
|
|
2 |
from os import makedirs, remove
|
3 |
from os.path import exists, dirname
|
4 |
from bs4 import BeautifulSoup
|
@@ -14,22 +15,22 @@ import requests
|
|
14 |
'''
|
15 |
|
16 |
def get_url_content( url ):
|
17 |
-
|
18 |
-
makedirs(dirname(
|
19 |
-
if exists(
|
20 |
-
with open(
|
21 |
-
|
22 |
else:
|
23 |
try:
|
24 |
-
|
25 |
except Exception as exception:
|
26 |
raise exception
|
27 |
-
with open(
|
28 |
-
|
29 |
|
30 |
-
return
|
31 |
|
32 |
-
def
|
33 |
try :
|
34 |
soup = get_soup( url )
|
35 |
except Exception as exception:
|
@@ -46,11 +47,10 @@ def extract_content( url ):
|
|
46 |
if content is None :
|
47 |
raise Exception('No main content found.')
|
48 |
|
49 |
-
|
50 |
-
if
|
51 |
raise Exception('No text found.')
|
52 |
-
|
53 |
-
return text
|
54 |
|
55 |
# Make request and get html content.
|
56 |
def get_soup( url ):
|
@@ -136,16 +136,16 @@ def get_main_content( soup ):
|
|
136 |
return None
|
137 |
|
138 |
def get_tags_text( soup ):
|
139 |
-
text =
|
140 |
tags = soup.find_all( allowed_tags )
|
141 |
for tag in tags:
|
142 |
if tag.name == 'div' :
|
143 |
for div in tag.find_all(text=True, recursive=False):
|
144 |
found_text = div.get_text( ' ', strip=True )
|
145 |
if found_text != '':
|
146 |
-
text
|
147 |
else :
|
148 |
-
text
|
149 |
return text
|
150 |
|
151 |
def allowed_tags( tag ):
|
|
|
1 |
import uuid
|
2 |
+
import json
|
3 |
from os import makedirs, remove
|
4 |
from os.path import exists, dirname
|
5 |
from bs4 import BeautifulSoup
|
|
|
15 |
'''
|
16 |
|
17 |
def get_url_content( url ):
|
18 |
+
file = 'page-content/' + uuid.uuid5( uuid.NAMESPACE_URL, url ).hex + '.json'
|
19 |
+
makedirs(dirname(file), exist_ok=True)
|
20 |
+
if exists( file ):
|
21 |
+
with open( file, 'r' ) as file_content:
|
22 |
+
strings = json.load( file )
|
23 |
else:
|
24 |
try:
|
25 |
+
strings = extract_strings( url )
|
26 |
except Exception as exception:
|
27 |
raise exception
|
28 |
+
with open( file, 'w' ) as file:
|
29 |
+
json.dump( strings, file )
|
30 |
|
31 |
+
return strings
|
32 |
|
33 |
+
def extract_strings( url ):
|
34 |
try :
|
35 |
soup = get_soup( url )
|
36 |
except Exception as exception:
|
|
|
47 |
if content is None :
|
48 |
raise Exception('No main content found.')
|
49 |
|
50 |
+
strings = get_tags_text( content )
|
51 |
+
if strings is None :
|
52 |
raise Exception('No text found.')
|
53 |
+
return strings
|
|
|
54 |
|
55 |
# Make request and get html content.
|
56 |
def get_soup( url ):
|
|
|
136 |
return None
|
137 |
|
138 |
def get_tags_text( soup ):
|
139 |
+
text = []
|
140 |
tags = soup.find_all( allowed_tags )
|
141 |
for tag in tags:
|
142 |
if tag.name == 'div' :
|
143 |
for div in tag.find_all(text=True, recursive=False):
|
144 |
found_text = div.get_text( ' ', strip=True )
|
145 |
if found_text != '':
|
146 |
+
text.append( found_text )
|
147 |
else :
|
148 |
+
text.append( tag.get_text( ' ', strip=True ))
|
149 |
return text
|
150 |
|
151 |
def allowed_tags( tag ):
|