grapplerulrich committed
Commit: 1f95777 · 1 parent: 151c2dd
Better exception handeling

Files changed:
- .gitignore (+1, -1)
- beautiful_soup/app.py (+38, -25)
- main.py (+40, -30)
.gitignore
CHANGED
@@ -1,5 +1,5 @@
 /.venv
 .env
 __pycache__
-/search-
+/search-results
 /web-pages
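
Note: the new /search-results entry matches the JSON cache directory that the search_results() helper added in main.py below writes to; /web-pages likewise holds the raw HTML that get_soup() caches per URL.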
beautiful_soup/app.py
CHANGED
@@ -1,7 +1,7 @@
-from bs4 import BeautifulSoup
-import requests
 import uuid
 from os.path import exists
+from bs4 import BeautifulSoup
+import requests
 
 '''
 - Error handing
@@ -14,26 +14,18 @@ from os.path import exists
 
 # Make request and get html content.
 def get_soup( url ):
-
-
     file_path = 'web-pages/' + uuid.uuid5( uuid.NAMESPACE_URL, url ).hex + '.html'
-    if
+    if exists( file_path ):
         with open( file_path, 'r' ) as web_page:
             html = web_page.read()
     else:
-
-
-
-
-
-
-
-        return
-        if not request.content:
-            print(request.content)
-            return
-        html = request.content
-        with open( file_path, 'wb' ) as file:
+        headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.74 Safari/537.36'}
+        response = requests.get( url, headers=headers )
+        response.raise_for_status()
+        if not response.text:
+            raise Exception('HTML empty.')
+        html = response.text
+        with open( file_path, 'w' ) as file:
             file.write( html )
 
     return BeautifulSoup(html, 'html.parser')
@@ -60,16 +52,37 @@ def get_list_text( tags ):
     return list_items
 
 def find_direct_text( tag ):
-    return tag.name == 'li' or tag.name == 'p' or tag.name == 'h2' or tag.name == 'h3'
+    return tag.name == 'li' or tag.name == 'p' or tag.name == 'h2' or tag.name == 'h3' or tag.name == 'span' or find_div_text( tag )
+
+def find_div_text( tag ):
+    return tag.name == 'div' or tag.find( text=True, recursive=False ) and tag.find( text=True, recursive=False ).strip()
+
+def get_tags_text( soup ):
+    text = ''
+    tags = soup.find_all( find_direct_text )
+    for tag in tags:
+        if tag.name == 'div' and tag.find( text=True, recursive=False ) :
+            for div in tag.find_all(text=True, recursive=False):
+                text += div.get_text().strip() + ' '
+        else :
+            text += tag.get_text().strip() + ' '
+    return text
 
 def extract_content( url ):
-
-
-
+    try :
+        soup = get_soup( url )
+    except Exception as exception:
+        raise exception
+
+    if soup is None:
+        raise Exception('No content found.')
+
     main = get_main( soup )
-    if
-
-
+    if main is None :
+        raise Exception('No main tag found.')
+
+    return get_tags_text( main )
+    # return ''.join([' ' + tag.get_text().strip() for tag in main.find_all( find_direct_text )])
 
 if __name__ == '__main__':
     url = 'https://www.cms.gov/Medicare/Billing/ElectronicBillingEDITrans'
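
Two behavioral notes on the revised get_soup(): response.raise_for_status() raises requests.exceptions.HTTPError on 4xx/5xx responses, which extract_content() now lets propagate, and the cache key is uuid.uuid5( uuid.NAMESPACE_URL, url ).hex, a deterministic name-based UUID, so every run maps the same URL to the same file under web-pages/. A minimal sketch of the key derivation (standalone, standard library only):

import uuid

# uuid5 is a name-based (SHA-1) UUID: the same URL always yields the same
# hex digest, so a second call to get_soup() re-reads the cached file.
url = 'https://www.cms.gov/Medicare/Billing/ElectronicBillingEDITrans'
print(uuid.uuid5(uuid.NAMESPACE_URL, url).hex + '.html')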
main.py
CHANGED
@@ -1,23 +1,26 @@
+
+from os import getenv
+from os.path import exists
+from functools import cache
+import json
 import streamlit as st
 from dotenv import load_dotenv
 from googleapiclient.discovery import build
-from functools import cache
 from slugify import slugify
-from os import getenv
-from os.path import exists
-import json
 
 from beautiful_soup.app import extract_content
 
 @cache
-def
+def google_search_api_request( query ):
+    load_dotenv()
     api_key = getenv('GOOGLE_SEARCH_API_KEY')
-    # cx = os.getenv('
+    # cx = os.getenv('GOOGLE_SEARCH_ENGINE_ID')
     service = build(
         "customsearch",
         "v1",
         developerKey=api_key,
-        cache_discovery=False
+        cache_discovery=False,
+        num=5
     )
 
     return service.cse().list(
@@ -25,35 +28,42 @@ def google_search( query ):
         cx='05048cc2df6134a06',
     ).execute()
 
+def search_results( query ):
+    file_path = 'search-results/' + slugify( query ) + '.json'
+
+    results = []
+    if exists( file_path ):
+        with open( file_path, 'r' ) as results_file:
+            results = json.load( results_file )
+    else:
+        search_result = google_search_api_request( query )
+        if ( int( search_result['searchInformation']['totalResults'] ) > 0 ):
+            results = search_result['items']
+            with open( file_path, 'w' ) as results_file:
+                json.dump( results, results_file )
+
+    if ( len( results ) == 0 ) :
+        raise Exception('No results found.')
+
+    return results
+
 def main():
-    load_dotenv()
     st.title('Google Search')
     query = st.text_input('Search query')
 
-    if
-
-
-
-
-            results = json.load(results_file)
-    else:
-        search_result = google_search( query )
-        if( int( search_result['searchInformation']['totalResults'] ) > 0 ):
-            results = search_result['items']
-            with open( file_path, 'w' ) as results_file:
-                json.dump( results, results_file )
-        else:
-            results = []
+    if query :
+        try:
+            results = search_results( query )
+        except Exception as exception:
+            st.exception(exception)
 
-
-    st.write(
+        for result in results:
+            st.write(result['link'])
+            try:
+                st.write( extract_content( result['link'] ) )
+            except Exception as exception:
+                st.exception(exception)
 
-    try:
-        for item in results:
-            st.write(item['link'])
-            st.write(extract_content( item['link'] ))
-    except Exception as e:
-        st.exception(e)
 
 if __name__ == '__main__':
     main()
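
For reviewers, a hedged smoke test of the new flow outside Streamlit. This is a sketch, not part of the commit; it assumes GOOGLE_SEARCH_API_KEY is provided via .env and that the git-ignored search-results/ and web-pages/ directories exist, and the query string is a made-up example.

# Hypothetical check: exercises the search_results() JSON cache and the
# exceptions raised by get_soup()/extract_content() on bad pages.
from main import search_results
from beautiful_soup.app import extract_content

for result in search_results('medicare electronic billing'):
    print(result['link'])
    try:
        print(extract_content(result['link'])[:200])
    except Exception as exception:
        print(exception)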