Spaces: Sleeping
grapplerulrich committed
Commit · 151c2dd
1 Parent(s): 152531a

Add caching and save search results url and HTML

Files changed:
- .gitignore +2 -0
- beautiful_soup/app.py +46 -37
- beautiful_soup/test.py +104 -8
- main.py +59 -0
- requirements.txt +5 -90
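The commit introduces two flat-file caches: fetched HTML pages are stored under web-pages/, keyed by a UUIDv5 hash of the page URL, and Google search responses are stored under search-urls/, keyed by the slugified query (both directories are git-ignored below). A minimal sketch of the key scheme using the same libraries as the diff; the cache_paths helper is illustrative and not part of the commit:

    import uuid
    from slugify import slugify  # python-slugify, added to requirements.txt in this commit

    def cache_paths( url, query ):
        # Page cache: deterministic file name derived from the URL (same scheme as get_soup()).
        page_file = 'web-pages/' + uuid.uuid5( uuid.NAMESPACE_URL, url ).hex + '.html'
        # Search cache: one JSON file per query (same scheme as main.py).
        query_file = 'search-urls/' + slugify( query ) + '.json'
        return page_file, query_file

    print( cache_paths( 'https://example.com', 'electronic billing' ) )
    # -> ('web-pages/<32 hex chars>.html', 'search-urls/electronic-billing.json')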
.gitignore CHANGED
@@ -1,3 +1,5 @@
 /.venv
 .env
 __pycache__
+/search-urls
+/web-pages
beautiful_soup/app.py CHANGED
@@ -1,5 +1,7 @@
 from bs4 import BeautifulSoup
 import requests
+import uuid
+from os.path import exists
 
 '''
 - Error handing
@@ -12,56 +14,63 @@ import requests
 
 # Make request and get html content.
 def get_soup( url ):
-    # try:
-    #     request = requests.get(url)
-    # except:
-    #     print('Unable to retrieve content, skipping URL')
-    #     return
-
-    # if not request.ok:
-    #     print("Unable to retrieve content, skipping URL. Status code: {}".format( request.status_code ))
-    #     return
+    file_path = 'web-pages/' + uuid.uuid5( uuid.NAMESPACE_URL, url ).hex + '.html'
+    if ( exists( file_path ) ):
+        with open( file_path, 'r' ) as web_page:
+            html = web_page.read()
+    else:
+        try:
+            request = requests.get(url)
+        except:
+            print('Unable to retrieve content, skipping URL')
+            return
+        if not request.ok:
+            print( "Unable to retrieve content, skipping URL. Status code: {}".format( request.status_code ) )
+            return
+        if not request.content:
+            print(request.content)
+            return
+        html = request.content
+        with open( file_path, 'wb' ) as file:
+            file.write( html )
+
+    return BeautifulSoup(html, 'html.parser')
 
 # Extract content from main tag.
 def get_main( soup ):
     return soup.main
 
-def
+def get_deepest_divs( tag ):
+    # Get all the divs from within a tag.
+    return [div for div in tag.findAll('div') if not div.find('div')]
+
+def get_tag_text( tags ):
+    text = ''
+    for tag in tags:
+        print(tag.find_all('li'))
+        # text += [p.get_text() for p in tag.find_all('p)]
+    return text
 
-def
+def get_list_text( tags ):
+    list_items = []
+    for tag in tags:
+        list_items = tag.find_all(find_direct_text)
+    return list_items
+
+def find_direct_text( tag ):
+    return tag.name == 'li' or tag.name == 'p' or tag.name == 'h2' or tag.name == 'h3'
 
 def extract_content( url ):
     soup = get_soup( url )
+    if ( soup == None ):
+        return None
     main = get_main( soup )
-    # # Get all the divs from within the main tag.
-    # divs = soup.main.find_all('div')
-    # for div in divs:
-    #     # Get all of the divs that do not have further divs within.
-    #     no_child_div = len(div.find_all('div')) == 0
-    #     if no_child_div:
-    #         # Find all p tags in the div.
-    #         content += [p.get_text() for p in div.find_all('p')]
-    #         # Find all li in the div.
-    #         for li in div.find_all('li'):
-    #             #
-    #             content += ''.join(li.find_all(text=True, recursive=False))
-    #     content += ''.join(div.find_all(text=True, recursive=False))
-    # return content
+    if ( main == None ):
+        return 'No main tag found.'
+    return ''.join([' ' + tag.get_text().strip() for tag in main.find_all( find_direct_text )])
 
 if __name__ == '__main__':
     url = 'https://www.cms.gov/Medicare/Billing/ElectronicBillingEDITrans'
     print(extract_content(url))
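As a usage note, get_soup() writes its cache with open( file_path, 'wb' ) but never creates the web-pages/ directory, so the (git-ignored) directory must exist before the first uncached fetch. A minimal sketch of calling the extractor on its own, assuming the package layout of this repo:

    from pathlib import Path
    from beautiful_soup.app import extract_content  # same import main.py uses

    # The cache directory is git-ignored and not created by the code itself.
    Path('web-pages').mkdir(exist_ok=True)

    text = extract_content('https://www.cms.gov/Medicare/Billing/ElectronicBillingEDITrans')
    print(text if text else 'Request failed, nothing extracted.')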
beautiful_soup/test.py CHANGED
@@ -1,15 +1,13 @@
 import unittest
 from bs4 import BeautifulSoup
-import
+import beautiful_soup
 
 class BeautifulSoupTest(unittest.TestCase):
-    def test_beautiful_soup(self):
-        self.assertTrue(True)
 
-    def
-        html = '''
+    def setUp(self):
+        self.html = '''
         <html>
-        <head
+        <head></head>
         <body>
         <main>
         <div>
@@ -31,8 +29,106 @@ class BeautifulSoupTest(unittest.TestCase):
         </body>
         </html>
         '''
 
+    def test_main_tag(self):
+        soup = BeautifulSoup( self.html, 'html.parser' )
+        self.assertEqual( beautiful_soup.get_main( soup ).name, 'main' )
+
+        soup = BeautifulSoup( "", 'html.parser' )
+        self.assertEqual( beautiful_soup.get_main( soup ).name, 'main' )
+
+    def test_has_no_div_childre(self):
+        childless = '''
+            <html>
+            <body>
+                <div><p>Text in div.</p></div>
+            </body>
+            </html>
+        '''
+        soup = BeautifulSoup( childless, 'html.parser' )
+        # self.assertFalse( beautiful_soup.has_no_div_children( soup.body ) )
+        # self.assertTrue( beautiful_soup.has_no_div_children( soup.body.div ) )
+
+        nested_div = '''
+            <html>
+            <body>
+                <div>
+                    <div>Text in paragraph.</div>
+                </div>
+            </body>
+            </html>
+        '''
+        soup = BeautifulSoup( nested_div, 'html.parser' )
+        # self.assertFalse( beautiful_soup.has_no_div_children( soup.body.div ) )
+
+    def test_get_deepest_divs(self):
+        nested_div = '''
+            <html>
+            <body>
+                <div>
+                    <div><p>Text in paragraph.</p></div>
+                </div>
+            </body>
+            </html>
+        '''
+        soup = BeautifulSoup( nested_div, 'html.parser' )
+        self.assertEqual( beautiful_soup.get_deepest_divs( soup.body )[0].text, 'Text in paragraph.' )
+
+    def test_list(self):
+        nested_div = '''
+            <html>
+            <body>
+                <div>
+                    <ul>
+                        <li>Text in list.</li>
+                        <li><a href"">Link in list.</a></li>
+                        <li>Text with <a href"">Link</a> in list.</li>
+                    </ul>
+                </div>
+            </body>
+            </html>
+        '''
+        soup = BeautifulSoup( nested_div, 'html.parser' )
+        divs = beautiful_soup.get_deepest_divs( soup.body )
+        # self.assertEqual( beautiful_soup.get_list_text( divs )[0], 'Text in list.' )
+
+    def test_exlcude_links(self):
+        nested_div = '''
+            <li><a href='somelink'>I DONT WANT THIS</a></li>
+            <li>blablalba <a href='both'>I WANT THIS</a> blalba</li>
+            <li><a href='right'>I WANT THIS</a> blalba</li>
+            <li>blablalba <a href='left'>I WANT THIS</a></li>
+
+            <p><a href='somelink'>I WANT THIS</a></p>
+            <p>blablalba <a href='both'>I WANT THIS</a> blalba</p>
+            <p><a href='right'>I WANT THIS</a> blalba</p>
+            <p>blablalba <a href='left'>I WANT THIS</a></p>
+        '''
+        soup = BeautifulSoup( nested_div, 'html.parser' )
+
+        list_items = soup.find_all(beautiful_soup.find_direct_text)
+        results = [
+            'blablalba I WANT THIS blalba',
+            'I WANT THIS blalba',
+            'blablalba I WANT THIS',
+            'I WANT THIS',
+            'blablalba I WANT THIS blalba',
+            'I WANT THIS blalba',
+            'blablalba I WANT THIS'
+        ]
+
+        print(list_items)
+        # for item in list_items:
+        #     print('item.get_text(): ' + item.get_text())
+
+        # help(list_items)
+        for i, item in enumerate(list_items):
+            self.assertEqual( item.get_text(), results[i] )
+
+        # self.assertEqual( list_items[0].get_text(), 'blablalba I WANT THIS blalba' )
+        # self.assertEqual( list_items[1].get_text(), 'I WANT THI Sblalba' )
+        # self.assertEqual( list_items[2].get_text(), 'blablalba I WANT THIS' )
 
 if __name__ == '__main__':
     unittest.main()
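The test module relies on the standard unittest runner, so it can be driven programmatically or from the command line; note that it does `import beautiful_soup` and calls e.g. beautiful_soup.get_main, which assumes those helpers are exposed at package level rather than only in beautiful_soup.app. A minimal invocation sketch, run from the repository root:

    import unittest

    # Discover test.py inside the beautiful_soup package and run it verbosely.
    suite = unittest.defaultTestLoader.discover('beautiful_soup', pattern='test.py')
    unittest.TextTestRunner(verbosity=2).run(suite)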
main.py ADDED
@@ -0,0 +1,59 @@
+import streamlit as st
+from dotenv import load_dotenv
+from googleapiclient.discovery import build
+from functools import cache
+from slugify import slugify
+from os import getenv
+from os.path import exists
+import json
+
+from beautiful_soup.app import extract_content
+
+@cache
+def google_search( query ):
+    api_key = getenv('GOOGLE_SEARCH_API_KEY')
+    # cx = os.getenv('GOOGLE_SEARCH_ENGIN_ID')
+    service = build(
+        "customsearch",
+        "v1",
+        developerKey=api_key,
+        cache_discovery=False
+    )
+
+    return service.cse().list(
+        q=query,
+        cx='05048cc2df6134a06',
+    ).execute()
+
+def main():
+    load_dotenv()
+    st.title('Google Search')
+    query = st.text_input('Search query')
+
+    if ( query ):
+        file_path = 'search-urls/' + slugify( query ) + '.json'
+
+        if ( exists( file_path ) ):
+            with open( file_path, 'r' ) as results_file:
+                results = json.load(results_file)
+        else:
+            search_result = google_search( query )
+            if( int( search_result['searchInformation']['totalResults'] ) > 0 ):
+                results = search_result['items']
+                with open( file_path, 'w' ) as results_file:
+                    json.dump( results, results_file )
+            else:
+                results = []
+
+        if ( len( results ) == 0 ) :
+            st.write( 'No results found.' )
+
+        try:
+            for item in results:
+                st.write(item['link'])
+                st.write(extract_content( item['link'] ))
+        except Exception as e:
+            st.exception(e)
+
+if __name__ == '__main__':
+    main()
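To run the app, main() expects GOOGLE_SEARCH_API_KEY to be loadable via python-dotenv (the .env file is git-ignored) and writes search results into search-urls/, which it does not create itself; the UI is then started with `streamlit run main.py`. A small pre-flight sketch, assuming a .env file in the project root:

    from dotenv import load_dotenv
    from os import getenv
    from pathlib import Path

    load_dotenv()  # reads .env from the working directory
    assert getenv('GOOGLE_SEARCH_API_KEY'), 'GOOGLE_SEARCH_API_KEY missing from .env'

    # Cache directories are git-ignored, so create them before the first run.
    for cache_dir in ('search-urls', 'web-pages'):
        Path(cache_dir).mkdir(exist_ok=True)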
requirements.txt CHANGED
@@ -1,90 +1,5 @@
-attrs==21.4.0
-backcall==0.2.0
-beautifulsoup4==4.10.0
-bleach==4.1.0
-blinker==1.4
-cachetools==5.0.0
-certifi==2021.10.8
-cffi==1.15.0
-charset-normalizer==2.0.12
-click==8.0.4
-debugpy==1.6.0
-decorator==5.1.1
-defusedxml==0.7.1
-entrypoints==0.4
-executing==0.8.3
-gitdb==4.0.9
-GitPython==3.1.27
-idna==3.3
-importlib-metadata==4.11.3
-ipykernel==6.11.0
-ipython==8.2.0
-ipython-genutils==0.2.0
-ipywidgets==7.7.0
-jedi==0.18.1
-Jinja2==3.1.1
-jsonschema==4.4.0
-jupyter-client==7.2.1
-jupyter-core==4.9.2
-jupyterlab-pygments==0.1.2
-jupyterlab-widgets==1.1.0
-MarkupSafe==2.1.1
-matplotlib-inline==0.1.3
-mistune==0.8.4
-nbclient==0.5.13
-nbconvert==6.4.5
-nbformat==5.2.0
-nest-asyncio==1.5.4
-notebook==6.4.10
-numpy==1.22.3
-packaging==21.3
-pandas==1.4.1
-pandocfilters==1.5.0
-parso==0.8.3
-pexpect==4.8.0
-pickleshare==0.7.5
-Pillow==9.0.1
-prometheus-client==0.13.1
-prompt-toolkit==3.0.28
-protobuf==3.19.4
-psutil==5.9.0
-ptyprocess==0.7.0
-pure-eval==0.2.2
-pyarrow==7.0.0
-pycparser==2.21
-pydeck==0.7.1
-Pygments==2.11.2
-Pympler==1.0.1
-pyparsing==3.0.7
-pyrsistent==0.18.1
-python-dateutil==2.8.2
-pytz==2022.1
-pytz-deprecation-shim==0.1.0.post0
-pyzmq==22.3.0
-requests==2.27.1
-semver==2.13.0
-Send2Trash==1.8.0
-six==1.16.0
-smmap==5.0.0
-soupsieve==2.3.1
-stack-data==0.2.0
-streamlit==1.8.1
-terminado==0.13.3
-testpath==0.6.0
-toml==0.10.2
-toolz==0.11.2
-tornado==6.1
-traitlets==5.1.1
-tzdata==2022.1
-tzlocal==4.1
-urllib3==1.26.9
-validators==0.18.2
-wcwidth==0.2.5
-webencodings==0.5.1
-widgetsnbextension==3.6.0
-zipp==3.7.0
+streamlit
+google
+python-dotenv
+beautifulsoup4
+python-slugify