grapplerulrich committed · commit 561abab · 1 parent: 9c1234d

Exclude pdf from search results

- reorder beautiful soup code
- add extra content classes / id

Files changed:
- beautiful_soup/app.py (+62, -52)
- main.py (+31, -23)
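The exclusion itself is a query rewrite: 'filetype:' is a standard Google search operator, and a leading minus negates it, so appending ' -filetype:pdf' keeps PDF links out of the Custom Search results. A minimal sketch of the same idea outside Streamlit, assuming the API key lives in a GOOGLE_SEARCH_API_KEY environment variable (the variable name and the helper are illustrative; the cx value is the one main.py uses):

import os
from googleapiclient.discovery import build

def search_without_pdfs( query ):
    # Append the negated filetype operator before sending the query.
    query = query + ' -filetype:pdf'
    service = build(
        'customsearch',
        'v1',
        developerKey=os.environ['GOOGLE_SEARCH_API_KEY'],  # assumed env var name
        cache_discovery=False,
    )
    response = service.cse().list( q=query, cx='05048cc2df6134a06' ).execute()
    return [ item['link'] for item in response.get( 'items', [] ) ]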
beautiful_soup/app.py
CHANGED
@@ -12,6 +12,36 @@ import requests
     Export the text
 '''
 
+def get_url_content( url ):
+    file_path = 'page-content/' + uuid.uuid5( uuid.NAMESPACE_URL, url ).hex + '.txt'
+    if exists( file_path ):
+        with open( file_path, 'r' ) as file_content:
+            content = file_content.read()
+    else:
+        try:
+            content = extract_content( url )
+        except Exception as exception:
+            raise exception
+        with open( file_path, 'w' ) as file:
+            file.write( content )
+
+    return content
+
+def extract_content( url ):
+    try :
+        soup = get_soup( url )
+    except Exception as exception:
+        raise exception
+
+    if soup is None:
+        raise Exception('No HTML content found.')
+
+    content = get_main_content( soup )
+    if content is None :
+        raise Exception('No main content found.')
+
+    return get_tags_text( content )
+
 # Make request and get html content.
 def get_soup( url ):
     file_path = 'web-pages/' + uuid.uuid5( uuid.NAMESPACE_URL, url ).hex + '.html'
@@ -31,16 +61,7 @@ def get_soup( url ):
 
     return BeautifulSoup(html, 'html.parser')
 
-# Extract content from main tag.
-def get_main( soup ):
-    return soup.main
-
 def get_main_content( soup ):
-    content = soup.main
-
-    if content is not None:
-        print('Has main tag.')
-        return content
 
     content = soup.find( "div", { "class": "post-body" } )
     if content is not None:
@@ -72,6 +93,21 @@ def get_main_content( soup ):
         print('Has .article-inner_html class.')
         return content
 
+    content = soup.find( "div", { "id": "bmdDetail-Content" } )
+    if content is not None:
+        print('Has .bmdDetail-Content id.')
+        return content
+
+    content = soup.find( "div", { "id": "main" } )
+    if content is not None:
+        print('Has #main id.')
+        return content
+
+    content = soup.main
+    if content is not None:
+        print('Has main tag.')
+        return content
+
     content = soup.find( "article" )
     if content is not None:
         print('Has article tag.')
@@ -79,6 +115,23 @@ def get_main_content( soup ):
 
     return None
 
+def get_tags_text( soup ):
+    text = ''
+    tags = soup.find_all( find_direct_text )
+    for tag in tags:
+        if tag.name == 'div' and tag.find( text=True, recursive=False ) :
+            for div in tag.find_all(text=True, recursive=False):
+                text += div.get_text().strip() + ' '
+        else :
+            text += tag.get_text().strip() + ' '
+    return text
+
+# -------------------------------------- #
+
+# Extract content from main tag.
+def get_main( soup ):
+    return soup.main
+
 def get_deepest_divs( tag ):
     # Get all the divs from within a tag.
     return [div for div in tag.findAll('div') if not div.find('div')]
@@ -102,49 +155,6 @@ def find_direct_text( tag ):
 def find_div_text( tag ):
     return tag.name == 'div' or tag.find( text=True, recursive=False ) and tag.find( text=True, recursive=False ).strip()
 
-def get_tags_text( soup ):
-    text = ''
-    tags = soup.find_all( find_direct_text )
-    for tag in tags:
-        if tag.name == 'div' and tag.find( text=True, recursive=False ) :
-            for div in tag.find_all(text=True, recursive=False):
-                text += div.get_text().strip() + ' '
-        else :
-            text += tag.get_text().strip() + ' '
-    return text
-
-def extract_content( url ):
-    try :
-        soup = get_soup( url )
-    except Exception as exception:
-        raise exception
-
-    if soup is None:
-        raise Exception('No HTML content found.')
-
-    content = get_main_content( soup )
-    if content is None :
-        # content = soup.body
-        raise Exception('No main content found.')
-
-    return get_tags_text( content )
-    # return ''.join([' ' + tag.get_text().strip() for tag in main.find_all( find_direct_text )])
-
-def get_url_content( url ):
-    file_path = 'page-content/' + uuid.uuid5( uuid.NAMESPACE_URL, url ).hex + '.txt'
-    if exists( file_path ):
-        with open( file_path, 'r' ) as file_content:
-            content = file_content.read()
-    else:
-        try:
-            content = extract_content( url )
-        except Exception as exception:
-            raise exception
-        with open( file_path, 'w' ) as file:
-            file.write( content )
-
-    return content
-
 if __name__ == '__main__':
     url = 'https://www.cms.gov/Medicare/Billing/ElectronicBillingEDITrans'
     print(extract_content(url))
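A note on the caching pattern shared by get_soup and get_url_content: uuid.uuid5( uuid.NAMESPACE_URL, url ) is a deterministic, SHA-1-based UUID, so the same URL always maps to the same file name and a second run reuses the on-disk copy instead of re-fetching. A quick illustration, using the URL from the module's own __main__ block:

import uuid

url = 'https://www.cms.gov/Medicare/Billing/ElectronicBillingEDITrans'
# uuid5 is a pure function of namespace + name, unlike the random uuid4.
key_one = uuid.uuid5( uuid.NAMESPACE_URL, url ).hex
key_two = uuid.uuid5( uuid.NAMESPACE_URL, url ).hex
assert key_one == key_two
print( 'web-pages/' + key_one + '.html' )  # same cache path on every run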
main.py
CHANGED
@@ -24,6 +24,9 @@ def google_search_api_request( query ):
         cache_discovery=False
     )
 
+    # Exclude PDFs from search results.
+    query = query + ' -filetype:pdf'
+
     return service.cse().list(
         q=query,
         cx='05048cc2df6134a06',
@@ -52,36 +55,41 @@ def search_results( query ):
 def main():
     st.title('Google Search')
     query = st.text_input('Search query')
-
     if query :
-        try:
-            results = search_results( query )
-        except Exception as exception:
-            st.exception(exception)
-
-        for result in results:
-            st.write(result['link'])
+        with st.spinner('Loading search results...'):
+            try:
+                results = search_results( query )
+            except Exception as exception:
+                st.exception(exception)
+                return
+
+        for result in results:
+            url_id = uuid.uuid5( uuid.NAMESPACE_URL, result['link'] ).hex
+            st.write(result['link'])
+            st.write(url_id)
 
             try:
+                content = get_url_content( result['link'] )
             except Exception as exception:
                 st.exception(exception)
+                continue
 
-            file_path = 'summaries/' + uuid.uuid5( uuid.NAMESPACE_URL, result['link'] ).hex + '.json'
+            file_path = 'summaries/' + url_id + '.json'
             if exists( file_path ):
                 with open( file_path, 'r' ) as file:
                     summary = json.load( file )
             else:
                 try:
+                    summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
                     summary = summarizer(content, max_length=130, min_length=30, do_sample=False, truncation=True)
                 except Exception as exception:
+                    raise exception
+                with open( file_path, 'w' ) as file:
+                    json.dump( summary, file )
+
+            for sentence in summary:
+                st.write(sentence['summary_text'])
 
 if __name__ == '__main__':
     main()
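For reference, the summarization pipeline added above returns a list of dicts keyed by 'summary_text', which is why main.py can both iterate over the result and round-trip it through json.dump / json.load. A standalone sketch of the same call, assuming the transformers package is installed (the input text here is a placeholder):

from transformers import pipeline

summarizer = pipeline( 'summarization', model='sshleifer/distilbart-cnn-12-6' )
content = 'Placeholder article text; in main.py this comes from get_url_content.'
summary = summarizer( content, max_length=130, min_length=30, do_sample=False, truncation=True )
# summary looks like: [ { 'summary_text': '...' } ]
for sentence in summary:
    print( sentence['summary_text'] )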