Spaces:

grapplerulrich
/

raccoon

Sleeping

App Files Files

grapplerulrich committed on Apr 8, 2022

Commit

8b32433

1 Parent(s): 35d7624

initial version with small test for beautiful soup

Browse files

Files changed (5) hide show

.gitignore +1 -0
beautiful-soup/app.py +67 -0
beautiful-soup/requirements.txt +1 -0
beautiful-soup/test.py +38 -0
google-search/requirements.txt +1 -0

.gitignore CHANGED Viewed

@@ -1,2 +1,3 @@
 /.venv
 .env

 /.venv
 .env
+__pycache__

beautiful-soup/app.py ADDED Viewed

	@@ -0,0 +1,67 @@

+from bs4 import BeautifulSoup
+import requests
+'''
+ - Error handling
+ - Look if alternative to main tag is needed. Provide error message if main tag is not found.
+ - Menus are li tags with a tags within.
+ - li tags with text and tags should be exported
+ - Find divs that have text or p tags maybe other tags like divs
+ - Export the text
+'''
+# Make request and get html content.
+def get_soup( url ):
+    # try:
+        # request = requests.get(url)
+    # except:
+    #     print('Unable to retrieve content, skipping URL')
+    #    return
+    # if not request.ok:
+    #     print("Unable to retrieve content, skipping URL. Status code: {}".format( request.status_code ))
+    #     return
+    request = requests.get(url)
+    html = request.content
+    soup = BeautifulSoup(html, 'html.parser')
+    return soup
+# Extract content from main tag.
+def get_main( soup ):
+    return soup.main
+def is_childless( tag ):
+    return len( tag.find_all('div') ) == 0
+def get_divs( tag ):
+    # Get all the divs from within the main tag.
+    divs = tag.find_all('div')
+    return filter( is_childless, divs )
+def extract_content( url ):
+    soup = get_soup( url )
+    main = get_main( soup )
+    divs = get_divs( main )
+    return [p.get_text() for p in div.find_all('p')]
+#   # Get all the divs from within the main tag.
+#   divs = soup.main.find_all('div')
+#   for div in divs:
+#     # Get all of the divs that do not have further divs within.
+#     no_child_div = len(div.find_all('div')) == 0
+#     if no_child_div:
+#       # Find all p tags in the div.
+#       content += [p.get_text() for p in div.find_all('p')]
+#       # Find all li in the div.
+#       for li in div.find_all('li'):
+#         #
+#         content += ''.join(li.find_all(text=True, recursive=False))
+#       content += ''.join(div.find_all(text=True, recursive=False))
+#   return content
+if __name__ == '__main':
+  url = 'https://www.cms.gov/Medicare/Billing/ElectronicBillingEDITrans'
+  print(extract_content(url))

beautiful-soup/requirements.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ beautifulsoup4

beautiful-soup/test.py ADDED Viewed

	@@ -0,0 +1,38 @@

+import unittest
+from bs4 import BeautifulSoup
+import app
+class BeautifulSoupTest(unittest.TestCase):
+    def test_beautiful_soup(self):
+        self.assertTrue(True)
+    def test_main_tag(self):
+        html = '''
+        <html>
+            <head>  </head>
+            <body>
+                <main>
+                    <div>
+                        <ul>
+                            <li><a href="https://www.cms.gov/Medicare/Billing/ElectronicBillingEDITrans">Electronic Billing</a></li>
+                            <li><a href="https://www.cms.gov/Medicare/Billing/BillingFAQs">Billing FAQs</a></li>
+                        </ul>
+                    </div>
+                    <div>
+                        <div>
+                            <p>Paragraph</p>
+                            <ul>
+                                <li>List Item</li>
+                            </ul>
+                            Text within div
+                        </div>
+                    </div>
+                </main>
+            </body>
+        </html>
+        '''
+        soup = BeautifulSoup(html, 'html.parser')
+        self.assertEqual( app.get_main( soup ).name, 'main' )
+# Run every test case in this module when the file is executed directly.
+if __name__ == '__main__':
+    unittest.main()

google-search/requirements.txt CHANGED Viewed

@@ -1,3 +1,4 @@
 streamlit
 google
 python-dotenv

 streamlit
 google
 python-dotenv
+beautifulsoup4