Upload 3 files

- app.py +30 -0
- fastapi_server.py +40 -0
- requirements.txt +78 -0
app.py
ADDED
@@ -0,0 +1,30 @@
# app.py
import streamlit as st
import requests

# Streamlit UI
def main():
    st.title("Web Data Scraper")

    # Get the URL from the user
    url_input = st.text_input("Enter the URL of the web page:", "")

    if st.button("Scrape Visible Text"):
        if url_input:
            # Make a GET request to the FastAPI server; passing the URL via
            # params ensures it is properly URL-encoded
            response = requests.get("http://localhost:8000/", params={"url": url_input})
            if response.status_code == 200:
                data = response.json()
                if "scraped_text" in data:
                    st.success("Visible text successfully scraped!")
                    st.subheader("Scraped Text:")
                    st.write(data["scraped_text"])
                else:
                    st.warning("Failed to scrape visible text from the URL.")
            else:
                st.warning("Failed to connect to the FastAPI server.")
        else:
            st.warning("Please enter a valid URL.")

if __name__ == "__main__":
    main()
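The Streamlit front end above expects the FastAPI service from fastapi_server.py to be reachable on http://localhost:8000. Locally, the two processes are typically started separately, for example with "uvicorn fastapi_server:app --port 8000" and "streamlit run app.py" from the same directory; how the Space itself wires the two processes together is not shown in this commit.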
fastapi_server.py
ADDED
@@ -0,0 +1,40 @@
# fastapi_server.py
from fastapi import FastAPI
import requests
from bs4 import BeautifulSoup
import re

app = FastAPI()

# Function to scrape only visible text from the given URL
def scrape_visible_text_from_url(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Capture the header content before non-visible tags are stripped out,
        # otherwise the <header> element would already be gone
        header_content = soup.find("header")
        header_text = header_content.get_text() if header_content else ""

        # Remove script, style, and other non-visible tags
        for tag in soup(["script", "style", "meta", "link", "noscript", "header", "footer", "aside", "nav", "img"]):
            tag.extract()

        # Get the paragraph content
        paragraph_content = soup.find_all("p")
        paragraph_text = " ".join([p.get_text() for p in paragraph_content])

        # Combine header and paragraph text
        visible_text = f"{header_text}\n\n{paragraph_text}"

        # Collapse multiple whitespaces and newlines
        visible_text = re.sub(r'\s+', ' ', visible_text)
        return visible_text.strip()
    except Exception as e:
        return str(e)

@app.get("/")
async def root(url: str):
    data = scrape_visible_text_from_url(url)
    return {"scraped_text": data}
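For reference, a minimal sketch (not part of this commit) of calling the endpoint directly once the FastAPI server is running on localhost:8000; the file name and the target URL are only examples.

# check_endpoint.py - hypothetical helper; assumes the server above is running locally
import requests

resp = requests.get("http://localhost:8000/", params={"url": "https://example.com"})
resp.raise_for_status()
print(resp.json()["scraped_text"])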
requirements.txt
ADDED
@@ -0,0 +1,78 @@
aiohttp==3.8.5
aiosignal==1.3.1
altair==5.0.1
annotated-types==0.5.0
anyio==3.7.1
async-timeout==4.0.2
attrs==23.1.0
beautifulsoup4==4.12.2
blinker==1.6.2
bs4==0.0.1
cachetools==5.3.1
certifi==2023.7.22
charset-normalizer==3.2.0
click==8.1.6
decorator==5.1.1
exceptiongroup==1.1.2
fastapi==0.100.1
Flask==2.3.2
frozenlist==1.4.0
gitdb==4.0.10
GitPython==3.1.32
h11==0.14.0
httptools==0.6.0
idna==3.4
importlib-metadata==6.8.0
itsdangerous==2.1.2
Jinja2==3.1.2
jsonschema==4.18.4
jsonschema-specifications==2023.7.1
markdown-it-py==3.0.0
MarkupSafe==2.1.3
mdurl==0.1.2
multidict==6.0.4
numpy==1.25.2
openai==0.27.8
packaging==23.1
pandas==2.0.3
Pillow==9.5.0
protobuf==4.23.4
pyarrow==12.0.1
pydantic==2.1.1
pydantic_core==2.4.0
pydeck==0.8.0
Pygments==2.15.1
Pympler==1.0.1
python-dateutil==2.8.2
python-dotenv==1.0.0
pytz==2023.3
pytz-deprecation-shim==0.1.0.post0
PyYAML==6.0.1
referencing==0.30.0
requests==2.31.0
rich==13.5.2
rpds-py==0.9.2
six==1.16.0
smmap==5.0.0
sniffio==1.3.0
soupsieve==2.4.1
starlette==0.27.0
streamlit==1.25.0
tenacity==8.2.2
toml==0.10.2
toolz==0.12.0
tornado==6.3.2
tqdm==4.65.0
typing_extensions==4.7.1
tzdata==2023.3
tzlocal==4.3.1
urllib3==2.0.4
uvicorn==0.23.2
uvloop==0.17.0
validators==0.20.0
watchdog==3.0.0
watchfiles==0.19.0
websockets==11.0.3
Werkzeug==2.3.6
yarl==1.9.2
zipp==3.16.2