shreyasiv committed
Commit f0779c6 · 1 Parent(s): c1a162b

Upload 3 files

Files changed (3):
  1. app.py +34 -0
  2. fastapi_server.py +45 -0
  3. requirements.txt +78 -0
app.py ADDED
@@ -0,0 +1,34 @@
+ # app.py
+ import streamlit as st
+ import requests
+
+ # Streamlit UI
+ def main():
+     st.title("Web Data Scraper")
+
+     # Get the URL from the user
+     url_input = st.text_input("Enter the URL of the web page:", "")
+
+     if st.button("Scrape Visible Text"):
+         if url_input:
+             # Pass the URL as a query parameter so requests URL-encodes it
+             try:
+                 response = requests.get("http://localhost:8000/", params={"url": url_input}, timeout=30)
+             except requests.RequestException:
+                 st.warning("Failed to connect to the FastAPI server.")
+                 return
+             if response.status_code == 200:
+                 data = response.json()
+                 if "scraped_text" in data:
+                     st.success("Visible text successfully scraped!")
+                     st.subheader("Scraped Text:")
+                     st.write(data["scraped_text"])
+                 else:
+                     st.warning("Failed to scrape visible text from the URL.")
+             else:
+                 st.warning("The FastAPI server returned an error response.")
+         else:
+             st.warning("Please enter a valid URL.")
+
+ if __name__ == "__main__":
+     main()
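
Note: for quick testing outside Streamlit, the same request the UI makes can be wrapped in a small helper. The sketch below is not part of this commit; BACKEND_URL and fetch_scraped_text are hypothetical names introduced only for illustration, and it assumes the FastAPI server is already running locally (e.g. started with: uvicorn fastapi_server:app --port 8000).

# A minimal sketch (not part of this commit): call the scraper endpoint
# directly. BACKEND_URL and fetch_scraped_text are hypothetical names.
import os
import requests

BACKEND_URL = os.environ.get("BACKEND_URL", "http://localhost:8000/")

def fetch_scraped_text(url):
    """Call the scraper endpoint; return its text, or None on any failure."""
    try:
        response = requests.get(BACKEND_URL, params={"url": url}, timeout=30)
        response.raise_for_status()
        return response.json().get("scraped_text")
    except requests.RequestException:
        return None

if __name__ == "__main__":
    print(fetch_scraped_text("https://example.com"))

Passing the target URL through params lets requests percent-encode it, which is why the Streamlit client above does the same instead of interpolating the URL into the query string.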
fastapi_server.py ADDED
@@ -0,0 +1,45 @@
+ # fastapi_server.py
+ from fastapi import FastAPI
+ import requests
+ from bs4 import BeautifulSoup
+ import re
+
+ app = FastAPI()
+
+ # Function to scrape only visible text from the given URL
+ def scrape_visible_text_from_url(url):
+     try:
+         response = requests.get(url, timeout=30)
+         response.raise_for_status()
+         soup = BeautifulSoup(response.content, 'html.parser')
+
+         # Remove script, style, and other non-visible tags ("header" is
+         # kept in the tree for now so its text can be captured below)
+         for tag in soup(["script", "style", "meta", "link", "noscript", "footer", "aside", "nav", "img"]):
+             tag.extract()
+
+         # Get the header content, then drop it from the tree so its
+         # paragraphs are not collected a second time below
+         header_content = soup.find("header")
+         header_text = header_content.get_text() if header_content else ""
+         if header_content:
+             header_content.extract()
+
+         # Get the paragraph content
+         paragraph_content = soup.find_all("p")
+         paragraph_text = " ".join([p.get_text() for p in paragraph_content])
+
+         # Combine header and paragraph text
+         visible_text = f"{header_text}\n\n{paragraph_text}"
+
+         # Collapse runs of whitespace and newlines into single spaces
+         visible_text = re.sub(r'\s+', ' ', visible_text)
+         return visible_text.strip()
+     except Exception as e:
+         # Return the error message as the response payload instead of raising
+         return str(e)
+
+ @app.get("/")
+ async def root(url: str):
+     data = scrape_visible_text_from_url(url)
+     return {"scraped_text": data}
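
Note: the endpoint can be sanity-checked in-process with FastAPI's TestClient, without starting uvicorn. A minimal sketch, not part of this commit; it assumes network access (the handler fetches the target URL itself) and that httpx, which Starlette's TestClient depends on, is installed; httpx is not pinned in requirements.txt below.

# A minimal sketch (not part of this commit): exercise the endpoint in-process.
from fastapi.testclient import TestClient
from fastapi_server import app

client = TestClient(app)

response = client.get("/", params={"url": "https://example.com"})
assert response.status_code == 200
print(response.json()["scraped_text"][:200])  # preview the first 200 characters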
requirements.txt ADDED
@@ -0,0 +1,78 @@
+ aiohttp==3.8.5
+ aiosignal==1.3.1
+ altair==5.0.1
+ annotated-types==0.5.0
+ anyio==3.7.1
+ async-timeout==4.0.2
+ attrs==23.1.0
+ beautifulsoup4==4.12.2
+ blinker==1.6.2
+ bs4==0.0.1
+ cachetools==5.3.1
+ certifi==2023.7.22
+ charset-normalizer==3.2.0
+ click==8.1.6
+ decorator==5.1.1
+ exceptiongroup==1.1.2
+ fastapi==0.100.1
+ Flask==2.3.2
+ frozenlist==1.4.0
+ gitdb==4.0.10
+ GitPython==3.1.32
+ h11==0.14.0
+ httptools==0.6.0
+ idna==3.4
+ importlib-metadata==6.8.0
+ itsdangerous==2.1.2
+ Jinja2==3.1.2
+ jsonschema==4.18.4
+ jsonschema-specifications==2023.7.1
+ markdown-it-py==3.0.0
+ MarkupSafe==2.1.3
+ mdurl==0.1.2
+ multidict==6.0.4
+ numpy==1.25.2
+ openai==0.27.8
+ packaging==23.1
+ pandas==2.0.3
+ Pillow==9.5.0
+ protobuf==4.23.4
+ pyarrow==12.0.1
+ pydantic==2.1.1
+ pydantic_core==2.4.0
+ pydeck==0.8.0
+ Pygments==2.15.1
+ Pympler==1.0.1
+ python-dateutil==2.8.2
+ python-dotenv==1.0.0
+ pytz==2023.3
+ pytz-deprecation-shim==0.1.0.post0
+ PyYAML==6.0.1
+ referencing==0.30.0
+ requests==2.31.0
+ rich==13.5.2
+ rpds-py==0.9.2
+ six==1.16.0
+ smmap==5.0.0
+ sniffio==1.3.0
+ soupsieve==2.4.1
+ starlette==0.27.0
+ streamlit==1.25.0
+ tenacity==8.2.2
+ toml==0.10.2
+ toolz==0.12.0
+ tornado==6.3.2
+ tqdm==4.65.0
+ typing_extensions==4.7.1
+ tzdata==2023.3
+ tzlocal==4.3.1
+ urllib3==2.0.4
+ uvicorn==0.23.2
+ uvloop==0.17.0
+ validators==0.20.0
+ watchdog==3.0.0
+ watchfiles==0.19.0
+ websockets==11.0.3
+ Werkzeug==2.3.6
+ yarl==1.9.2
+ zipp==3.16.2