File size: 3,726 Bytes
53dbd29
 
 
 
3ef921d
 
 
53dbd29
3ef921d
53dbd29
3ef921d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53dbd29
 
3ef921d
53dbd29
 
 
3ef921d
 
 
 
 
 
 
 
 
 
bedee8a
3ef921d
 
 
53dbd29
3ef921d
 
53dbd29
3ef921d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53dbd29
3ef921d
53dbd29
 
 
 
 
00214e2
53dbd29
00214e2
53dbd29
3ef921d
 
 
 
 
 
 
 
00214e2
53dbd29
3ef921d
 
 
 
 
 
 
53dbd29
 
3ef921d
53dbd29
 
 
 
 
 
 
3ef921d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
import streamlit as st
import requests
from bs4 import BeautifulSoup
import re
from requests.sessions import Session
from langdetect import detect
from googletrans import Translator

def scrape_visible_text_from_url(url, query_selector=None, email=None, password=None, login_url=None):
    """Fetch *url* and return its visible text content, translated to English.

    Optionally performs a form login first (POSTing *email*/*password* to
    *login_url*) so the session carries auth cookies, and optionally limits
    extraction to the elements matched by a CSS *query_selector*.

    Args:
        url: Page to scrape.
        query_selector: Optional CSS selector restricting which elements are read.
        email: Optional login email; used only together with password and login_url.
        password: Optional login password.
        login_url: Optional URL the login form should be POSTed to.

    Returns:
        The whitespace-normalized, English-translated text, or None on any
        error (the error is reported to the user via st.error).
    """
    try:
        session = Session()

        # Authenticate first if credentials are provided, so the session
        # holds the login cookies before the target page is requested.
        if email and password and login_url:
            login_data = {
                'email': email,
                'password': password
                # Include other necessary fields as required by the website
            }
            login_response = session.post(login_url, data=login_data)
            login_response.raise_for_status()

        # BUG FIX: always fetch the target URL. The original returned the
        # *login page's* content whenever credentials were supplied and
        # never requested `url` at all.
        response = session.get(url)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')

        # BUG FIX: capture header text BEFORE stripping tags — the original
        # removed "header" elements first, so header_text was always "".
        header_tag = soup.find("header")
        header_text = header_tag.get_text() if header_tag else ""

        # Remove tags that never contribute visible page text.
        for tag in soup(["script", "style", "meta", "link", "noscript", "header", "footer", "aside", "nav", "img"]):
            tag.extract()

        # Use query selector if provided
        if query_selector:
            elements = soup.select(query_selector)
            text_content = " ".join(element.get_text() for element in elements)
        else:
            # BUG FIX: iterate actual <p> elements. Iterating `soup.body`
            # walks only its direct children (including bare strings) and
            # raises TypeError when the document has no <body>.
            paragraph_text = " ".join(p.get_text() for p in soup.find_all("p"))
            text_content = f"{header_text}\n\n{paragraph_text}"

        # Collapse every run of whitespace to a single space.
        visible_text = re.sub(r'\s+', ' ', text_content).strip()

        # Translate non-English sentences to English, best-effort: if
        # detection or translation fails for a sentence (e.g. langdetect
        # raising on empty/ambiguous text), keep the original sentence.
        translator = Translator()
        sentences = re.split(r'(?<=[.!?]) +', visible_text)
        translated_sentences = []
        for sentence in sentences:
            try:
                if detect(sentence) != 'en':
                    translated_sentences.append(translator.translate(sentence, dest='en').text)
                else:
                    translated_sentences.append(sentence)
            except Exception:
                translated_sentences.append(sentence)

        return ' '.join(translated_sentences)
    except Exception as e:
        st.error(f"Error occurred while scraping the data: {e}")
        return None

def main():
    """Streamlit UI: collect a URL (plus optional CSS selector and login
    credentials) and display the scraped, translated text."""
    # BUG FIX: the original titles contained mojibake ("๐ŸŒ", "๐Ÿ’ฟ") —
    # UTF-8 emoji bytes mis-decoded as Thai characters; restored to the
    # intended 🌐 and 💿 emoji.
    st.title("🌐 Web Data Scraper")

    url_input = st.text_input("Enter the URL :", "")

    query_selector = st.text_input("Enter a query selector (optional):", "")

    email = st.text_input("Email (if authentication required):", "")

    password = st.text_input("Password (if authentication required):", "", type="password")

    login_url = st.text_input("Enter the login URL (if authentication required):", "")

    if st.button("💿 Load Data"):
        if url_input:
            # Empty text inputs are passed as None so the scraper can
            # distinguish "not provided" from an empty string.
            data = scrape_visible_text_from_url(
                url=url_input,
                query_selector=query_selector if query_selector else None,
                email=email if email else None,
                password=password if password else None,
                login_url=login_url if login_url else None
            )
            if data:
                st.success("Data text successfully scraped!")
                st.subheader("Scraped Text:")
                st.write(data)
            else:
                st.warning("Failed to load data from the URL.")
        else:
            st.warning("Please enter a valid URL.")

# Script entry point: run the Streamlit app when executed directly.
if __name__ == "__main__":
    main()