Commit 5f2db56
1 Parent(s): 9fa8d4f
Better Scraping
app.py CHANGED
@@ -36,7 +36,7 @@ HEADERS = {
     "Referer": "https://www.google.com/"
 }
 
-#
+# **Extract Links**
 def get_valid_links(base_url):
     """Extracts all internal links, including footer and JavaScript-rendered links."""
     try:
@@ -58,7 +58,7 @@ def get_valid_links(base_url):
 
         return links
     except requests.exceptions.RequestException as e:
-        print(f"
+        print(f" Error fetching links: {e}")
         return set()
 
 def check_footer_links(soup):
@@ -91,10 +91,10 @@ def get_links_with_selenium(url):
 
         return links
     except Exception as e:
-        print(f"
+        print(f" Selenium Error: {e}")
         return set()
 
-#
+# **Scrape Pages**
 def scrape_page(url):
     """Scrapes a webpage, using Selenium if necessary."""
     try:
@@ -125,21 +125,21 @@ def scrape_with_selenium(url):
 
         return extract_text(soup)
     except Exception as e:
-        return f"
+        return f" Selenium Scraping Error: {e}"
 
 def extract_text(soup):
     """Extracts **all** meaningful text from HTML content, including dynamic elements."""
 
-    #
+    # Extracts all text from the HTML, not just specific tags
     all_text = soup.get_text(separator="\n", strip=True)
 
-    #
+    # Removes duplicate lines & unwanted spaces
     unique_lines = set(all_text.split("\n"))
     cleaned_text = "\n".join(line for line in unique_lines if len(line) > 3)  # Exclude tiny fragments
 
     return cleaned_text
 
-#
+# **Chunking for Large AI Requests**
 def split_into_chunks(text, chunk_size):
     """Splits long content into manageable chunks for AI processing."""
     words = text.split()
@@ -160,7 +160,7 @@ def split_into_chunks(text, chunk_size):
 
     return chunks
 
-#
+# **AI-Powered Company Breakdown**
 def generate_detailed_company_info(company_data):
     """Generates an in-depth company breakdown with AI."""
 
@@ -220,7 +220,7 @@ def generate_detailed_company_info(company_data):
     response = chain.invoke({"text": user_prompt_template})
     return response.content
 
-#
+# **Streamlit UI**
 def main():
     st.title("π AI-Powered Company Website Scraper")
     base_url = st.text_input("π Enter Website URL", "")
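
Only the first three lines of get_valid_links appear in the hunks above; the body that actually collects the anchors is elided by the diff context. A minimal sketch of how such an internal-link extractor is commonly written with requests and BeautifulSoup, assuming the app's HEADERS dict is reused; the filtering heuristics below are illustrative, not the commit's actual code:

# Hypothetical sketch; the real body of get_valid_links is not shown in this diff.
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup

HEADERS = {"Referer": "https://www.google.com/"}  # the app defines more headers than shown here

def get_valid_links(base_url):
    """Extracts internal links from a page (illustrative sketch)."""
    try:
        response = requests.get(base_url, headers=HEADERS, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        domain = urlparse(base_url).netloc
        links = set()
        for anchor in soup.find_all("a", href=True):
            absolute = urljoin(base_url, anchor["href"])
            if urlparse(absolute).netloc == domain:  # keep internal links only
                links.add(absolute.split("#")[0])    # drop in-page fragments
        return links
    except requests.exceptions.RequestException as e:
        print(f" Error fetching links: {e}")
        return set()

Keeping only URLs whose netloc matches the base domain is what restricts the crawl to internal pages, and stripping #fragments avoids visiting the same page twice.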
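The docstrings describe a requests-first scrape with a Selenium fallback for JavaScript-rendered pages, but most of scrape_page and scrape_with_selenium lies outside the hunks. A sketch of that pattern, assuming headless Chrome and the same de-duplicating extract_text shown in the diff; the length-based fallback heuristic is an assumption, not the commit's logic:

# Hypothetical sketch of the requests-first / Selenium-fallback flow the docstrings describe.
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

def extract_text(soup):
    """Same de-duplicating text extraction shown in the diff above."""
    lines = set(soup.get_text(separator="\n", strip=True).split("\n"))
    return "\n".join(line for line in lines if len(line) > 3)

def scrape_page(url):
    """Scrapes a webpage, using Selenium if necessary (illustrative sketch)."""
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        text = extract_text(soup)
        # Rough heuristic: very little static text suggests a JavaScript-rendered page.
        if len(text) < 200:
            return scrape_with_selenium(url)
        return text
    except requests.exceptions.RequestException:
        return scrape_with_selenium(url)

def scrape_with_selenium(url):
    """Renders the page in headless Chrome and extracts its text (illustrative sketch)."""
    try:
        options = Options()
        options.add_argument("--headless=new")
        driver = webdriver.Chrome(options=options)
        driver.get(url)
        soup = BeautifulSoup(driver.page_source, "html.parser")
        driver.quit()
        return extract_text(soup)
    except Exception as e:
        return f" Selenium Scraping Error: {e}"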
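Of split_into_chunks, the diff confirms only the docstring, `words = text.split()`, and `return chunks`. One common way to fill in the middle is character-budgeted chunking on word boundaries; a sketch under that assumption (treating chunk_size as a character budget, which the commit does not confirm):

# Hypothetical sketch; only the first and last lines are confirmed by the diff.
def split_into_chunks(text, chunk_size):
    """Splits long content into manageable chunks for AI processing (illustrative sketch)."""
    words = text.split()
    chunks = []
    current = []
    length = 0
    for word in words:
        # +1 accounts for the joining space
        if length + len(word) + 1 > chunk_size and current:
            chunks.append(" ".join(current))
            current, length = [], 0
        current.append(word)
        length += len(word) + 1
    if current:
        chunks.append(" ".join(current))
    return chunks

# Example use: split_into_chunks(cleaned_text, 4000) yields pieces of roughly
# 4,000 characters, small enough for a typical prompt budget.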
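The hunk at lines 220-221 shows a LangChain-style `chain.invoke({"text": user_prompt_template})` whose result exposes `.content`, but the chain construction itself is not part of this commit. A hedged sketch of one way such a chain is typically assembled with langchain-openai; the model name, temperature, and prompt wording are placeholders, not the app's real values:

# Hypothetical sketch of how the chain used at lines 220-221 might be assembled.
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

def generate_detailed_company_info(company_data):
    """Generates an in-depth company breakdown with AI (illustrative sketch)."""
    llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.3)  # assumed settings
    prompt = ChatPromptTemplate.from_messages([
        ("system", "You are a business analyst. Write a detailed company breakdown."),
        ("human", "{text}"),
    ])
    chain = prompt | llm  # LCEL pipeline: prompt -> chat model

    user_prompt_template = f"Website content:\n{company_data}"  # assumed prompt body
    response = chain.invoke({"text": user_prompt_template})
    return response.content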