Commit 5f2db56
1 Parent(s): 9fa8d4f
Better Scraping
app.py CHANGED
@@ -36,7 +36,7 @@ HEADERS = {
     "Referer": "https://www.google.com/"
 }
 
-#
+# **Extract Links**
 def get_valid_links(base_url):
     """Extracts all internal links, including footer and JavaScript-rendered links."""
     try:
@@ -58,7 +58,7 @@ def get_valid_links(base_url):
 
         return links
     except requests.exceptions.RequestException as e:
-        print(f"
+        print(f" Error fetching links: {e}")
         return set()
 
 def check_footer_links(soup):
@@ -91,10 +91,10 @@ def get_links_with_selenium(url):
 
         return links
     except Exception as e:
-        print(f"
+        print(f" Selenium Error: {e}")
         return set()
 
-#
+# **Scrape Pages**
 def scrape_page(url):
     """Scrapes a webpage, using Selenium if necessary."""
     try:
@@ -125,21 +125,21 @@ def scrape_with_selenium(url):
 
         return extract_text(soup)
     except Exception as e:
-        return f"
+        return f" Selenium Scraping Error: {e}"
 
 def extract_text(soup):
     """Extracts **all** meaningful text from HTML content, including dynamic elements."""
 
-    #
+    # Extracts all text from the HTML, not just specific tags
     all_text = soup.get_text(separator="\n", strip=True)
 
-    #
+    # Removes duplicate lines & unwanted spaces
     unique_lines = set(all_text.split("\n"))
     cleaned_text = "\n".join(line for line in unique_lines if len(line) > 3)  # Exclude tiny fragments
 
     return cleaned_text
 
-#
+# **Chunking for Large AI Requests**
 def split_into_chunks(text, chunk_size):
     """Splits long content into manageable chunks for AI processing."""
     words = text.split()
@@ -160,7 +160,7 @@ def split_into_chunks(text, chunk_size):
 
     return chunks
 
-#
+# **AI-Powered Company Breakdown**
 def generate_detailed_company_info(company_data):
     """Generates an in-depth company breakdown with AI."""
 
@@ -220,7 +220,7 @@ def generate_detailed_company_info(company_data):
     response = chain.invoke({"text": user_prompt_template})
     return response.content
 
-#
+# **Streamlit UI**
 def main():
     st.title("π AI-Powered Company Website Scraper")
     base_url = st.text_input("π Enter Website URL", "")
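
Only the first three lines of get_valid_links appear in the hunks above; the body that actually collects the anchors is elided by the diff context. A minimal sketch of how such an internal-link extractor is commonly written with requests and BeautifulSoup, assuming the app's HEADERS dict is reused; the filtering heuristics below are illustrative, not the commit's actual code:

# Hypothetical sketch; the real body of get_valid_links is not shown in this diff.
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup

HEADERS = {"Referer": "https://www.google.com/"}  # the app defines more headers than shown here

def get_valid_links(base_url):
    """Extracts internal links from a page (illustrative sketch)."""
    try:
        response = requests.get(base_url, headers=HEADERS, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        domain = urlparse(base_url).netloc
        links = set()
        for anchor in soup.find_all("a", href=True):
            absolute = urljoin(base_url, anchor["href"])
            if urlparse(absolute).netloc == domain:  # keep internal links only
                links.add(absolute.split("#")[0])    # drop in-page fragments
        return links
    except requests.exceptions.RequestException as e:
        print(f" Error fetching links: {e}")
        return set()

Keeping only URLs whose netloc matches the base domain is what restricts the crawl to internal pages, and stripping #fragments avoids visiting the same page twice.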
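The docstrings describe a requests-first scrape with a Selenium fallback for JavaScript-rendered pages, but most of scrape_page and scrape_with_selenium lies outside the hunks. A sketch of that pattern, assuming headless Chrome and the same de-duplicating extract_text shown in the diff; the length-based fallback heuristic is an assumption, not the commit's logic:

# Hypothetical sketch of the requests-first / Selenium-fallback flow the docstrings describe.
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

def extract_text(soup):
    """Same de-duplicating text extraction shown in the diff above."""
    lines = set(soup.get_text(separator="\n", strip=True).split("\n"))
    return "\n".join(line for line in lines if len(line) > 3)

def scrape_page(url):
    """Scrapes a webpage, using Selenium if necessary (illustrative sketch)."""
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        text = extract_text(soup)
        # Rough heuristic: very little static text suggests a JavaScript-rendered page.
        if len(text) < 200:
            return scrape_with_selenium(url)
        return text
    except requests.exceptions.RequestException:
        return scrape_with_selenium(url)

def scrape_with_selenium(url):
    """Renders the page in headless Chrome and extracts its text (illustrative sketch)."""
    try:
        options = Options()
        options.add_argument("--headless=new")
        driver = webdriver.Chrome(options=options)
        driver.get(url)
        soup = BeautifulSoup(driver.page_source, "html.parser")
        driver.quit()
        return extract_text(soup)
    except Exception as e:
        return f" Selenium Scraping Error: {e}"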
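Of split_into_chunks, the diff confirms only the docstring, `words = text.split()`, and `return chunks`. One common way to fill in the middle is character-budgeted chunking on word boundaries; a sketch under that assumption (treating chunk_size as a character budget, which the commit does not confirm):

# Hypothetical sketch; only the first and last lines are confirmed by the diff.
def split_into_chunks(text, chunk_size):
    """Splits long content into manageable chunks for AI processing (illustrative sketch)."""
    words = text.split()
    chunks = []
    current = []
    length = 0
    for word in words:
        # +1 accounts for the joining space
        if length + len(word) + 1 > chunk_size and current:
            chunks.append(" ".join(current))
            current, length = [], 0
        current.append(word)
        length += len(word) + 1
    if current:
        chunks.append(" ".join(current))
    return chunks

# Example use: split_into_chunks(cleaned_text, 4000) yields pieces of roughly
# 4,000 characters, small enough for a typical prompt budget.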
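The hunk at lines 220-221 shows a LangChain-style `chain.invoke({"text": user_prompt_template})` whose result exposes `.content`, but the chain construction itself is not part of this commit. A hedged sketch of one way such a chain is typically assembled with langchain-openai; the model name, temperature, and prompt wording are placeholders, not the app's real values:

# Hypothetical sketch of how the chain used at lines 220-221 might be assembled.
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

def generate_detailed_company_info(company_data):
    """Generates an in-depth company breakdown with AI (illustrative sketch)."""
    llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.3)  # assumed settings
    prompt = ChatPromptTemplate.from_messages([
        ("system", "You are a business analyst. Write a detailed company breakdown."),
        ("human", "{text}"),
    ])
    chain = prompt | llm  # LCEL pipeline: prompt -> chat model

    user_prompt_template = f"Website content:\n{company_data}"  # assumed prompt body
    response = chain.invoke({"text": user_prompt_template})
    return response.content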