Spaces:

kushagrasharma-13
/

company-details-scraper

Running

App Files Files Community

kushagrasharma-13 committed on Jan 10

Commit

9fa8d4f

1 Parent(s): 472ecf2

Better Scraping

Browse files

Files changed (1) hide show

app.py +25 -50

app.py CHANGED Viewed

@@ -13,9 +13,6 @@ from selenium.webdriver.chrome.options import Options
 from selenium.webdriver.chrome.service import Service
 from webdriver_manager.chrome import ChromeDriverManager
-os.environ["STREAMLIT_SERVER_HEADLESS"] = "1"
-os.environ["PATH"] += os.pathsep + "/usr/bin"
 # Load API Key
 load_dotenv()
 GROQ_API_KEY = os.getenv("GROQ_API_KEY")
@@ -119,8 +116,6 @@ def scrape_with_selenium(url):
         options.add_argument("--headless")
         options.add_argument("--disable-gpu")
         options.add_argument("--no-sandbox")
-        options.add_argument("--disable-dev-shm-usage")  # Prevents crashes in Docker environments
-        options.binary_location = "/usr/bin/google-chrome"  # Ensure correct path on Hugging Face
         driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
         driver.get(url)
@@ -170,61 +165,41 @@ def generate_detailed_company_info(company_data):
     """Generates an in-depth company breakdown with AI."""
     system_prompt = """
-    You are a business research AI that provides **detailed** information strictly based on the extracted company data.
-    - Do not infer or create information that is not explicitly mentioned in the content.
-    - If data is missing, clearly label it as **"Data Not Available"**.
-    - Structure the response in an **in-depth** manner with examples, explanations, and clear formatting.
     """
-    user_prompt = f"""
-    Based on the extracted company content provided below, **generate an in-depth company breakdown** following this detailed format:
-    ## **Company Name & Overview**
-    - Provide the full company name.
-    - Describe what the company does, including its industry, market position, and any key differentiators.
-    - Include when the company was founded and its headquarters location if available.
-    - If historical background is mentioned, provide details.
     ## **Mission & Vision**
-    - Clearly state the company's mission and vision as mentioned in the provided content.
     - If missing, state **"Data Not Available"**.
-    ## **Detailed Product & Service Overview**
-    - List all major products or services the company offers.
-    - Provide descriptions of each product/service, highlighting unique features.
-    - Explain how each product/service benefits customers and any use cases mentioned.
-    - Include specific names of services or product categories if mentioned.
     ## **Target Audience**
-    - Define the primary users of the company's offerings.
-    - Include customer demographics, industries served, or types of businesses targeted.
-    - If missing, state **"Data Not Available"**.
     ## **Business Model & Revenue Streams**
-    - Describe how the company makes money (e.g., subscriptions, one-time sales, freemium models, SaaS, B2B, etc.).
-    - Provide pricing models if available (monthly subscriptions, enterprise pricing, etc.).
-    - If missing, state **"Data Not Available"**.
-    ## **Competitive Edge & Market Differentiation**
-    - Explain what sets the company apart from competitors.
-    - Highlight any unique features, patents, technology, or innovations.
-    - Provide market position insights if mentioned.
-    ## **Notable Clients & Case Studies**
-    - List any key clients or industries that the company serves.
-    - If available, summarize success stories or case studies provided in the content.
-    - If missing, state **"Data Not Available"**.
-    ## **Industry Impact & Thought Leadership**
-    - Summarize the company's contributions to its industry, including research, blog posts, or innovations.
-    - Mention any industry insights, events, or speaking engagements.
-    - If missing, state **"Data Not Available"**.
-    **Extracted Company Content:**
     {company_data}
     """
     responses = []
     if len(company_data) > CHUNK_SIZE:
         st.warning("🔄 Large content detected! Splitting into multiple AI requests.")
@@ -232,23 +207,23 @@ def generate_detailed_company_info(company_data):
         for i, chunk in enumerate(chunks):
             st.write(f"Processing AI Response {i+1}/{len(chunks)}...")
-            prompt = ChatPromptTemplate.from_messages([("system", system_prompt), ("human", user_prompt)])
             chain = prompt | chat
-            response = chain.invoke({"text": user_prompt.format(company_data=chunk)})
             responses.append(response.content)
         return "\n\n".join(responses)
     else:
-        prompt = ChatPromptTemplate.from_messages([("system", system_prompt), ("human", user_prompt)])
         chain = prompt | chat
-        response = chain.invoke({"text": user_prompt})
         return response.content
 # ✅ **Streamlit UI**
 def main():
-    st.title("Company Website Scraper with AI Research")
-    base_url = st.text_input("Enter a company's website URL, and this tool will extract all relevant information and generate a detailed business breakdown using Groq AI.", "")
     if st.button("Scrape"):
         if base_url:

 from selenium.webdriver.chrome.service import Service
 from webdriver_manager.chrome import ChromeDriverManager
 # Load API Key
 load_dotenv()
 GROQ_API_KEY = os.getenv("GROQ_API_KEY")
         options.add_argument("--headless")
         options.add_argument("--disable-gpu")
         options.add_argument("--no-sandbox")
         driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
         driver.get(url)
     """Generates an in-depth company breakdown with AI."""
     system_prompt = """
+    You are a business research AI. Provide **detailed** insights strictly from the extracted company data.
+    - Do **not** infer missing details.
+    - If data is missing, label it as **"Data Not Available"**.
     """
+    user_prompt_template = f"""
+    Based on the extracted content, **generate a structured company analysis**:
+    ## **Company Overview**
+    - Full company name, industry, and key differentiators.
+    - Headquarters location & founding year (if available).
     ## **Mission & Vision**
+    - Clearly state the company's mission and vision.
     - If missing, state **"Data Not Available"**.
+    ## **Products & Services**
+    - List major products/services and their benefits.
     ## **Target Audience**
+    - Define customer demographics or industries served.
     ## **Business Model & Revenue Streams**
+    - Describe revenue model (e.g., SaaS, B2B, freemium).
+    ## **Competitive Edge & Market Position**
+    - Highlight unique features, patents, and innovations.
+    ## **Clients & Industry Impact**
+    - Notable clients, case studies, or market influence.
+    **Extracted Data:**
     {company_data}
     """
     responses = []
     if len(company_data) > CHUNK_SIZE:
         st.warning("🔄 Large content detected! Splitting into multiple AI requests.")
         for i, chunk in enumerate(chunks):
             st.write(f"Processing AI Response {i+1}/{len(chunks)}...")
+            prompt = ChatPromptTemplate.from_messages([("system", system_prompt), ("human", user_prompt_template)])
             chain = prompt | chat
+            response = chain.invoke({"text": user_prompt_template.format(company_data=chunk)})
             responses.append(response.content)
         return "\n\n".join(responses)
     else:
+        prompt = ChatPromptTemplate.from_messages([("system", system_prompt), ("human", user_prompt_template)])
         chain = prompt | chat
+        response = chain.invoke({"text": user_prompt_template})
         return response.content
 # ✅ **Streamlit UI**
 def main():
+    st.title("🚀 AI-Powered Company Website Scraper")
+    base_url = st.text_input("🔗 Enter Website URL", "")
     if st.button("Scrape"):
         if base_url: