kushagrasharma-13 committed on
Commit
9fa8d4f
·
1 Parent(s): 472ecf2

Better Scraping

Browse files
Files changed (1) hide show
  1. app.py +25 -50
app.py CHANGED
@@ -13,9 +13,6 @@ from selenium.webdriver.chrome.options import Options
13
  from selenium.webdriver.chrome.service import Service
14
  from webdriver_manager.chrome import ChromeDriverManager
15
 
16
- os.environ["STREAMLIT_SERVER_HEADLESS"] = "1"
17
- os.environ["PATH"] += os.pathsep + "/usr/bin"
18
-
19
  # Load API Key
20
  load_dotenv()
21
  GROQ_API_KEY = os.getenv("GROQ_API_KEY")
@@ -119,8 +116,6 @@ def scrape_with_selenium(url):
119
  options.add_argument("--headless")
120
  options.add_argument("--disable-gpu")
121
  options.add_argument("--no-sandbox")
122
- options.add_argument("--disable-dev-shm-usage") # Prevents crashes in Docker environments
123
- options.binary_location = "/usr/bin/google-chrome" # Ensure correct path on Hugging Face
124
 
125
  driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
126
  driver.get(url)
@@ -170,61 +165,41 @@ def generate_detailed_company_info(company_data):
170
  """Generates an in-depth company breakdown with AI."""
171
 
172
  system_prompt = """
173
- You are a business research AI that provides **detailed** information strictly based on the extracted company data.
174
- - Do not infer or create information that is not explicitly mentioned in the content.
175
- - If data is missing, clearly label it as **"Data Not Available"**.
176
- - Structure the response in an **in-depth** manner with examples, explanations, and clear formatting.
177
  """
178
 
179
- user_prompt = f"""
180
- Based on the extracted company content provided below, **generate an in-depth company breakdown** following this detailed format:
181
 
182
- ## **Company Name & Overview**
183
- - Provide the full company name.
184
- - Describe what the company does, including its industry, market position, and any key differentiators.
185
- - Include when the company was founded and its headquarters location if available.
186
- - If historical background is mentioned, provide details.
187
 
188
  ## **Mission & Vision**
189
- - Clearly state the company's mission and vision as mentioned in the provided content.
190
  - If missing, state **"Data Not Available"**.
191
 
192
- ## **Detailed Product & Service Overview**
193
- - List all major products or services the company offers.
194
- - Provide descriptions of each product/service, highlighting unique features.
195
- - Explain how each product/service benefits customers and any use cases mentioned.
196
- - Include specific names of services or product categories if mentioned.
197
-
198
  ## **Target Audience**
199
- - Define the primary users of the company's offerings.
200
- - Include customer demographics, industries served, or types of businesses targeted.
201
- - If missing, state **"Data Not Available"**.
202
 
203
  ## **Business Model & Revenue Streams**
204
- - Describe how the company makes money (e.g., subscriptions, one-time sales, freemium models, SaaS, B2B, etc.).
205
- - Provide pricing models if available (monthly subscriptions, enterprise pricing, etc.).
206
- - If missing, state **"Data Not Available"**.
207
 
208
- ## **Competitive Edge & Market Differentiation**
209
- - Explain what sets the company apart from competitors.
210
- - Highlight any unique features, patents, technology, or innovations.
211
- - Provide market position insights if mentioned.
212
 
213
- ## **Notable Clients & Case Studies**
214
- - List any key clients or industries that the company serves.
215
- - If available, summarize success stories or case studies provided in the content.
216
- - If missing, state **"Data Not Available"**.
217
-
218
- ## **Industry Impact & Thought Leadership**
219
- - Summarize the company's contributions to its industry, including research, blog posts, or innovations.
220
- - Mention any industry insights, events, or speaking engagements.
221
- - If missing, state **"Data Not Available"**.
222
 
223
- **Extracted Company Content:**
224
  {company_data}
225
  """
226
 
227
-
228
  responses = []
229
  if len(company_data) > CHUNK_SIZE:
230
  st.warning("🔄 Large content detected! Splitting into multiple AI requests.")
@@ -232,23 +207,23 @@ def generate_detailed_company_info(company_data):
232
 
233
  for i, chunk in enumerate(chunks):
234
  st.write(f"Processing AI Response {i+1}/{len(chunks)}...")
235
- prompt = ChatPromptTemplate.from_messages([("system", system_prompt), ("human", user_prompt)])
236
  chain = prompt | chat
237
- response = chain.invoke({"text": user_prompt.format(company_data=chunk)})
238
  responses.append(response.content)
239
 
240
  return "\n\n".join(responses)
241
 
242
  else:
243
- prompt = ChatPromptTemplate.from_messages([("system", system_prompt), ("human", user_prompt)])
244
  chain = prompt | chat
245
- response = chain.invoke({"text": user_prompt})
246
  return response.content
247
 
248
  # ✅ **Streamlit UI**
249
  def main():
250
- st.title("Company Website Scraper with AI Research")
251
- base_url = st.text_input("Enter a company's website URL, and this tool will extract all relevant information and generate a detailed business breakdown using Groq AI.", "")
252
 
253
  if st.button("Scrape"):
254
  if base_url:
 
13
  from selenium.webdriver.chrome.service import Service
14
  from webdriver_manager.chrome import ChromeDriverManager
15
 
 
 
 
16
  # Load API Key
17
  load_dotenv()
18
  GROQ_API_KEY = os.getenv("GROQ_API_KEY")
 
116
  options.add_argument("--headless")
117
  options.add_argument("--disable-gpu")
118
  options.add_argument("--no-sandbox")
 
 
119
 
120
  driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
121
  driver.get(url)
 
165
  """Generates an in-depth company breakdown with AI."""
166
 
167
  system_prompt = """
168
+ You are a business research AI. Provide **detailed** insights strictly from the extracted company data.
169
+ - Do **not** infer missing details.
170
+ - If data is missing, label it as **"Data Not Available"**.
 
171
  """
172
 
173
+ user_prompt_template = f"""
174
+ Based on the extracted content, **generate a structured company analysis**:
175
 
176
+ ## **Company Overview**
177
+ - Full company name, industry, and key differentiators.
178
+ - Headquarters location & founding year (if available).
 
 
179
 
180
  ## **Mission & Vision**
181
+ - Clearly state the company's mission and vision.
182
  - If missing, state **"Data Not Available"**.
183
 
184
+ ## **Products & Services**
185
+ - List major products/services and their benefits.
186
+
 
 
 
187
  ## **Target Audience**
188
+ - Define customer demographics or industries served.
 
 
189
 
190
  ## **Business Model & Revenue Streams**
191
+ - Describe revenue model (e.g., SaaS, B2B, freemium).
 
 
192
 
193
+ ## **Competitive Edge & Market Position**
194
+ - Highlight unique features, patents, and innovations.
 
 
195
 
196
+ ## **Clients & Industry Impact**
197
+ - Notable clients, case studies, or market influence.
 
 
 
 
 
 
 
198
 
199
+ **Extracted Data:**
200
  {company_data}
201
  """
202
 
 
203
  responses = []
204
  if len(company_data) > CHUNK_SIZE:
205
  st.warning("🔄 Large content detected! Splitting into multiple AI requests.")
 
207
 
208
  for i, chunk in enumerate(chunks):
209
  st.write(f"Processing AI Response {i+1}/{len(chunks)}...")
210
+ prompt = ChatPromptTemplate.from_messages([("system", system_prompt), ("human", user_prompt_template)])
211
  chain = prompt | chat
212
+ response = chain.invoke({"text": user_prompt_template.format(company_data=chunk)})
213
  responses.append(response.content)
214
 
215
  return "\n\n".join(responses)
216
 
217
  else:
218
+ prompt = ChatPromptTemplate.from_messages([("system", system_prompt), ("human", user_prompt_template)])
219
  chain = prompt | chat
220
+ response = chain.invoke({"text": user_prompt_template})
221
  return response.content
222
 
223
  # ✅ **Streamlit UI**
224
  def main():
225
+ st.title("🚀 AI-Powered Company Website Scraper")
226
+ base_url = st.text_input("🔗 Enter Website URL", "")
227
 
228
  if st.button("Scrape"):
229
  if base_url: