Commit
·
9fa8d4f
1
Parent(s):
472ecf2
Better Scraping
Browse files
app.py
CHANGED
@@ -13,9 +13,6 @@ from selenium.webdriver.chrome.options import Options
|
|
13 |
from selenium.webdriver.chrome.service import Service
|
14 |
from webdriver_manager.chrome import ChromeDriverManager
|
15 |
|
16 |
-
os.environ["STREAMLIT_SERVER_HEADLESS"] = "1"
|
17 |
-
os.environ["PATH"] += os.pathsep + "/usr/bin"
|
18 |
-
|
19 |
# Load API Key
|
20 |
load_dotenv()
|
21 |
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
|
@@ -119,8 +116,6 @@ def scrape_with_selenium(url):
|
|
119 |
options.add_argument("--headless")
|
120 |
options.add_argument("--disable-gpu")
|
121 |
options.add_argument("--no-sandbox")
|
122 |
-
options.add_argument("--disable-dev-shm-usage") # Prevents crashes in Docker environments
|
123 |
-
options.binary_location = "/usr/bin/google-chrome" # Ensure correct path on Hugging Face
|
124 |
|
125 |
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
|
126 |
driver.get(url)
|
@@ -170,61 +165,41 @@ def generate_detailed_company_info(company_data):
|
|
170 |
"""Generates an in-depth company breakdown with AI."""
|
171 |
|
172 |
system_prompt = """
|
173 |
-
You are a business research AI
|
174 |
-
- Do not infer
|
175 |
-
- If data is missing,
|
176 |
-
- Structure the response in an **in-depth** manner with examples, explanations, and clear formatting.
|
177 |
"""
|
178 |
|
179 |
-
|
180 |
-
Based on the extracted
|
181 |
|
182 |
-
## **Company
|
183 |
-
-
|
184 |
-
-
|
185 |
-
- Include when the company was founded and its headquarters location if available.
|
186 |
-
- If historical background is mentioned, provide details.
|
187 |
|
188 |
## **Mission & Vision**
|
189 |
-
- Clearly state the company's mission and vision
|
190 |
- If missing, state **"Data Not Available"**.
|
191 |
|
192 |
-
## **
|
193 |
-
- List
|
194 |
-
|
195 |
-
- Explain how each product/service benefits customers and any use cases mentioned.
|
196 |
-
- Include specific names of services or product categories if mentioned.
|
197 |
-
|
198 |
## **Target Audience**
|
199 |
-
- Define
|
200 |
-
- Include customer demographics, industries served, or types of businesses targeted.
|
201 |
-
- If missing, state **"Data Not Available"**.
|
202 |
|
203 |
## **Business Model & Revenue Streams**
|
204 |
-
- Describe
|
205 |
-
- Provide pricing models if available (monthly subscriptions, enterprise pricing, etc.).
|
206 |
-
- If missing, state **"Data Not Available"**.
|
207 |
|
208 |
-
## **Competitive Edge & Market
|
209 |
-
-
|
210 |
-
- Highlight any unique features, patents, technology, or innovations.
|
211 |
-
- Provide market position insights if mentioned.
|
212 |
|
213 |
-
## **
|
214 |
-
-
|
215 |
-
- If available, summarize success stories or case studies provided in the content.
|
216 |
-
- If missing, state **"Data Not Available"**.
|
217 |
-
|
218 |
-
## **Industry Impact & Thought Leadership**
|
219 |
-
- Summarize the company's contributions to its industry, including research, blog posts, or innovations.
|
220 |
-
- Mention any industry insights, events, or speaking engagements.
|
221 |
-
- If missing, state **"Data Not Available"**.
|
222 |
|
223 |
-
**Extracted
|
224 |
{company_data}
|
225 |
"""
|
226 |
|
227 |
-
|
228 |
responses = []
|
229 |
if len(company_data) > CHUNK_SIZE:
|
230 |
st.warning("π Large content detected! Splitting into multiple AI requests.")
|
@@ -232,23 +207,23 @@ def generate_detailed_company_info(company_data):
|
|
232 |
|
233 |
for i, chunk in enumerate(chunks):
|
234 |
st.write(f"Processing AI Response {i+1}/{len(chunks)}...")
|
235 |
-
prompt = ChatPromptTemplate.from_messages([("system", system_prompt), ("human",
|
236 |
chain = prompt | chat
|
237 |
-
response = chain.invoke({"text":
|
238 |
responses.append(response.content)
|
239 |
|
240 |
return "\n\n".join(responses)
|
241 |
|
242 |
else:
|
243 |
-
prompt = ChatPromptTemplate.from_messages([("system", system_prompt), ("human",
|
244 |
chain = prompt | chat
|
245 |
-
response = chain.invoke({"text":
|
246 |
return response.content
|
247 |
|
248 |
# ✅ **Streamlit UI**
|
249 |
def main():
|
250 |
-
st.title("Company Website Scraper
|
251 |
-
base_url = st.text_input("Enter
|
252 |
|
253 |
if st.button("Scrape"):
|
254 |
if base_url:
|
|
|
13 |
from selenium.webdriver.chrome.service import Service
|
14 |
from webdriver_manager.chrome import ChromeDriverManager
|
15 |
|
|
|
|
|
|
|
16 |
# Load API Key
|
17 |
load_dotenv()
|
18 |
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
|
|
|
116 |
options.add_argument("--headless")
|
117 |
options.add_argument("--disable-gpu")
|
118 |
options.add_argument("--no-sandbox")
|
|
|
|
|
119 |
|
120 |
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
|
121 |
driver.get(url)
|
|
|
165 |
"""Generates an in-depth company breakdown with AI."""
|
166 |
|
167 |
system_prompt = """
|
168 |
+
You are a business research AI. Provide **detailed** insights strictly from the extracted company data.
|
169 |
+
- Do **not** infer missing details.
|
170 |
+
- If data is missing, label it as **"Data Not Available"**.
|
|
|
171 |
"""
|
172 |
|
173 |
+
user_prompt_template = f"""
|
174 |
+
Based on the extracted content, **generate a structured company analysis**:
|
175 |
|
176 |
+
## **Company Overview**
|
177 |
+
- Full company name, industry, and key differentiators.
|
178 |
+
- Headquarters location & founding year (if available).
|
|
|
|
|
179 |
|
180 |
## **Mission & Vision**
|
181 |
+
- Clearly state the company's mission and vision.
|
182 |
- If missing, state **"Data Not Available"**.
|
183 |
|
184 |
+
## **Products & Services**
|
185 |
+
- List major products/services and their benefits.
|
186 |
+
|
|
|
|
|
|
|
187 |
## **Target Audience**
|
188 |
+
- Define customer demographics or industries served.
|
|
|
|
|
189 |
|
190 |
## **Business Model & Revenue Streams**
|
191 |
+
- Describe revenue model (e.g., SaaS, B2B, freemium).
|
|
|
|
|
192 |
|
193 |
+
## **Competitive Edge & Market Position**
|
194 |
+
- Highlight unique features, patents, and innovations.
|
|
|
|
|
195 |
|
196 |
+
## **Clients & Industry Impact**
|
197 |
+
- Notable clients, case studies, or market influence.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
198 |
|
199 |
+
**Extracted Data:**
|
200 |
{company_data}
|
201 |
"""
|
202 |
|
|
|
203 |
responses = []
|
204 |
if len(company_data) > CHUNK_SIZE:
|
205 |
st.warning("π Large content detected! Splitting into multiple AI requests.")
|
|
|
207 |
|
208 |
for i, chunk in enumerate(chunks):
|
209 |
st.write(f"Processing AI Response {i+1}/{len(chunks)}...")
|
210 |
+
prompt = ChatPromptTemplate.from_messages([("system", system_prompt), ("human", user_prompt_template)])
|
211 |
chain = prompt | chat
|
212 |
+
response = chain.invoke({"text": user_prompt_template.format(company_data=chunk)})
|
213 |
responses.append(response.content)
|
214 |
|
215 |
return "\n\n".join(responses)
|
216 |
|
217 |
else:
|
218 |
+
prompt = ChatPromptTemplate.from_messages([("system", system_prompt), ("human", user_prompt_template)])
|
219 |
chain = prompt | chat
|
220 |
+
response = chain.invoke({"text": user_prompt_template})
|
221 |
return response.content
|
222 |
|
223 |
# ✅ **Streamlit UI**
|
224 |
def main():
|
225 |
+
st.title("π AI-Powered Company Website Scraper")
|
226 |
+
base_url = st.text_input("π Enter Website URL", "")
|
227 |
|
228 |
if st.button("Scrape"):
|
229 |
if base_url:
|