import os
import time
import requests
import cloudscraper
import streamlit as st
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from selenium import webdriver
from langchain_groq import ChatGroq
from urllib.parse import urljoin, urlparse
from langchain_core.prompts import ChatPromptTemplate
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

load_dotenv()
GROQ_API_KEY = os.getenv("GROQ_API_KEY")

if not GROQ_API_KEY:
    st.error("Error: Groq API Key is missing. Please set 'GROQ_API_KEY' as an environment variable.")
    st.stop()  # halt the app instead of initializing the model with a missing key

chat = ChatGroq(temperature=0, groq_api_key=GROQ_API_KEY, model_name="llama-3.3-70b-versatile")
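
# The key is read from the environment; with python-dotenv it can also live in a local
# .env file next to this script (filename and value below are assumptions, not part of the original):
#   GROQ_API_KEY=your-groq-api-key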

MODEL_TOKEN_LIMIT = 32500
CHUNK_SIZE = 15000  # approximate character budget per chunk sent to the model

# cloudscraper handles common anti-bot challenges that a plain requests session cannot.
scraper = cloudscraper.create_scraper()

# Browser-like headers reduce the chance of being served a blocked or stripped-down page.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
}


def get_valid_links(base_url):
    """Extracts all internal links, including footer and JavaScript-rendered links."""
    try:
        response = scraper.get(base_url, headers=HEADERS, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        domain = urlparse(base_url).netloc

        links = set()
        for link in soup.find_all('a', href=True):
            full_url = urljoin(base_url, link.get('href'))
            if domain in urlparse(full_url).netloc:
                links.add(full_url)

        # Fall back to Selenium when the static page yields few links or no footer links,
        # which usually means the navigation is rendered by JavaScript.
        if len(links) < 5 or not check_footer_links(soup):
            selenium_links = get_links_with_selenium(base_url)
            links.update(selenium_links)

        return links
    except requests.exceptions.RequestException as e:
        print(f"Error fetching links: {e}")
        return set()


def check_footer_links(soup):
    """Checks if footer links exist."""
    footer = soup.find("footer")
    return bool(footer and footer.find_all("a", href=True))


def get_links_with_selenium(url):
    """Extracts JavaScript-rendered links using Selenium."""
    try:
        options = Options()
        options.add_argument("--headless")
        options.add_argument("--disable-gpu")
        options.add_argument("--no-sandbox")

        # webdriver_manager downloads a matching ChromeDriver; a local Chrome install is still required.
        driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
        driver.get(url)
        time.sleep(5)  # crude wait for JavaScript-rendered content to finish loading

        soup = BeautifulSoup(driver.page_source, 'html.parser')
        driver.quit()

        links = set()
        domain = urlparse(url).netloc
        for link in soup.find_all('a', href=True):
            full_url = urljoin(url, link.get('href'))
            if domain in urlparse(full_url).netloc:
                links.add(full_url)

        return links
    except Exception as e:
        print(f"Selenium Error: {e}")
        return set()


def scrape_page(url):
    """Scrapes a webpage, using Selenium if necessary."""
    try:
        response = scraper.get(url, headers=HEADERS, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Very little visible text usually means the page is rendered client-side.
        if len(soup.get_text(strip=True)) < 500:
            return scrape_with_selenium(url)

        return extract_text(soup)
    except requests.exceptions.RequestException:
        return scrape_with_selenium(url)


def scrape_with_selenium(url):
    """Scrapes JavaScript-heavy pages using Selenium."""
    try:
        options = Options()
        options.add_argument("--headless")
        options.add_argument("--disable-gpu")
        options.add_argument("--no-sandbox")

        driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
        driver.get(url)
        time.sleep(5)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        driver.quit()

        return extract_text(soup)
    except Exception as e:
        return f"Selenium Scraping Error: {e}"


def extract_text(soup):
    """Extracts all meaningful text from HTML content, including dynamic elements."""
    all_text = soup.get_text(separator="\n", strip=True)

    # De-duplicate lines while preserving their original order (a plain set would scramble it),
    # and drop very short fragments such as stray punctuation or single-character labels.
    unique_lines = dict.fromkeys(all_text.split("\n"))
    cleaned_text = "\n".join(line for line in unique_lines if len(line) > 3)

    return cleaned_text


def split_into_chunks(text, chunk_size):
    """Splits long content into manageable chunks for AI processing."""
    words = text.split()
    chunks = []
    current_chunk = []
    current_length = 0

    for word in words:
        if current_length + len(word) + 1 > chunk_size:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            current_length = 0
        current_chunk.append(word)
        current_length += len(word) + 1

    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks


def generate_detailed_company_info(company_data):
    """Generates an in-depth company breakdown with AI."""

    system_prompt = """
    You are a business research AI. Provide **detailed** insights strictly from the extracted company data.
    - Do **not** infer missing details.
    - If data is missing, label it as **"Data Not Available"**.
    """

    # Plain template string (not an f-string): {company_data} is filled in per request,
    # so each chunk is formatted through the prompt rather than baked into the template.
    user_prompt_template = """
    Based on the extracted content, **generate a structured company analysis**:

    ## **Company Overview**
    - Full company name, industry, and key differentiators.
    - Headquarters location & founding year (if available).

    ## **Mission & Vision**
    - Clearly state the company's mission and vision.
    - If missing, state **"Data Not Available"**.

    ## **Products & Services**
    - List major products/services and their benefits.

    ## **Target Audience**
    - Define customer demographics or industries served.

    ## **Business Model & Revenue Streams**
    - Describe revenue model (e.g., SaaS, B2B, freemium).

    ## **Competitive Edge & Market Position**
    - Highlight unique features, patents, and innovations.

    ## **Clients & Industry Impact**
    - Notable clients, case studies, or market influence.

    **Extracted Data:**
    {company_data}
    """

    prompt = ChatPromptTemplate.from_messages([("system", system_prompt), ("human", user_prompt_template)])
    chain = prompt | chat

    if len(company_data) > CHUNK_SIZE:
        st.warning("Large content detected! Splitting into multiple AI requests.")
        chunks = split_into_chunks(company_data, CHUNK_SIZE)

        responses = []
        for i, chunk in enumerate(chunks):
            st.write(f"Processing AI Response {i+1}/{len(chunks)}...")
            response = chain.invoke({"company_data": chunk})
            responses.append(response.content)

        return "\n\n".join(responses)

    response = chain.invoke({"company_data": company_data})
    return response.content


def main():
    st.title("AI-Powered Company Website Scraper")
    base_url = st.text_input("Enter Website URL", "")

    if st.button("Scrape"):
        if base_url:
            st.write(f"Scraping: {base_url}... Please wait.")
            valid_links = get_valid_links(base_url)

            if valid_links:
                scraped_content = {link: scrape_page(link) for link in valid_links}
                full_content = "\n".join(scraped_content.values())
                detailed_info = generate_detailed_company_info(full_content)
                st.write(detailed_info)

                with st.expander("**View Scraped Content** (Click to Expand)", expanded=False):
                    for url, content in scraped_content.items():
                        st.write(f"### {url}")
                        st.write(content)
            else:
                st.error("No internal links could be extracted from that URL.")
        else:
            st.warning("Please enter a website URL first.")


if __name__ == "__main__":
    main()
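
# Minimal local run sketch (the filename below is an assumption, not part of the original script):
#   pip install streamlit cloudscraper beautifulsoup4 python-dotenv selenium webdriver-manager langchain-groq requests
#   streamlit run company_scraper.py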