import os
import time
import requests
import cloudscraper
import streamlit as st
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from selenium import webdriver
from langchain_groq import ChatGroq
from urllib.parse import urljoin, urlparse
from langchain_core.prompts import ChatPromptTemplate
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
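# The imports above assume these packages are installed (usual PyPI names; adjust as needed):
#   pip install streamlit requests cloudscraper beautifulsoup4 python-dotenv selenium \
#       langchain-groq langchain-core webdriver-manager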

# Load API Key
load_dotenv()
GROQ_API_KEY = os.getenv("GROQ_API_KEY")

if not GROQ_API_KEY:
    st.error("Error: Groq API Key is missing. Please set 'GROQ_API_KEY' as an environment variable.")
    st.stop()  # Halt here rather than failing later when the Groq client is created

chat = ChatGroq(temperature=0, groq_api_key=GROQ_API_KEY, model_name="llama-3.3-70b-versatile")

# Model Token Limits
MODEL_TOKEN_LIMIT = 32500
CHUNK_SIZE = 15000
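# Note: MODEL_TOKEN_LIMIT above is kept only as a reference for the model's context window;
# the chunking logic below works on character counts via CHUNK_SIZE, not tokens.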

# Initialize Cloudscraper
scraper = cloudscraper.create_scraper()

# Headers to mimic real browser requests
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
}

# ✅ **Extract Links**
def get_valid_links(base_url):
    """Extracts all internal links, including footer and JavaScript-rendered links."""
    try:
        response = scraper.get(base_url, headers=HEADERS, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        domain = urlparse(base_url).netloc

        links = set()
        for link in soup.find_all('a', href=True):
            full_url = urljoin(base_url, link.get('href'))
            if domain in urlparse(full_url).netloc:
                links.add(full_url)

        # If few links are found, fallback to Selenium
        if len(links) < 5 or not check_footer_links(soup):
            selenium_links = get_links_with_selenium(base_url)
            links.update(selenium_links)

        return links
    except requests.exceptions.RequestException as e:
        print(f"❌ Error fetching links: {e}")
        return set()

def check_footer_links(soup):
    """Checks if footer links exist."""
    footer = soup.find("footer")
    return footer and footer.find_all("a", href=True)

def get_links_with_selenium(url):
    """Extracts JavaScript-rendered links using Selenium."""
    try:
        options = Options()
        options.add_argument("--headless")
        options.add_argument("--disable-gpu")
        options.add_argument("--no-sandbox")

        driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
        driver.get(url)
        time.sleep(5)  # Allow JavaScript to load

        soup = BeautifulSoup(driver.page_source, 'html.parser')
        driver.quit()

        links = set()
        domain = urlparse(url).netloc

        for link in soup.find_all('a', href=True):
            full_url = urljoin(url, link.get('href'))
            if domain in urlparse(full_url).netloc:
                links.add(full_url)

        return links
    except Exception as e:
        print(f"❌ Selenium Error: {e}")
        return set()

# ✅ **Scrape Pages**
def scrape_page(url):
    """Scrapes a webpage, using Selenium if necessary."""
    try:
        response = scraper.get(url, headers=HEADERS, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        if len(soup.get_text(strip=True)) < 500:
            return scrape_with_selenium(url)

        return extract_text(soup)
    except requests.exceptions.RequestException:
        return scrape_with_selenium(url)

def scrape_with_selenium(url):
    """Scrapes JavaScript-heavy pages using Selenium."""
    try:
        options = Options()
        options.add_argument("--headless")
        options.add_argument("--disable-gpu")
        options.add_argument("--no-sandbox")

        driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
        driver.get(url)
        time.sleep(5)  # Allow JavaScript to load
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        driver.quit()

        return extract_text(soup)
    except Exception as e:
        return f"❌ Selenium Scraping Error: {e}"
    
def extract_text(soup):
    """Extracts **all** meaningful text from HTML content, including dynamic elements."""

    # ✅ Extracts all text from the HTML, not just specific tags
    all_text = soup.get_text(separator="\n", strip=True)

    # ✅ Removes duplicate lines while preserving their original order
    unique_lines = dict.fromkeys(all_text.split("\n"))
    cleaned_text = "\n".join(line for line in unique_lines if len(line) > 3)  # Exclude tiny fragments

    return cleaned_text

# ✅ **Chunking for Large AI Requests**
def split_into_chunks(text, chunk_size):
    """Splits long content into manageable chunks for AI processing."""
    words = text.split()
    chunks = []
    current_chunk = []
    current_length = 0

    for word in words:
        if current_length + len(word) + 1 > chunk_size:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            current_length = 0
        current_chunk.append(word)
        current_length += len(word) + 1

    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks

# ✅ **AI-Powered Company Breakdown**
def generate_detailed_company_info(company_data):
    """Generates an in-depth company breakdown with AI."""
    
    system_prompt = """
    You are a business research AI. Provide **detailed** insights strictly from the extracted company data.
    - Do **not** infer missing details.
    - If data is missing, label it as **"Data Not Available"**.
    """

    # Plain template string: {company_data} is filled in when the chain is invoked below.
    user_prompt_template = """
    Based on the extracted content, **generate a structured company analysis**:

    ## **Company Overview**
    - Full company name, industry, and key differentiators.
    - Headquarters location & founding year (if available).

    ## **Mission & Vision**
    - Clearly state the company's mission and vision.
    - If missing, state **"Data Not Available"**.

    ## **Products & Services**
    - List major products/services and their benefits.
    
    ## **Target Audience**
    - Define customer demographics or industries served.

    ## **Business Model & Revenue Streams**
    - Describe revenue model (e.g., SaaS, B2B, freemium).

    ## **Competitive Edge & Market Position**
    - Highlight unique features, patents, and innovations.

    ## **Clients & Industry Impact**
    - Notable clients, case studies, or market influence.

    **Extracted Data:**
    {company_data}
    """

    responses = []
    prompt = ChatPromptTemplate.from_messages([("system", system_prompt), ("human", user_prompt_template)])
    chain = prompt | chat

    if len(company_data) > CHUNK_SIZE:
        st.warning("🔄 Large content detected! Splitting into multiple AI requests.")
        chunks = split_into_chunks(company_data, CHUNK_SIZE)

        for i, chunk in enumerate(chunks):
            st.write(f"Processing AI Response {i+1}/{len(chunks)}...")
            response = chain.invoke({"company_data": chunk})
            responses.append(response.content)

        return "\n\n".join(responses)

    else:
        response = chain.invoke({"company_data": company_data})
        return response.content

# ✅ **Streamlit UI**
def main():
    st.title("🚀 AI-Powered Company Website Scraper")
    base_url = st.text_input("🔗 Enter Website URL", "")

    if st.button("Scrape"):
        if base_url:
            st.write(f"🔍 Scraping: {base_url}... Please wait.")
            valid_links = get_valid_links(base_url)

            if valid_links:
                scraped_content = {link: scrape_page(link) for link in valid_links}
                full_content = "\n".join(scraped_content.values())
                detailed_info = generate_detailed_company_info(full_content)
                st.write(detailed_info)
                
                # Make the Scraped Content collapsible
                with st.expander("📜 **View Scraped Content** (Click to Expand)", expanded=False):
                    for url, content in scraped_content.items():
                        st.write(f"### {url}")
                        st.write(content)

if __name__ == "__main__":
    main()
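
# To launch the UI (assuming this script is saved as app.py; use your actual filename):
#   streamlit run app.py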