# SmartCaseAI / industry_research.py
# Scrapes a company's website and derives a mission summary, an industry
# label (via OpenAI), and keyword-based focus areas.
import os
import re

import openai
import requests
from bs4 import BeautifulSoup
from transformers import pipeline
# Shared NLP pipelines, constructed once at import time (each loads a large
# model, so building them per call would be prohibitively slow).
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
ner_pipeline = pipeline("ner", model="dslim/bert-base-NER")

# Read the OpenAI key from the environment instead of hard-coding a secret in
# source; the placeholder preserves the original fallback when the var is unset.
openai.api_key = os.environ.get("OPENAI_API_KEY", "YOUR_OPENAI_API_KEY")
def fetch_company_pages(base_url):
    """Fetch descriptive text from a company's landing and "about" pages.

    Tries the base URL plus common about-page paths, preferring the meta
    description when present and falling back to paragraph text.

    Args:
        base_url: Root URL of the company site (no trailing slash expected).

    Returns:
        Concatenated descriptive text, or ``"Company information not found."``
        when no page yielded any content.
    """
    candidate_urls = [base_url, f"{base_url}/about", f"{base_url}/company", f"{base_url}/about-us"]
    collected = []
    for url in candidate_urls:
        try:
            response = requests.get(url, timeout=5)
            # Skip 404/500 responses instead of parsing their error bodies.
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            meta_desc = soup.find('meta', {'name': 'description'})
            # Use .get() so a <meta name="description"> tag without a content
            # attribute cannot raise KeyError (bug in the original subscript).
            meta_content = meta_desc.get('content') if meta_desc else None
            if meta_content:
                collected.append(meta_content)
            else:
                collected.append(" ".join(p.get_text() for p in soup.find_all('p')))
            # Stop once we have enough material for summarization/classification.
            if sum(len(part) for part in collected) > 300:
                break
        except Exception as e:
            # Best-effort crawl: one failing page must not abort the others.
            print(f"Error fetching from {url}: {e}")
    content = "".join(collected)
    return content or "Company information not found."
def classify_industry(description):
    """Classify a company description into a broad industry category.

    The original called the legacy ``openai.Completion`` endpoint with
    ``model="gpt-4"``, but gpt-4 is a chat model and is not served by the
    Completions API — every call would fail. Use the chat endpoint, which
    matches the pre-1.0 ``openai.api_key`` style this module already uses.

    Args:
        description: Free-form company description text.

    Returns:
        An industry label such as "Finance" or "Retail", whitespace-stripped.
    """
    prompt = f"Classify the following company description into an industry category (e.g., Finance, Retail, Technology, Healthcare, Automotive):\n\n{description}\n\nIndustry:"
    response = openai.ChatCompletion.create(
        model="gpt-4",
        messages=[{"role": "user", "content": prompt}],
        max_tokens=10,
    )
    return response.choices[0].message["content"].strip()
def detect_focus_areas(text):
    """Detect high-level business focus areas mentioned in *text*.

    The original scanned only NER entity tokens, but the focus keywords
    ("customer", "innovation", ...) are common nouns that a PER/ORG/LOC-style
    NER model almost never tags, so matches were vanishingly rare; the
    keyword "R&D" was also compared against lowercased tokens and could
    never match. Scanning the full lowercased text catches every mention
    the entity scan could have caught, and more.

    Args:
        text: Raw descriptive text about the company.

    Returns:
        List of matched focus-area names (dict insertion order); empty list
        when nothing matches.
    """
    area_patterns = {
        "customer experience": ["customer", "experience", "support", "satisfaction"],
        "operations": ["operations", "supply chain", "efficiency", "logistics", "processes"],
        "product innovation": ["innovation", "product", "development", "r&d"],
        "sustainability": ["sustainability", "environment", "eco", "carbon"],
    }
    lowered = text.lower()
    return [
        area
        for area, keywords in area_patterns.items()
        if any(keyword in lowered for keyword in keywords)
    ]
def summarize_mission(text):
    """Summarize mission-related text into a short sentence.

    Guards against empty or very short input, which the BART summarizer
    either errors on or pads with hallucinated text; such input is returned
    as-is (stripped).

    Args:
        text: Mission/company description text.

    Returns:
        A short summary string (10-30 tokens), or the stripped input when it
        is already shorter than the summarizer's minimum length.
    """
    cleaned = text.strip() if text else ""
    # Fewer words than the summary floor: summarizing would add nothing.
    if len(cleaned.split()) < 10:
        return cleaned
    return summarizer(cleaned, max_length=30, min_length=10, do_sample=False)[0]['summary_text']
def get_company_info(url):
    """Build a company profile (mission, industry, focus areas) from its site.

    Args:
        url: Base URL of the company website.

    Returns:
        Dict with keys ``"mission"``, ``"industry"`` and ``"focus_areas"``;
        a placeholder profile is returned when any step fails.
    """
    fallback = {
        "mission": "Mission not found",
        "industry": "Unknown",
        "focus_areas": [],
    }
    try:
        # Gather page text once, then derive each facet from it.
        page_text = fetch_company_pages(url)
        profile = {
            "mission": summarize_mission(page_text),
            "industry": classify_industry(page_text),
            "focus_areas": detect_focus_areas(page_text) or ["No specific focus areas identified"],
        }
    except Exception as exc:
        print(f"Error fetching company info: {exc}")
        return fallback
    return profile