# Hugging Face Spaces status banner ("Spaces: Sleeping") captured when this
# file was exported from the Space page; not part of the program.
import re

import openai
import requests
from bs4 import BeautifulSoup
from transformers import pipeline
# Summarization model used to distill mission statements.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
# Named-entity recognizer used when scanning for focus-area keywords.
ner_pipeline = pipeline("ner", model="dslim/bert-base-NER")

# OpenAI credentials for GPT-based industry classification.
# NOTE(review): hard-coded placeholder key — load from an environment variable in production.
openai.api_key = "YOUR_OPENAI_API_KEY"
def fetch_company_pages(base_url):
    """Fetch descriptive text about a company from its website.

    Tries the landing page plus common "about" pages, preferring the
    meta-description tag and falling back to visible paragraph text.
    Stops once enough content (> 300 chars) has been collected.

    Args:
        base_url: Root URL of the company site (no trailing slash expected).

    Returns:
        Collected descriptive text, or the fallback string
        "Company information not found." when nothing could be fetched.
    """
    # Common locations for mission / company info.
    pages = [base_url, f"{base_url}/about", f"{base_url}/company", f"{base_url}/about-us"]
    content = ""
    for url in pages:
        try:
            response = requests.get(url, timeout=5)
            # Skip 404/500 responses instead of scraping their error pages.
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            meta_desc = soup.find('meta', {'name': 'description'})
            # Guard with .get(): a <meta name="description"> tag without a
            # content attribute would otherwise raise KeyError.
            if meta_desc and meta_desc.get('content'):
                main_text = meta_desc['content']
            else:
                main_text = " ".join(p.get_text() for p in soup.find_all('p'))
            # Separator so text from consecutive pages doesn't run together.
            content += main_text + " "
            # Stop once we have substantial content.
            if len(content) > 300:
                break
        except Exception as e:
            # Best-effort: a failed page should not abort the remaining ones.
            print(f"Error fetching from {url}: {e}")
    return content.strip() or "Company information not found."
def classify_industry(description):
    """Classify a company description into an industry label via GPT-4.

    Args:
        description: Free-text company description.

    Returns:
        A short industry name (e.g. "Finance", "Retail") as a stripped string.

    Note:
        GPT-4 is a chat-only model; the legacy ``openai.Completion`` endpoint
        rejects it, so the chat-completions endpoint is used instead.
    """
    prompt = f"Classify the following company description into an industry category (e.g., Finance, Retail, Technology, Healthcare, Automotive):\n\n{description}\n\nIndustry:"
    response = openai.ChatCompletion.create(
        model="gpt-4",
        messages=[{"role": "user", "content": prompt}],
        max_tokens=10,
    )
    return response.choices[0].message.content.strip()
def detect_focus_areas(text):
    """Identify company focus areas from named entities recognized in *text*.

    Runs the module-level NER pipeline and matches each recognized entity
    token against keyword lists for known focus areas.

    Args:
        text: Company description text.

    Returns:
        List of matched focus-area labels (order unspecified; may be empty).
    """
    entities = ner_pipeline(text)
    focus_areas = set()
    # Keyword patterns per focus area.
    # NOTE(review): multi-word keywords like "supply chain" cannot match a
    # single NER token; consider matching against the full text — TODO confirm.
    area_patterns = {
        "customer experience": ["customer", "experience", "support", "satisfaction"],
        "operations": ["operations", "supply chain", "efficiency", "logistics", "processes"],
        "product innovation": ["innovation", "product", "development", "R&D"],
        "sustainability": ["sustainability", "environment", "eco", "carbon"]
    }
    for entity in entities:
        word = entity['word'].lower()
        for area, keywords in area_patterns.items():
            # Lowercase each keyword too: "R&D" could never match an
            # already-lowercased entity token in the original comparison.
            if any(keyword.lower() in word for keyword in keywords):
                focus_areas.add(area)
    return list(focus_areas)
def summarize_mission(text):
    """Condense *text* into a short mission statement.

    Uses the module-level BART summarizer with deterministic decoding
    (do_sample=False) and a 10-30 token output budget.

    Args:
        text: Raw company/mission text.

    Returns:
        The generated summary string.
    """
    results = summarizer(text, max_length=30, min_length=10, do_sample=False)
    return results[0]['summary_text']
def get_company_info(url):
    """Build a company profile (mission, industry, focus areas) from a website.

    Pipeline: fetch page content, summarize the mission, classify the
    industry with GPT, and detect focus areas via NER. Any failure along
    the way yields a placeholder profile instead of raising.

    Args:
        url: Base URL of the company website.

    Returns:
        Dict with keys "mission", "industry" and "focus_areas".
    """
    try:
        # Fetch and process company page content, then derive each field.
        content = fetch_company_pages(url)
        return {
            "mission": summarize_mission(content),
            "industry": classify_industry(content),
            "focus_areas": detect_focus_areas(content) or ["No specific focus areas identified"],
        }
    except Exception as e:
        # Degrade gracefully: callers always receive a well-formed dict.
        print(f"Error fetching company info: {e}")
        return {
            "mission": "Mission not found",
            "industry": "Unknown",
            "focus_areas": []
        }