Spaces:

ngiometti
/

Midterm

Sleeping

App Files Files Community

Midterm / app.py

ngiometti

fixing everything

9db0a76 12 days ago

raw

history blame contribute delete

30.1 kB

	from __future__ import annotations
	import os
	import chainlit as cl
	import pandas as pd
	from typing import List, Dict, Any, TypedDict, Callable, Annotated, Literal, Optional, Union, Tuple, TypeVar
	from qdrant_client import QdrantClient
	from qdrant_client.models import Distance, VectorParams, PointStruct
	from langchain_core.messages import HumanMessage, AIMessage, SystemMessage
	from langchain_openai import ChatOpenAI, OpenAIEmbeddings
	from langgraph.graph import StateGraph, END
	from langchain.tools import Tool
	from tavily import TavilyClient
	from dotenv import load_dotenv
	import json
	import asyncio
	import time
	from functools import wraps
	from pydantic import BaseModel, Field
	from langchain_core.runnables import RunnableConfig
	from langchain_core.runnables.utils import Output
	from langchain_core.output_parsers import StrOutputParser
	from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
	from dataclasses import dataclass, field
	from state import FounderAnalysisState

	# Read variables from a .env file (if any) into the process environment.
	load_dotenv()

	# Refuse to start when a required API key is missing or empty.
	if not os.environ.get("OPENAI_API_KEY"):
	    raise ValueError(
	        "OPENAI_API_KEY not found in environment variables. "
	        "Please add it to your .env file."
	    )
	if not os.environ.get("TAVILY_API_KEY"):
	    raise ValueError(
	        "TAVILY_API_KEY not found in environment variables. "
	        "Please add it to your .env file."
	    )

	# --- Configuration ---------------------------------------------------------

	# Vector store
	COLLECTION_NAME: str = "founders"
	VECTOR_DIM: int = 1536  # OpenAI embedding dimension

	# Models
	EMBEDDING_MODEL: str = "text-embedding-3-small"
	LLM_MODEL: str = "gpt-4o-mini"

	# Retrieval
	MAX_RELEVANT_CHUNKS: int = 3
	SIMILARITY_THRESHOLD: float = 0.75

	# Timing (both values are in seconds)
	DEFAULT_TIMEOUT: int = 60  # Default timeout for async operations
	API_RATE_LIMIT_DELAY: int = 1  # Pause inserted between consecutive API calls

	# Generic placeholder for dict-shaped workflow state
	StateType = TypeVar("StateType", bound=Dict[str, Any])

	# Decorator for adding timeouts to async functions
	def async_timeout(timeout_seconds=DEFAULT_TIMEOUT):
	    """Build a decorator that limits how long an async function may run.

	    Args:
	        timeout_seconds: Maximum number of seconds to wait for the wrapped
	            coroutine before giving up.

	    Returns:
	        A decorator. The decorated coroutine function forwards all positional
	        and keyword arguments unchanged. On timeout it sends a chat message
	        naming the function, then:

	        * returns ``{**state, "error": ...}`` if the call was made with a
	          ``state`` keyword argument, or
	        * re-raises ``asyncio.TimeoutError`` otherwise.
	    """
	    def decorator(func):
	        @wraps(func)
	        async def wrapper(*args, **kwargs):
	            try:
	                # wait_for cancels the inner coroutine once the deadline passes.
	                return await asyncio.wait_for(func(*args, **kwargs), timeout=timeout_seconds)
	            except asyncio.TimeoutError:
	                # Tell the user which operation ran out of time.
	                func_name = func.__name__
	                await cl.Message(
	                    content=f"⏱️ Operation timed out: {func_name} took longer than {timeout_seconds} seconds"
	                ).send()
	                # Hand back an error state when the caller supplied one by keyword;
	                # positional callers get the exception instead.
	                if "state" in kwargs:
	                    return {**kwargs["state"], "error": f"Operation timed out after {timeout_seconds} seconds"}
	                raise
	        return wrapper
	    return decorator

	# Spacing helper used between consecutive external API calls
	async def rate_limit():
	    """Sleep for ``API_RATE_LIMIT_DELAY`` seconds to avoid API throttling."""
	    pause_seconds = API_RATE_LIMIT_DELAY
	    await asyncio.sleep(pause_seconds)

	class VectorStore:
	    """Wrapper around an in-memory Qdrant collection holding founder profiles."""

	    def __init__(self):
	        self.client = QdrantClient(":memory:")  # In-memory Qdrant instance
	        self._create_collection()

	    def _create_collection(self):
	        """(Re)create the founders collection with cosine-distance vectors.

	        NOTE(review): this calls ``recreate_collection``, which by its name
	        replaces an existing collection rather than creating it only when
	        missing — confirm against the installed qdrant-client version.
	        """
	        self.client.recreate_collection(
	            collection_name=COLLECTION_NAME,
	            vectors_config=VectorParams(size=VECTOR_DIM, distance=Distance.COSINE)
	        )

	    def upsert_profiles(self, embeddings: List[List[float]], metadata: List[Dict[str, Any]]):
	        """Upsert founder profiles with their embeddings and metadata.

	        Args:
	            embeddings: One vector per profile.
	            metadata: One payload dict per profile, in the same order as
	                ``embeddings``.

	        Raises:
	            ValueError: If the two lists differ in length.
	        """
	        # Check the pairing up front: a short metadata list would otherwise
	        # fail midway with IndexError and a long one would be silently cut.
	        if len(embeddings) != len(metadata):
	            raise ValueError(
	                f"Got {len(embeddings)} embeddings but {len(metadata)} metadata entries; "
	                "they must be the same length."
	            )

	        # Point ids are the positions in the input lists.
	        points = [
	            PointStruct(id=idx, vector=vector, payload=payload)
	            for idx, (vector, payload) in enumerate(zip(embeddings, metadata))
	        ]
	        self.client.upsert(
	            collection_name=COLLECTION_NAME,
	            points=points
	        )

	    def search_profiles(self, query_vector: List[float], limit: int = 5) -> List[Dict[str, Any]]:
	        """Return the payloads of up to ``limit`` profiles nearest to ``query_vector``."""
	        results = self.client.search(
	            collection_name=COLLECTION_NAME,
	            query_vector=query_vector,
	            limit=limit
	        )
	        return [hit.payload for hit in results]

	    def get_profile_by_metadata(
	        self, metadata_key: str, metadata_value: Any, limit: int = 10
	    ) -> List[Dict[str, Any]]:
	        """Retrieve payloads whose ``metadata_key`` field equals ``metadata_value``.

	        Args:
	            metadata_key: Payload field to match on.
	            metadata_value: Exact value the field must have.
	            limit: Maximum number of profiles to return from the single page
	                that is fetched. NOTE(review): 10 is meant to mirror the
	                client's own default page size — confirm for the installed
	                qdrant-client version.

	        Returns:
	            The payload dicts of the matching points (first page only).
	        """
	        from qdrant_client.http import models as rest

	        filter_condition = rest.Filter(
	            must=[
	                rest.FieldCondition(
	                    key=metadata_key,
	                    match=rest.MatchValue(value=metadata_value)
	                )
	            ]
	        )

	        # scroll() returns a pair; only the first element (the points) is used.
	        results = self.client.scroll(
	            collection_name=COLLECTION_NAME,
	            scroll_filter=filter_condition,
	            limit=limit
	        )[0]

	        return [point.payload for point in results]

	class FounderAnalysisSystem:
	    """Founder-profile pipeline: load a CSV, retrieve profiles, enrich, analyze, display.

	    NOTE(review): ``progress_message`` lives on the instance, so every caller of
	    the same instance shares it — confirm that is acceptable if several chat
	    sessions can run at once.
	    """

	    # Command prefixes that switch a query from semantic search to filtering.
	    # Both have the same length, which process_query relies on when slicing.
	    _FILTER_PREFIXES = ("filter:", "filter ")

	    # (CSV column, label used in the embedded text), in embedding order.
	    _PROFILE_FIELDS = (
	        ("Full Name", "Name"),
	        ("Current Position", "Position"),
	        ("Company", "Company"),
	        ("Location", "Location"),
	        ("LinkedIn", "LinkedIn"),
	        ("Twitter", "Twitter"),
	        ("Website", "Website"),
	        ("Email", "Email"),
	        ("About", "About"),
	        ("Skills", "Skills"),
	        ("Experience", "Experience"),
	        ("Education", "Education"),
	        ("Industry", "Industry"),
	        ("Sector", "Sector"),
	        ("Previous Startups", "Previous Startups"),
	        ("Funding History", "Funding History"),
	    )

	    def __init__(self):
	        self.embeddings = OpenAIEmbeddings(model=EMBEDDING_MODEL)
	        self.vector_store = VectorStore()
	        self.llm = ChatOpenAI(model=LLM_MODEL, timeout=DEFAULT_TIMEOUT)
	        self.tavily_client = TavilyClient()
	        self.workflow = self._create_workflow()
	        self.progress_message = None

	    def _create_workflow(self) -> StateGraph:
	        """Create the LangGraph workflow for founder analysis.

	        The graph is returned as built (it is not compiled here), and
	        ``process_message`` calls the node methods directly instead of running it.
	        """
	        # Use a simple dict type for the state graph
	        workflow = StateGraph(dict)

	        # Add nodes to the graph
	        workflow.add_node("process_query", self.process_query)
	        workflow.add_node("vector_search", self.vector_search)
	        workflow.add_node("filter_by_metadata", self.filter_by_metadata)
	        workflow.add_node("web_search", self.web_search)
	        workflow.add_node("analyze_profiles", self.analyze_profiles)
	        workflow.add_node("format_response", self.format_response)

	        # Branch after query processing according to query_router's answer
	        workflow.add_conditional_edges(
	            "process_query",
	            self.query_router,
	            {
	                "search": "vector_search",
	                "filter": "filter_by_metadata",
	                "error": END
	            }
	        )

	        # Both retrieval paths converge on the same enrichment/analysis chain
	        workflow.add_edge("vector_search", "web_search")
	        workflow.add_edge("filter_by_metadata", "web_search")
	        workflow.add_edge("web_search", "analyze_profiles")
	        workflow.add_edge("analyze_profiles", "format_response")
	        workflow.add_edge("format_response", END)

	        workflow.set_entry_point("process_query")

	        return workflow

	    async def update_progress(self, message, step, total_steps):
	        """Show ``message`` as the current progress line.

	        The first call sends a new chat message; later calls rewrite that same
	        message in place.
	        """
	        progress_text = f"⏳ {message} (Step {step}/{total_steps})"
	        if self.progress_message is None:
	            self.progress_message = cl.Message(content=progress_text)
	            await self.progress_message.send()
	        else:
	            # Assign the new text BEFORE update(): update() pushes whatever the
	            # message currently holds, so the reverse order shows stale text.
	            self.progress_message.content = progress_text
	            await self.progress_message.update()

	    @async_timeout(30)  # 30 second timeout for query processing
	    async def process_query(self, state: FounderAnalysisState) -> FounderAnalysisState:
	        """Reset the working state and classify the query as search or filter.

	        Recognised forms (prefix is case-insensitive):

	        * ``filter:<key>:<value>`` / ``filter <key>:<value>`` — filter on one field
	        * ``filter:<value>`` / ``filter <value>`` — match across all fields
	        * anything else — semantic search

	        Returns:
	            A new state dict with ``query_type`` set (plus ``filter_key`` and
	            ``filter_value`` for filters), or with ``error`` set when a filter
	            command has no value.
	        """
	        # Initialize state if needed
	        if not isinstance(state, dict):
	            state = {}

	        state.update({
	            "query": state.get("query", ""),
	            "query_type": "",
	            "filter_key": "",
	            "filter_value": "",
	            "retrieved_profiles": [],
	            "web_search_results": [],
	            "analysis_results": [],
	            "final_response": {},
	            "error": ""
	        })

	        query = state["query"]

	        await self.update_progress("Processing your query...", 1, 5)

	        prefix = query[:len(self._FILTER_PREFIXES[0])].lower()
	        if prefix not in self._FILTER_PREFIXES:
	            return {**state, "query_type": "search"}

	        # Drop the prefix by position so that its casing does not matter and
	        # later occurrences of "filter" inside the value are left alone.
	        filter_text = query[len(prefix):].strip()

	        if ":" in filter_text:
	            # key:value form — split on the first colon only
	            filter_key, filter_value = filter_text.split(":", 1)

	            if not filter_value.strip():
	                return {
	                    **state,
	                    "error": f"Please provide a value to filter by. Example: filter:{filter_key}:value"
	                }

	            return {
	                **state,
	                "query_type": "filter",
	                "filter_key": filter_key.strip(),
	                "filter_value": filter_value.strip()
	            }

	        # No key given: search across all fields
	        if not filter_text:
	            return {
	                **state,
	                "error": "Please provide a value to filter by. Example: filter:Location:San Francisco"
	            }

	        return {
	            **state,
	            "query_type": "filter",
	            "filter_key": "all_fields",  # Special value meaning "any field"
	            "filter_value": filter_text
	        }

	    def query_router(self, state: FounderAnalysisState) -> str:
	        """Return ``"error"`` if the state carries an error, else its ``query_type``."""
	        if state.get("error"):
	            return "error"
	        return state["query_type"]

	    @async_timeout(45)  # 45 second timeout for vector search
	    async def vector_search(self, state: FounderAnalysisState) -> FounderAnalysisState:
	        """Retrieve the profiles most similar to the query text.

	        Returns:
	            The state with ``retrieved_profiles`` filled, or with ``error`` set
	            when nothing matched or the lookup raised.
	        """
	        query = state["query"]

	        await self.update_progress("Searching for relevant founder profiles...", 2, 5)

	        try:
	            # Use the async embedding call so the event loop (and with it the
	            # surrounding timeout) is not blocked while waiting on the API.
	            query_embedding = await self.embeddings.aembed_query(query)

	            profiles = self.vector_store.search_profiles(query_embedding, limit=MAX_RELEVANT_CHUNKS)

	            if not profiles:
	                return {
	                    **state,
	                    "retrieved_profiles": [],
	                    "error": "No matching profiles found."
	                }

	            return {**state, "retrieved_profiles": profiles}
	        except Exception as e:
	            return {**state, "error": f"Error during vector search: {str(e)}"}

	    @staticmethod
	    def _field_value(profile, field_name):
	        """Return ``profile[field_name]``, else a case-insensitive key match, else ``None``."""
	        if field_name in profile:
	            return profile[field_name]
	        wanted = field_name.lower()
	        for key, value in profile.items():
	            if str(key).lower() == wanted:
	                return value
	        return None

	    @async_timeout(45)  # 45 second timeout for metadata filtering
	    async def filter_by_metadata(self, state: FounderAnalysisState) -> FounderAnalysisState:
	        """Select profiles whose field(s) contain ``filter_value`` (case-insensitive).

	        With ``filter_key == "all_fields"`` every field is searched; otherwise
	        only the named field is, with the field name itself matched exactly
	        first and case-insensitively as a fallback.

	        Returns:
	            The state with at most ``MAX_RELEVANT_CHUNKS`` matches in
	            ``retrieved_profiles``, or with ``error`` set.
	        """
	        filter_key = state["filter_key"]
	        filter_value = state["filter_value"]

	        if filter_key == "all_fields":
	            await self.update_progress(f"Searching for '{filter_value}' across all profile fields...", 2, 5)
	        else:
	            await self.update_progress(f"Filtering profiles by {filter_key}: '{filter_value}'...", 2, 5)

	        try:
	            # Fetch one page of profiles and filter in Python for substring matching.
	            results = self.vector_store.client.scroll(
	                collection_name=COLLECTION_NAME,
	                limit=100  # Adjust this limit based on your expected dataset size
	            )[0]

	            all_profiles = [point.payload for point in results]
	            search_value = filter_value.lower()

	            if filter_key == "all_fields":
	                filtered_profiles = [
	                    profile for profile in all_profiles
	                    if any(value and search_value in str(value).lower() for value in profile.values())
	                ]
	            else:
	                filtered_profiles = []
	                for profile in all_profiles:
	                    value = self._field_value(profile, filter_key)
	                    if value is not None and search_value in str(value).lower():
	                        filtered_profiles.append(profile)

	            if not filtered_profiles:
	                if filter_key == "all_fields":
	                    error_msg = f"No profiles found matching '{filter_value}' in any field"
	                else:
	                    error_msg = f"No profiles found matching '{filter_value}' in {filter_key} field"

	                return {
	                    **state,
	                    "retrieved_profiles": [],
	                    "error": error_msg
	                }

	            return {**state, "retrieved_profiles": filtered_profiles[:MAX_RELEVANT_CHUNKS]}
	        except Exception as e:
	            return {**state, "error": f"Error during metadata filtering: {str(e)}"}

	    @async_timeout(90)  # 90 second timeout for web search
	    async def web_search(self, state: FounderAnalysisState) -> FounderAnalysisState:
	        """Run one Tavily search per retrieved profile.

	        A failed search is reported to the user and skipped; the remaining
	        profiles are still searched.

	        Returns:
	            The state with ``web_search_results`` set to a list of
	            ``{"profile_name", "search_results"}`` dicts.
	        """
	        profiles = state["retrieved_profiles"]

	        if not profiles:
	            return {**state, "web_search_results": []}

	        await self.update_progress("Gathering additional information from web search...", 3, 5)

	        # The Tavily client is synchronous; run it in a worker thread so the
	        # event loop stays responsive and the timeout can take effect.
	        search = cl.make_async(self.tavily_client.search)

	        web_search_results = []

	        for i, profile in enumerate(profiles):
	            name = profile.get("Full Name", "")
	            position = profile.get("Current Position", "")
	            company = profile.get("Company", "")

	            await self.update_progress(f"Searching web for info about {name} ({i+1}/{len(profiles)})...", 3, 5)

	            search_query = f"{name} {position} {company}"
	            try:
	                response = await search(query=search_query, search_depth="advanced")
	                results = response.get("results", [])

	                web_search_results.append({
	                    "profile_name": name,
	                    "search_results": results
	                })

	                # Rate limit between API calls
	                if i < len(profiles) - 1:
	                    await rate_limit()

	            except Exception as e:
	                await cl.Message(content=f"⚠️ Error searching for {name}: {str(e)}").send()

	        return {**state, "web_search_results": web_search_results}

	    @staticmethod
	    def _build_analysis_prompt(profile, additional_info):
	        """Compose the LLM prompt for one profile plus its web-search results."""
	        # Assembled line by line so the prompt carries no source-code indentation.
	        return (
	            "\n"
	            "Based on the following founder profile and additional information, analyze what types of companies\n"
	            "this person would be best suited to found. Consider their experience, skills, background, and online presence.\n"
	            "\n"
	            f"Profile: {json.dumps(profile, indent=2)}\n"
	            f"Additional Information: {json.dumps(additional_info, indent=2)}\n"
	            "\n"
	            "Provide a detailed analysis including:\n"
	            "1. Recommended industry sectors based on their expertise and background\n"
	            "2. Type of company (B2B, B2C, etc.) that would align with their experience\n"
	            "3. Key strengths that would contribute to success as a founder\n"
	            "4. Potential challenges to consider based on their profile\n"
	            "5. How their network and online presence could benefit their venture\n"
	            "6. Specific opportunities or niches they might be well-positioned to address\n"
	            "\n"
	            "Be specific and provide actionable insights based on the information available.\n"
	        )

	    @async_timeout(120)  # 2 minute timeout for analysis
	    async def analyze_profiles(self, state: FounderAnalysisState) -> FounderAnalysisState:
	        """Ask the LLM for a founder-fit analysis of each retrieved profile.

	        A profile whose analysis fails is reported to the user and skipped.

	        Returns:
	            The state with ``analysis_results`` set to a list of dicts holding
	            ``founder_name``, ``analysis``, ``profile`` and ``additional_info``.
	        """
	        profiles = state["retrieved_profiles"]
	        web_results = state["web_search_results"]

	        if not profiles:
	            return {**state, "analysis_results": []}

	        await self.update_progress("Analyzing profiles and generating recommendations...", 4, 5)

	        # Index web results by name; setdefault keeps the first entry per name.
	        results_by_name = {}
	        for result in web_results:
	            results_by_name.setdefault(result["profile_name"], result["search_results"])

	        analysis_results = []

	        for i, profile in enumerate(profiles):
	            name = profile.get("Full Name", "")
	            additional_info = results_by_name.get(name, [])

	            await self.update_progress(f"Analyzing profile for {name} ({i+1}/{len(profiles)})...", 4, 5)

	            try:
	                # Prompt building sits inside the try so that a profile that
	                # cannot be serialised is skipped like any other failure.
	                analysis_prompt = self._build_analysis_prompt(profile, additional_info)

	                # Async LLM call: keeps the event loop free while waiting.
	                response = await self.llm.ainvoke([HumanMessage(content=analysis_prompt)])

	                analysis_results.append({
	                    "founder_name": name,
	                    "analysis": response.content,
	                    "profile": profile,
	                    "additional_info": additional_info
	                })

	                # Rate limit between API calls
	                if i < len(profiles) - 1:
	                    await rate_limit()

	            except Exception as e:
	                await cl.Message(content=f"⚠️ Error analyzing {name}: {str(e)}").send()

	        return {**state, "analysis_results": analysis_results}

	    @staticmethod
	    def _as_url(value):
	        """Return ``value`` as a URL string, or ``""`` if it is not a non-blank string.

	        Values that do not already start with ``http`` get an ``https://`` prefix.
	        """
	        if not isinstance(value, str):
	            return ""
	        url = value.strip()
	        if not url:
	            return ""
	        return url if url.startswith("http") else f"https://{url}"

	    @async_timeout(30)  # 30 second timeout for formatting
	    async def format_response(self, state: FounderAnalysisState) -> FounderAnalysisState:
	        """Send one chat message per analysed founder, then a completion notice.

	        Returns:
	            The state with ``final_response`` set to a ``status``/``results``
	            dict on success or a ``status``/``message`` dict when there is
	            nothing to show.
	        """
	        analysis_results = state["analysis_results"]

	        await self.update_progress("Formatting final results...", 5, 5)

	        # Forget the progress message so the next request starts a new one
	        self.progress_message = None

	        if not analysis_results:
	            # `or` (not a .get default) so an empty error string also falls back.
	            error_text = state.get("error") or ""
	            if error_text:
	                await cl.Message(content=f"❌ {error_text}").send()
	            else:
	                await cl.Message(content="❌ No results to display.").send()
	            return {**state, "final_response": {"status": "error", "message": error_text or "No results"}}

	        for result in analysis_results:
	            founder_name = result["founder_name"]
	            profile = result["profile"]
	            analysis = result["analysis"]

	            # Basic identity lines
	            profile_summary = (
	                "\n"
	                "🎯 Profile Summary:\n"
	                "\n"
	                f"- Name: {profile.get('Full Name', '')}\n"
	                f"- Current Position: {profile.get('Current Position', '')}\n"
	                f"- Company: {profile.get('Company', '')}\n"
	                f"- Location: {profile.get('Location', '')}\n"
	            )

	            # Optional links; non-string or blank values are skipped so a
	            # missing CSV cell cannot break the whole response.
	            for link_field in ("LinkedIn", "Twitter", "Website"):
	                link_url = self._as_url(profile.get(link_field))
	                if link_url:
	                    profile_summary += f"- {link_field}: {link_url}\n"

	            analysis_text = (
	                "\n"
	                "📊 Analysis:\n"
	                "\n"
	                f"{analysis}\n"
	            )

	            # Create elements for structured display using Text instead of Markdown
	            elements = [
	                cl.Text(content=profile_summary),
	                cl.Text(content=analysis_text)
	            ]

	            await cl.Message(
	                content=f"Analysis for {founder_name}:",
	                elements=elements
	            ).send()

	        await cl.Message(content="✅ Analysis complete!").send()

	        return {**state, "final_response": {"status": "success", "results": analysis_results}}

	    @async_timeout(120)  # 2 minute timeout for loading profiles
	    async def load_profiles(self, file):
	        """Load founder profiles from a CSV, embed them, and store them.

	        Args:
	            file: Anything ``pandas.read_csv`` accepts (the caller in this file
	                passes a filesystem path).

	        Returns:
	            The number of profiles read from the CSV.
	        """
	        df = pd.read_csv(file)

	        # Empty CSV cells arrive as NaN, which is truthy and prints as "nan".
	        # Replace them with "" so they are neither embedded, matched by
	        # filters, nor mistaken for real text later on.
	        profiles = [
	            {key: ("" if pd.isna(value) else value) for key, value in record.items()}
	            for record in df.to_dict('records')
	        ]

	        if not profiles:
	            # Nothing to embed or store.
	            return 0

	        known_columns = {column for column, _ in self._PROFILE_FIELDS}

	        texts = []
	        for p in profiles:
	            # Well-known fields first, in a fixed order, with friendly labels
	            text_parts = [
	                f"{label}: {p.get(column)}"
	                for column, label in self._PROFILE_FIELDS
	                if p.get(column)
	            ]

	            # Then any additional columns the CSV happens to contain
	            text_parts.extend(
	                f"{key}: {value}"
	                for key, value in p.items()
	                if key not in known_columns and value and str(value).lower() != 'nan'
	            )

	            # Join all parts with newlines for better separation
	            text = "\n".join(text_parts)
	            texts.append(text)

	            # Log the first few profiles to help with debugging
	            if len(texts) <= 3:
	                print(f"Profile {len(texts)} text representation:\n{text}\n")

	        # Async embedding call: keeps the event loop free while waiting.
	        embeddings = await self.embeddings.aembed_documents(texts)

	        # Store in vector database
	        self.vector_store.upsert_profiles(embeddings, profiles)

	        return len(profiles)

	    @async_timeout(300)  # 5 minute overall timeout for the entire process
	    async def process_message(self, query: str):
	        """Run one user query through every stage and report the outcome in chat.

	        The stages are called directly, in order: ``process_query``, then
	        ``vector_search`` or ``filter_by_metadata``, then ``web_search``,
	        ``analyze_profiles`` and ``format_response``. An error from the first
	        two stages is shown to the user and ends processing.
	        """
	        # Reset progress message
	        self.progress_message = None

	        state = {
	            "query": query,
	            "query_type": "",
	            "filter_key": "",
	            "filter_value": "",
	            "retrieved_profiles": [],
	            "web_search_results": [],
	            "analysis_results": [],
	            "final_response": {},
	            "error": ""
	        }

	        try:
	            state = await self.process_query(state)

	            # Route based on query type
	            next_node = self.query_router(state)

	            if next_node == "error":
	                await cl.Message(content=f"❌ {state['error']}").send()
	                return

	            if next_node == "search":
	                state = await self.vector_search(state)
	            elif next_node == "filter":
	                state = await self.filter_by_metadata(state)

	            # Stop if retrieval failed or found nothing
	            if state.get("error"):
	                await cl.Message(content=f"❌ {state['error']}").send()
	                return

	            state = await self.web_search(state)
	            state = await self.analyze_profiles(state)
	            state = await self.format_response(state)

	        except asyncio.TimeoutError:
	            await cl.Message(content="❌ The operation timed out. Please try a simpler query or try again later.").send()
	        except Exception as e:
	            await cl.Message(content=f"❌ Error processing request: {str(e)}").send()

	# Build the analysis system once, at import time. The chat handlers defined
	# below all use this single module-level instance.
	system = FounderAnalysisSystem()

	@cl.on_chat_start
	async def start():
	    """Greet the user, ask for a CSV of founder profiles, and index it."""
	    greeting = "👋 Welcome to the Founder Analysis System! Please upload your CSV file with founder profiles."
	    await cl.Message(content=greeting).send()

	    upload_prompt = cl.AskFileMessage(
	        content="Please upload your CSV file",
	        accept=["text/csv"],
	        max_size_mb=10
	    )
	    uploads = await upload_prompt.send()

	    # Nothing came back from the prompt: tell the user and stop.
	    if not uploads:
	        await cl.Message(content="No file was uploaded. Please try again.").send()
	        return

	    uploaded = uploads[0]

	    # Let the user know work has started
	    loading_notice = cl.Message(content=f"⏳ Processing {uploaded.name}...")
	    await loading_notice.send()

	    try:
	        # Embed and store the profiles, giving up after two minutes
	        num_profiles = await asyncio.wait_for(system.load_profiles(uploaded.path), timeout=120)

	        usage_text = (
	            f"✅ Successfully loaded {num_profiles} founder profiles!\n\n"
	            "You can now:\n\n"
	            "1. Search for founders by expertise:\n"
	            " Example: `AI experts in healthcare`\n\n"
	            "2. Filter by specific fields:\n"
	            " Example: `filter:Location:San Francisco`\n"
	            " Example: `filter:Skills:Machine Learning`\n\n"
	            "3. Search across all fields:\n"
	            " Example: `filter:Stanford`\n"
	            " Example: `filter blockchain`\n\n"
	            "4. Get founder recommendations:\n"
	            " Example: `recommend founders for fintech startup`"
	        )
	        await cl.Message(content=usage_text).send()
	    except asyncio.TimeoutError:
	        await cl.Message(content="❌ Loading profiles timed out. The CSV file might be too large or complex.").send()
	    except Exception as e:
	        await cl.Message(content=f"❌ Error loading profiles: {str(e)}").send()

	@cl.on_message
	async def main(message: cl.Message):
	    """Pass the text of an incoming chat message to the analysis system."""
	    user_text = message.content
	    await system.process_message(user_text)