# Spaces: ngiometti / Midterm (Sleeping)
# File size: 30,086 Bytes

from __future__ import annotations
import os
import chainlit as cl
import pandas as pd
from typing import List, Dict, Any, TypedDict, Callable, Annotated, Literal, Optional, Union, Tuple, TypeVar
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct
from langchain_core.messages import HumanMessage, AIMessage, SystemMessage
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langgraph.graph import StateGraph, END
from langchain.tools import Tool
from tavily import TavilyClient
from dotenv import load_dotenv
import json
import asyncio
import time
from functools import wraps
from pydantic import BaseModel, Field
from langchain_core.runnables import RunnableConfig
from langchain_core.runnables.utils import Output
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from dataclasses import dataclass, field
from state import FounderAnalysisState

# Pull settings from a local .env file into the process environment.
load_dotenv()

# Fail fast at import time when a required API key is absent.
for _required_key in ("OPENAI_API_KEY", "TAVILY_API_KEY"):
    if not os.getenv(_required_key):
        raise ValueError(
            f"{_required_key} not found in environment variables. "
            "Please add it to your .env file."
        )
del _required_key  # loop helper only; keep the module namespace unchanged

# Configuration
COLLECTION_NAME = "founders"  # Qdrant collection that holds the profiles
VECTOR_DIM = 1536  # OpenAI embedding dimension
EMBEDDING_MODEL = "text-embedding-3-small"
LLM_MODEL = "gpt-4o-mini"
MAX_RELEVANT_CHUNKS = 3
SIMILARITY_THRESHOLD = 0.75
DEFAULT_TIMEOUT = 60  # Default timeout in seconds
API_RATE_LIMIT_DELAY = 1  # Delay between API calls in seconds

# Type variable for dict-shaped workflow state.
StateType = TypeVar("StateType", bound=Dict[str, Any])

# Decorator for adding timeouts to async functions
def async_timeout(timeout_seconds=DEFAULT_TIMEOUT):
    """Build a decorator that bounds an async function's runtime.

    The wrapped coroutine is awaited through ``asyncio.wait_for``. On timeout
    a Chainlit message naming the function is sent, and then:

    * if the call received a dict for a parameter named ``state`` (passed
      positionally or by keyword), a copy of that dict with an ``"error"``
      entry is returned so the caller can continue with an error state;
    * otherwise ``asyncio.TimeoutError`` is re-raised.

    Args:
        timeout_seconds: Maximum number of seconds to wait for the wrapped
            coroutine.

    Returns:
        A decorator for ``async def`` functions.
    """
    import inspect  # local import: only this decorator needs it

    def decorator(func):
        signature = inspect.signature(func)

        def _state_argument(args, kwargs):
            # Bind the actual call arguments to parameter names so that a
            # positional ``state`` (how the methods in this file are called)
            # is found as well as a keyword one.
            try:
                bound = signature.bind_partial(*args, **kwargs)
            except TypeError:
                return None
            candidate = bound.arguments.get("state")
            return candidate if isinstance(candidate, dict) else None

        @wraps(func)
        async def wrapper(*args, **kwargs):
            try:
                return await asyncio.wait_for(func(*args, **kwargs), timeout=timeout_seconds)
            except asyncio.TimeoutError:
                # Create a meaningful timeout message
                func_name = func.__name__
                await cl.Message(content=f"⏱️ Operation timed out: {func_name} took longer than {timeout_seconds} seconds").send()
                # Return an error state if the function was working on one
                state = _state_argument(args, kwargs)
                if state is not None:
                    return {**state, "error": f"Operation timed out after {timeout_seconds} seconds"}
                raise
        return wrapper
    return decorator

# Rate limiter for API calls
async def rate_limit():
    """Pause for ``API_RATE_LIMIT_DELAY`` seconds to space out API calls."""
    pause_seconds = API_RATE_LIMIT_DELAY
    await asyncio.sleep(pause_seconds)

class VectorStore:
    """Wrapper around an in-memory Qdrant collection of founder profiles."""

    def __init__(self):
        self.client = QdrantClient(":memory:")  # In-memory Qdrant instance
        self._create_collection()

    def _create_collection(self):
        """Create the founders collection, replacing any existing one.

        ``recreate_collection`` drops a collection of the same name before
        creating it, so this always starts from an empty collection.
        """
        self.client.recreate_collection(
            collection_name=COLLECTION_NAME,
            vectors_config=VectorParams(size=VECTOR_DIM, distance=Distance.COSINE)
        )

    def upsert_profiles(self, embeddings: List[List[float]], metadata: List[Dict[str, Any]]):
        """Upsert founder profiles with their embeddings and metadata.

        Point ids are the list positions (0, 1, 2, ...), so a later call
        overwrites the points stored at the same positions by an earlier one.

        Args:
            embeddings: One vector per profile.
            metadata: One payload dict per profile, in the same order as
                ``embeddings``.

        Raises:
            ValueError: If ``embeddings`` and ``metadata`` differ in length.
        """
        if len(embeddings) != len(metadata):
            raise ValueError(
                "embeddings and metadata must have the same length "
                f"(got {len(embeddings)} and {len(metadata)})"
            )
        if not embeddings:
            # Nothing to store; skip the client call entirely.
            return

        points = [
            PointStruct(id=idx, vector=vector, payload=payload)
            for idx, (vector, payload) in enumerate(zip(embeddings, metadata))
        ]
        self.client.upsert(
            collection_name=COLLECTION_NAME,
            points=points
        )

    def search_profiles(self, query_vector: List[float], limit: int = 5) -> List[Dict[str, Any]]:
        """Search for similar profiles using the query vector.

        Args:
            query_vector: Embedding of the search query.
            limit: Maximum number of hits to request.

        Returns:
            The payloads of the hits, in the order the client returned them
            (similarity scores are discarded).
        """
        results = self.client.search(
            collection_name=COLLECTION_NAME,
            query_vector=query_vector,
            limit=limit
        )
        return [hit.payload for hit in results]

    def get_profile_by_metadata(self, metadata_key: str, metadata_value: Any) -> List[Dict[str, Any]]:
        """Retrieve every profile whose payload field equals the given value.

        Args:
            metadata_key: Payload key to match on.
            metadata_value: Value the key must match exactly.

        Returns:
            Payloads of all matching points.
        """
        from qdrant_client.http import models as rest

        filter_condition = rest.Filter(
            must=[
                rest.FieldCondition(
                    key=metadata_key,
                    match=rest.MatchValue(value=metadata_value)
                )
            ]
        )

        # scroll() returns one page plus the offset of the next page; keep
        # going until the offset is None so results are not cut off at the
        # client's default page size.
        payloads: List[Dict[str, Any]] = []
        next_offset = None
        while True:
            points, next_offset = self.client.scroll(
                collection_name=COLLECTION_NAME,
                scroll_filter=filter_condition,
                offset=next_offset
            )
            payloads.extend(point.payload for point in points)
            if next_offset is None:
                break

        return payloads

class FounderAnalysisSystem:
    def __init__(self):
        """Create the embedding model, vector store, chat model, Tavily client and workflow graph."""
        # No progress message exists until update_progress() is first called.
        self.progress_message = None

        self.embeddings = OpenAIEmbeddings(model=EMBEDDING_MODEL)
        self.vector_store = VectorStore()
        self.llm = ChatOpenAI(model=LLM_MODEL, timeout=DEFAULT_TIMEOUT)
        self.tavily_client = TavilyClient()

        # Built last; the graph nodes are bound methods of this instance.
        self.workflow = self._create_workflow()

    def _create_workflow(self) -> StateGraph:
        """Assemble the LangGraph state graph for founder analysis.

        The graph starts at ``process_query``, branches through
        ``query_router`` to either ``vector_search`` or
        ``filter_by_metadata`` (or ends on error), and then runs
        ``web_search`` -> ``analyze_profiles`` -> ``format_response``.
        The graph is returned as built here; it is not compiled.
        """
        graph = StateGraph(dict)  # plain dict as the state type

        # Every node is backed by the method of the same name.
        node_names = (
            "process_query",
            "vector_search",
            "filter_by_metadata",
            "web_search",
            "analyze_profiles",
            "format_response",
        )
        for node_name in node_names:
            graph.add_node(node_name, getattr(self, node_name))

        # Branch after query processing based on the router's answer.
        routes = {
            "search": "vector_search",
            "filter": "filter_by_metadata",
            "error": END,
        }
        graph.add_conditional_edges("process_query", self.query_router, routes)

        # Linear tail of the pipeline.
        linear_edges = (
            ("vector_search", "web_search"),
            ("filter_by_metadata", "web_search"),
            ("web_search", "analyze_profiles"),
            ("analyze_profiles", "format_response"),
            ("format_response", END),
        )
        for source, target in linear_edges:
            graph.add_edge(source, target)

        graph.set_entry_point("process_query")
        return graph

    async def update_progress(self, message, step, total_steps):
        """Update the progress message to show the system is still working"""
        progress_text = f"⏳ {message} (Step {step}/{total_steps})"
        if self.progress_message is None:
            self.progress_message = cl.Message(content=progress_text)
            await self.progress_message.send()
        else:
            # Fix: Use update() without content parameter, then set content property
            await self.progress_message.update()
            self.progress_message.content = progress_text

    @async_timeout(30)  # 30 second timeout for query processing
    async def process_query(self, state: FounderAnalysisState) -> FounderAnalysisState:
        """Reset the workflow state and classify the user's query.

        A query starting with ``filter:`` or ``filter `` (any letter case) is
        a filter command, written either as ``filter:<key>:<value>`` or as
        ``filter <value>`` / ``filter:<value>`` to match across all fields.
        Anything else is a semantic search.

        Args:
            state: Workflow state; only its ``"query"`` entry is kept, every
                other field is reset. The passed dict is updated in place.

        Returns:
            A new state dict with ``"query_type"`` set to ``"search"`` or
            ``"filter"`` (plus ``"filter_key"`` / ``"filter_value"``), or
            with ``"error"`` set when a filter command has no value.
        """
        # Initialize state if needed
        if not isinstance(state, dict):
            state = {}

        state.update({
            "query": state.get("query") or "",  # tolerate a missing or None query
            "query_type": "",
            "filter_key": "",
            "filter_value": "",
            "retrieved_profiles": [],
            "web_search_results": [],
            "analysis_results": [],
            "final_response": {},
            "error": ""
        })

        query = state["query"]

        # Log the processing step
        await self.update_progress("Processing your query...", 1, 5)

        # Both accepted prefixes have the same length, so the prefix can be
        # cut off by position. Using str.replace() here would be
        # case-sensitive and would also delete later occurrences of
        # "filter " / "filter:" inside the value itself.
        prefix_length = len("filter:")
        if query[:prefix_length].lower() not in ("filter:", "filter "):
            return {**state, "query_type": "search"}

        filter_text = query[prefix_length:].strip()
        key_part, separator, value_part = filter_text.partition(":")

        if separator and key_part.strip():
            # key:value form
            filter_key = key_part.strip()
            filter_value = value_part.strip()

            # Provide a helpful message if the filter value is empty
            if not filter_value:
                return {
                    **state,
                    "error": f"Please provide a value to filter by. Example: filter:{filter_key}:value"
                }

            return {
                **state,
                "query_type": "filter",
                "filter_key": filter_key,
                "filter_value": filter_value
            }

        # No usable key (no colon, or nothing before it): search across all
        # fields with whatever text was supplied.
        filter_value = (value_part if separator else filter_text).strip()

        # Provide a helpful message if the filter value is empty
        if not filter_value:
            return {
                **state,
                "error": "Please provide a value to filter by. Example: filter:Location:San Francisco"
            }

        return {
            **state,
            "query_type": "filter",
            "filter_key": "all_fields",  # Special value to indicate searching across all fields
            "filter_value": filter_value
        }

    def query_router(self, state: FounderAnalysisState) -> str:
        """Choose the next workflow branch.

        Returns ``"error"`` when the state carries a non-empty ``"error"``
        entry, otherwise the state's ``"query_type"``.
        """
        has_error = bool(state.get("error"))
        return "error" if has_error else state["query_type"]

    @async_timeout(45)  # 45 second timeout for vector search
    async def vector_search(self, state: FounderAnalysisState) -> FounderAnalysisState:
        """Find the profiles most similar to the query text.

        Args:
            state: Workflow state; ``state["query"]`` is embedded and used as
                the search vector.

        Returns:
            A new state with ``"retrieved_profiles"`` filled in, or with
            ``"error"`` set when nothing matched or the lookup failed.
        """
        query = state["query"]

        # Log the vector search step
        await self.update_progress("Searching for relevant founder profiles...", 2, 5)

        try:
            # Use the async embedding call: the synchronous one would block
            # the event loop for the whole request, so the surrounding
            # timeout could not fire while it runs.
            query_embedding = await self.embeddings.aembed_query(query)

            # Search for similar profiles
            profiles = self.vector_store.search_profiles(query_embedding, limit=MAX_RELEVANT_CHUNKS)
        except Exception as e:
            return {**state, "error": f"Error during vector search: {str(e)}"}

        if not profiles:
            return {
                **state,
                "retrieved_profiles": [],
                "error": "No matching profiles found."
            }

        return {**state, "retrieved_profiles": profiles}

    @async_timeout(45)  # 45 second timeout for metadata filtering
    async def filter_by_metadata(self, state: FounderAnalysisState) -> FounderAnalysisState:
        """Select profiles whose fields contain the filter text.

        Matching is a case-insensitive substring test done in Python. With
        ``filter_key == "all_fields"`` every non-empty field of a profile is
        checked; otherwise only the named field is.

        Args:
            state: Workflow state providing ``"filter_key"`` and
                ``"filter_value"``.

        Returns:
            A new state with up to ``MAX_RELEVANT_CHUNKS`` matches in
            ``"retrieved_profiles"``, or with ``"error"`` set when nothing
            matched or the lookup failed.
        """
        filter_key = state["filter_key"]
        filter_value = state["filter_value"]

        # Log the filtering step
        if filter_key == "all_fields":
            await self.update_progress(f"Searching for '{filter_value}' across all profile fields...", 2, 5)
        else:
            await self.update_progress(f"Filtering profiles by {filter_key}: '{filter_value}'...", 2, 5)

        try:
            # Read the whole collection page by page. scroll() returns one
            # page plus the offset of the next one, so a single call would
            # silently ignore everything past the first page.
            all_profiles = []
            next_offset = None
            while True:
                points, next_offset = self.vector_store.client.scroll(
                    collection_name=COLLECTION_NAME,
                    limit=100,  # page size
                    offset=next_offset
                )
                all_profiles.extend(point.payload for point in points)
                if next_offset is None:
                    break

            search_value = filter_value.lower()

            if filter_key == "all_fields":
                # A profile matches if any non-empty field contains the text.
                filtered_profiles = [
                    profile
                    for profile in all_profiles
                    if any(
                        value and search_value in str(value).lower()
                        for value in profile.values()
                    )
                ]
            else:
                # Field-specific search: the key must exist in the profile.
                filtered_profiles = [
                    profile
                    for profile in all_profiles
                    if filter_key in profile
                    and search_value in str(profile[filter_key]).lower()
                ]

            if not filtered_profiles:
                if filter_key == "all_fields":
                    error_msg = f"No profiles found matching '{filter_value}' in any field"
                else:
                    error_msg = f"No profiles found matching '{filter_value}' in {filter_key} field"

                return {
                    **state,
                    "retrieved_profiles": [],
                    "error": error_msg
                }

            return {**state, "retrieved_profiles": filtered_profiles[:MAX_RELEVANT_CHUNKS]}
        except Exception as e:
            return {**state, "error": f"Error during metadata filtering: {str(e)}"}

    @async_timeout(90)  # 90 second timeout for web search
    async def web_search(self, state: FounderAnalysisState) -> FounderAnalysisState:
        """Gather additional information about each retrieved profile from the web.

        One Tavily search is run per profile, using its name, position and
        company as the query. A failed search is reported to the user and
        that profile simply gets no entry in the results.

        Args:
            state: Workflow state providing ``"retrieved_profiles"``.

        Returns:
            A new state whose ``"web_search_results"`` is a list of
            ``{"profile_name": ..., "search_results": ...}`` dicts.
        """
        profiles = state["retrieved_profiles"]

        if not profiles:
            return {**state, "web_search_results": []}

        await self.update_progress("Gathering additional information from web search...", 3, 5)

        web_search_results = []
        loop = asyncio.get_running_loop()

        for i, profile in enumerate(profiles):
            name = profile.get("Full Name", "")
            position = profile.get("Current Position", "")
            company = profile.get("Company", "")

            # Update progress for each profile
            await self.update_progress(f"Searching web for info about {name} ({i+1}/{len(profiles)})...", 3, 5)

            search_query = f"{name} {position} {company}"
            try:
                # The Tavily client call is synchronous; run it in the default
                # executor so the event loop stays responsive and the
                # surrounding timeout can actually fire.
                response = await loop.run_in_executor(
                    None,
                    lambda q=search_query: self.tavily_client.search(
                        query=q,
                        search_depth="advanced"
                    ),
                )
                results = response.get("results", [])

                web_search_results.append({
                    "profile_name": name,
                    "search_results": results
                })

                # Rate limit between API calls
                if i < len(profiles) - 1:
                    await rate_limit()

            except Exception as e:
                await cl.Message(content=f"⚠️ Error searching for {name}: {str(e)}").send()

        return {**state, "web_search_results": web_search_results}

    @async_timeout(120)  # 2 minute timeout for analysis
    async def analyze_profiles(self, state: FounderAnalysisState) -> FounderAnalysisState:
        """Ask the LLM for a founder-fit analysis of each retrieved profile.

        Each profile is sent to the chat model together with the web search
        results recorded under the same name. A failed analysis is reported
        to the user and that profile gets no entry in the results.

        Args:
            state: Workflow state providing ``"retrieved_profiles"`` and
                ``"web_search_results"``.

        Returns:
            A new state whose ``"analysis_results"`` is a list of dicts with
            ``"founder_name"``, ``"analysis"``, ``"profile"`` and
            ``"additional_info"``.
        """
        profiles = state["retrieved_profiles"]
        web_results = state["web_search_results"]

        if not profiles:
            return {**state, "analysis_results": []}

        await self.update_progress("Analyzing profiles and generating recommendations...", 4, 5)

        analysis_results = []

        for i, profile in enumerate(profiles):
            name = profile.get("Full Name", "")

            # First web result recorded for this name, or nothing.
            additional_info = next(
                (
                    result["search_results"]
                    for result in web_results
                    if result["profile_name"] == name
                ),
                [],
            )

            # Update progress for each profile
            await self.update_progress(f"Analyzing profile for {name} ({i+1}/{len(profiles)})...", 4, 5)

            analysis_prompt = f"""
            Based on the following founder profile and additional information, analyze what types of companies 
            this person would be best suited to found. Consider their experience, skills, background, and online presence.
            
            Profile: {json.dumps(profile, indent=2)}
            Additional Information: {json.dumps(additional_info, indent=2)}
            
            Provide a detailed analysis including:
            1. Recommended industry sectors based on their expertise and background
            2. Type of company (B2B, B2C, etc.) that would align with their experience
            3. Key strengths that would contribute to success as a founder
            4. Potential challenges to consider based on their profile
            5. How their network and online presence could benefit their venture
            6. Specific opportunities or niches they might be well-positioned to address
            
            Be specific and provide actionable insights based on the information available.
            """

            try:
                # Async model call: the synchronous invoke() would block the
                # event loop for the whole request, so the surrounding
                # timeout could not fire while it runs.
                response = await self.llm.ainvoke([HumanMessage(content=analysis_prompt)])

                analysis_results.append({
                    "founder_name": name,
                    "analysis": response.content,
                    "profile": profile,
                    "additional_info": additional_info
                })

                # Rate limit between API calls
                if i < len(profiles) - 1:
                    await rate_limit()

            except Exception as e:
                await cl.Message(content=f"⚠️ Error analyzing {name}: {str(e)}").send()

        return {**state, "analysis_results": analysis_results}

    @async_timeout(30)  # 30 second timeout for formatting
    async def format_response(self, state: FounderAnalysisState) -> FounderAnalysisState:
        """Send the analysis results to the user and record the final response.

        For every analysed founder one Chainlit message is sent with a
        profile summary and the analysis text as attached elements. When
        there are no results, the state's error (or a generic notice) is
        sent instead.

        Args:
            state: Workflow state providing ``"analysis_results"`` and,
                optionally, ``"error"``.

        Returns:
            A new state whose ``"final_response"`` has ``"status"``
            ``"success"`` (with ``"results"``) or ``"error"`` (with
            ``"message"``).
        """
        analysis_results = state["analysis_results"]

        await self.update_progress("Formatting final results...", 5, 5)

        # Clear the progress message
        self.progress_message = None

        if not analysis_results:
            error_text = state.get("error")
            if error_text:
                await cl.Message(content=f"❌ {error_text}").send()
            else:
                await cl.Message(content="❌ No results to display.").send()
            # `or` rather than a .get() default: the "error" key normally
            # exists with an empty string, which must also fall back.
            return {**state, "final_response": {"status": "error", "message": error_text or "No results"}}

        for result in analysis_results:
            founder_name = result["founder_name"]
            profile = result["profile"]
            analysis = result["analysis"]

            # Build profile summary with basic information
            profile_summary = f"""
            🎯 Profile Summary:
            
            - Name: {profile.get('Full Name', '')}
            - Current Position: {profile.get('Current Position', '')}
            - Company: {profile.get('Company', '')}
            - Location: {profile.get('Location', '')}
            """

            # Add social profiles / website with proper URL formatting
            for label in ('LinkedIn', 'Twitter', 'Website'):
                raw_link = profile.get(label)
                # Empty CSV cells can arrive as float NaN (truthy, but not a
                # string), so require an actual non-blank string.
                if not isinstance(raw_link, str) or not raw_link.strip():
                    continue
                link = raw_link.strip()
                # Make sure the URL has the proper format
                if not link.startswith('http'):
                    link = f"https://{link}"
                profile_summary += f"- {label}: {link}\n"

            # Format the analysis
            analysis_text = f"""
            📊 Analysis:
            
            {analysis}
            """

            # Create elements for structured display using Text instead of Markdown
            elements = [
                cl.Text(content=profile_summary),
                cl.Text(content=analysis_text)
            ]

            await cl.Message(
                content=f"Analysis for {founder_name}:",
                elements=elements
            ).send()

        await cl.Message(content="✅ Analysis complete!").send()

        return {**state, "final_response": {"status": "success", "results": analysis_results}}

    @async_timeout(120)  # 2 minute timeout for loading profiles
    async def load_profiles(self, file):
        """Load founder profiles from a CSV, embed them and store them.

        Each row becomes one profile dict (missing cells stored as ``None``).
        A text representation is built per profile from a fixed list of
        well-known columns followed by any other non-empty columns, embedded,
        and upserted into the vector store with the profile as payload.

        Args:
            file: Path or file-like object accepted by ``pandas.read_csv``.

        Returns:
            The number of profiles loaded (0 for a CSV without rows).
        """
        # Read CSV file
        df = pd.read_csv(file)

        # Convert rows to dicts. pandas reports empty cells as float NaN,
        # which is truthy; normalise them to None so the emptiness checks
        # below work and no "Name: nan" lines get embedded or stored.
        profiles = [
            {key: (None if pd.isna(value) else value) for key, value in record.items()}
            for record in df.to_dict('records')
        ]

        if not profiles:
            # Nothing to embed or store.
            return 0

        # (column name, label used in the text representation), in output order
        labelled_fields = (
            # Core identity information
            ('Full Name', 'Name'),
            ('Current Position', 'Position'),
            ('Company', 'Company'),
            ('Location', 'Location'),
            # Contact and social media information
            ('LinkedIn', 'LinkedIn'),
            ('Twitter', 'Twitter'),
            ('Website', 'Website'),
            ('Email', 'Email'),
            # Detailed professional information
            ('About', 'About'),
            ('Skills', 'Skills'),
            ('Experience', 'Experience'),
            ('Education', 'Education'),
            # Industry or sector information
            ('Industry', 'Industry'),
            ('Sector', 'Sector'),
            # Entrepreneurial information
            ('Previous Startups', 'Previous Startups'),
            ('Funding History', 'Funding History'),
        )
        known_columns = {column for column, _ in labelled_fields}

        # Create comprehensive text representations for embedding
        texts = []
        for p in profiles:
            text_parts = [
                f"{label}: {p.get(column)}"
                for column, label in labelled_fields
                if p.get(column)
            ]

            # Add any additional fields that might be in the CSV
            text_parts.extend(
                f"{key}: {value}"
                for key, value in p.items()
                if key not in known_columns
                and value
                and str(value).lower() != 'nan'
            )

            # Join all parts with newlines for better separation
            text = "\n".join(text_parts)
            texts.append(text)

            # Log the first few profiles to help with debugging
            if len(texts) <= 3:
                print(f"Profile {len(texts)} text representation:\n{text}\n")

        # Generate embeddings with the async API so the event loop is not
        # blocked and the surrounding timeout can fire.
        embeddings = await self.embeddings.aembed_documents(texts)

        # Store in vector database
        self.vector_store.upsert_profiles(embeddings, profiles)

        return len(profiles)

    @async_timeout(300)  # 5 minute overall timeout for the entire process
    async def process_message(self, query: str):
        """Run one user query through the pipeline stages in order.

        The stages are called directly: query processing, then either vector
        search or metadata filtering, then web search, analysis and
        formatting. Errors reported by the first two stages are sent to the
        user and stop the run; timeouts and other exceptions are reported as
        messages as well.
        """
        # Start every request without a progress message.
        self.progress_message = None

        # Fresh state for this request.
        state = dict(
            query=query,
            query_type="",
            filter_key="",
            filter_value="",
            retrieved_profiles=[],
            web_search_results=[],
            analysis_results=[],
            final_response={},
            error="",
        )

        # Retrieval stage for each route the router can return.
        retrieval_stages = {
            "search": self.vector_search,
            "filter": self.filter_by_metadata,
        }

        try:
            state = await self.process_query(state)

            route = self.query_router(state)
            if route == "error":
                await cl.Message(content=f"❌ {state['error']}").send()
                return

            retrieve = retrieval_stages.get(route)
            if retrieve is not None:
                state = await retrieve(state)

            # Stop if retrieval reported a problem.
            if state.get("error"):
                await cl.Message(content=f"❌ {state['error']}").send()
                return

            # Remaining stages run unconditionally, in this order.
            for stage in (self.web_search, self.analyze_profiles, self.format_response):
                state = await stage(state)

        except asyncio.TimeoutError:
            await cl.Message(content="❌ The operation timed out. Please try a simpler query or try again later.").send()
        except Exception as e:
            await cl.Message(content=f"❌ Error processing request: {str(e)}").send()

# Initialize the system
# One module-level instance, created at import time; both Chainlit handlers
# below (`start` and `main`) call into this same object.
# NOTE(review): the instance keeps `progress_message` and the vector store as
# instance state, so presumably concurrent chat sessions share them — confirm
# whether per-session state is needed.
system = FounderAnalysisSystem()

@cl.on_chat_start
async def start():
    """Greet the user, ask for a CSV of founder profiles and load it."""
    welcome = cl.Message(
        content="👋 Welcome to the Founder Analysis System! Please upload your CSV file with founder profiles."
    )
    await welcome.send()

    upload_request = cl.AskFileMessage(
        content="Please upload your CSV file",
        accept=["text/csv"],
        max_size_mb=10
    )
    files = await upload_request.send()

    # Nothing came back from the upload prompt.
    if not files:
        await cl.Message(
            content="No file was uploaded. Please try again."
        ).send()
        return

    uploaded = files[0]

    # Show loading message
    await cl.Message(content=f"⏳ Processing {uploaded.name}...").send()

    try:
        # Load the profiles with timeout
        num_profiles = await asyncio.wait_for(system.load_profiles(uploaded.path), timeout=120)

        usage_help = (
            f"✅ Successfully loaded {num_profiles} founder profiles!\n\n"
            "You can now:\n\n"
            "1. **Search for founders by expertise**:\n"
            "   Example: `AI experts in healthcare`\n\n"
            "2. **Filter by specific fields**:\n"
            "   Example: `filter:Location:San Francisco`\n"
            "   Example: `filter:Skills:Machine Learning`\n\n"
            "3. **Search across all fields**:\n"
            "   Example: `filter:Stanford`\n"
            "   Example: `filter blockchain`\n\n"
            "4. **Get founder recommendations**:\n"
            "   Example: `recommend founders for fintech startup`"
        )
        await cl.Message(content=usage_help).send()
    except asyncio.TimeoutError:
        await cl.Message(content="❌ Loading profiles timed out. The CSV file might be too large or complex.").send()
    except Exception as e:
        await cl.Message(content=f"❌ Error loading profiles: {str(e)}").send()

@cl.on_message
async def main(message: cl.Message):
    """Forward the text of an incoming chat message to the analysis system."""
    user_text = message.content
    await system.process_message(user_text)