adding files
Browse files- Dockerfile +11 -0
- app.py +718 -0
- requirements.txt +16 -0
Dockerfile
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
FROM python:3.9
|
2 |
+
RUN useradd -m -u 1000 user
|
3 |
+
USER user
|
4 |
+
ENV HOME=/home/user \
|
5 |
+
PATH=/home/user/.local/bin:$PATH
|
6 |
+
WORKDIR $HOME/app
|
7 |
+
COPY --chown=user . $HOME/app
|
8 |
+
COPY ./requirements.txt ~/app/requirements.txt
|
9 |
+
RUN pip install -r requirements.txt
|
10 |
+
COPY . .
|
11 |
+
CMD ["chainlit", "run", "app.py", "--port", "7860"]
|
app.py
ADDED
@@ -0,0 +1,718 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import chainlit as cl
|
3 |
+
import pandas as pd
|
4 |
+
from typing import List, Dict, Any, TypedDict, Callable, Annotated, Literal, Optional, Union, Tuple
|
5 |
+
from qdrant_client import QdrantClient
|
6 |
+
from qdrant_client.models import Distance, VectorParams, PointStruct
|
7 |
+
from langchain_core.messages import HumanMessage, AIMessage, SystemMessage
|
8 |
+
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
|
9 |
+
from langgraph.graph import StateGraph, END
|
10 |
+
from langchain.tools import Tool
|
11 |
+
from tavily import TavilyClient
|
12 |
+
from dotenv import load_dotenv
|
13 |
+
import json
|
14 |
+
import asyncio
|
15 |
+
import time
|
16 |
+
from functools import wraps
|
17 |
+
from pydantic import BaseModel, Field
|
18 |
+
from langchain_core.runnables import RunnableConfig
|
19 |
+
from langchain_core.runnables.utils import Output
|
20 |
+
from langchain_core.output_parsers import StrOutputParser
|
21 |
+
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
|
22 |
+
|
23 |
+
# Load environment variables
|
24 |
+
load_dotenv()
|
25 |
+
|
26 |
+
# Validate API keys
|
27 |
+
if not os.getenv("OPENAI_API_KEY"):
|
28 |
+
raise ValueError("OPENAI_API_KEY not found in environment variables. Please add it to your .env file.")
|
29 |
+
if not os.getenv("TAVILY_API_KEY"):
|
30 |
+
raise ValueError("TAVILY_API_KEY not found in environment variables. Please add it to your .env file.")
|
31 |
+
|
32 |
+
# Configuration
|
33 |
+
COLLECTION_NAME = "founders"
|
34 |
+
VECTOR_DIM = 1536 # OpenAI embedding dimension
|
35 |
+
EMBEDDING_MODEL = "text-embedding-3-small"
|
36 |
+
LLM_MODEL = "gpt-4o-mini"
|
37 |
+
MAX_RELEVANT_CHUNKS = 3
|
38 |
+
SIMILARITY_THRESHOLD = 0.75
|
39 |
+
DEFAULT_TIMEOUT = 60 # Default timeout in seconds
|
40 |
+
API_RATE_LIMIT_DELAY = 1 # Delay between API calls in seconds
|
41 |
+
|
42 |
+
# Define state structure for LangGraph
|
43 |
+
class FounderAnalysisState(TypedDict):
|
44 |
+
query: str
|
45 |
+
query_type: str # "search" or "filter"
|
46 |
+
filter_key: str
|
47 |
+
filter_value: str
|
48 |
+
retrieved_profiles: List[Dict[str, Any]]
|
49 |
+
web_search_results: List[Dict[str, Any]]
|
50 |
+
analysis_results: List[Dict[str, Any]]
|
51 |
+
final_response: Dict[str, Any]
|
52 |
+
error: str
|
53 |
+
|
54 |
+
# Decorator for adding timeouts to async functions
|
55 |
+
def async_timeout(timeout_seconds=DEFAULT_TIMEOUT):
|
56 |
+
def decorator(func):
|
57 |
+
@wraps(func)
|
58 |
+
async def wrapper(*args, **kwargs):
|
59 |
+
try:
|
60 |
+
return await asyncio.wait_for(func(*args, **kwargs), timeout=timeout_seconds)
|
61 |
+
except asyncio.TimeoutError:
|
62 |
+
# Create a meaningful timeout message
|
63 |
+
func_name = func.__name__
|
64 |
+
await cl.Message(content=f"β±οΈ Operation timed out: {func_name} took longer than {timeout_seconds} seconds").send()
|
65 |
+
# Return appropriate error state if the function was expecting to return a state
|
66 |
+
if "state" in kwargs:
|
67 |
+
return {**kwargs["state"], "error": f"Operation timed out after {timeout_seconds} seconds"}
|
68 |
+
raise
|
69 |
+
return wrapper
|
70 |
+
return decorator
|
71 |
+
|
72 |
+
# Rate limiter for API calls
|
73 |
+
async def rate_limit():
|
74 |
+
"""Simple rate limiter to prevent API throttling"""
|
75 |
+
await asyncio.sleep(API_RATE_LIMIT_DELAY)
|
76 |
+
|
77 |
+
class VectorStore:
|
78 |
+
def __init__(self):
|
79 |
+
self.client = QdrantClient(":memory:") # In-memory Qdrant instance
|
80 |
+
self._create_collection()
|
81 |
+
|
82 |
+
def _create_collection(self):
|
83 |
+
"""Create the founders collection if it doesn't exist."""
|
84 |
+
self.client.recreate_collection(
|
85 |
+
collection_name=COLLECTION_NAME,
|
86 |
+
vectors_config=VectorParams(size=VECTOR_DIM, distance=Distance.COSINE)
|
87 |
+
)
|
88 |
+
|
89 |
+
def upsert_profiles(self, embeddings: List[List[float]], metadata: List[Dict[str, Any]]):
|
90 |
+
"""Upsert founder profiles with their embeddings and metadata."""
|
91 |
+
points = [
|
92 |
+
PointStruct(
|
93 |
+
id=idx,
|
94 |
+
vector=embedding,
|
95 |
+
payload=metadata[idx]
|
96 |
+
)
|
97 |
+
for idx, embedding in enumerate(embeddings)
|
98 |
+
]
|
99 |
+
self.client.upsert(
|
100 |
+
collection_name=COLLECTION_NAME,
|
101 |
+
points=points
|
102 |
+
)
|
103 |
+
|
104 |
+
def search_profiles(self, query_vector: List[float], limit: int = 5) -> List[Dict[str, Any]]:
|
105 |
+
"""Search for similar profiles using the query vector."""
|
106 |
+
results = self.client.search(
|
107 |
+
collection_name=COLLECTION_NAME,
|
108 |
+
query_vector=query_vector,
|
109 |
+
limit=limit
|
110 |
+
)
|
111 |
+
return [hit.payload for hit in results]
|
112 |
+
|
113 |
+
def get_profile_by_metadata(self, metadata_key: str, metadata_value: Any) -> List[Dict[str, Any]]:
|
114 |
+
"""Retrieve profiles based on metadata filtering."""
|
115 |
+
from qdrant_client.http import models as rest
|
116 |
+
|
117 |
+
filter_condition = rest.Filter(
|
118 |
+
must=[
|
119 |
+
rest.FieldCondition(
|
120 |
+
key=metadata_key,
|
121 |
+
match=rest.MatchValue(value=metadata_value)
|
122 |
+
)
|
123 |
+
]
|
124 |
+
)
|
125 |
+
|
126 |
+
results = self.client.scroll(
|
127 |
+
collection_name=COLLECTION_NAME,
|
128 |
+
scroll_filter=filter_condition
|
129 |
+
)[0]
|
130 |
+
|
131 |
+
return [point.payload for point in results]
|
132 |
+
|
133 |
+
class FounderAnalysisSystem:
|
134 |
+
def __init__(self):
|
135 |
+
self.embeddings = OpenAIEmbeddings(model=EMBEDDING_MODEL)
|
136 |
+
self.vector_store = VectorStore()
|
137 |
+
self.llm = ChatOpenAI(model=LLM_MODEL, timeout=DEFAULT_TIMEOUT)
|
138 |
+
self.tavily_client = TavilyClient()
|
139 |
+
self.workflow = self._create_workflow()
|
140 |
+
self.progress_message = None
|
141 |
+
|
142 |
+
def _create_workflow(self) -> StateGraph:
|
143 |
+
"""Create the LangGraph workflow for founder analysis."""
|
144 |
+
# Create a new graph
|
145 |
+
workflow = StateGraph(FounderAnalysisState)
|
146 |
+
|
147 |
+
# Add nodes to the graph
|
148 |
+
workflow.add_node("process_query", self.process_query)
|
149 |
+
workflow.add_node("vector_search", self.vector_search)
|
150 |
+
workflow.add_node("filter_by_metadata", self.filter_by_metadata)
|
151 |
+
workflow.add_node("web_search", self.web_search)
|
152 |
+
workflow.add_node("analyze_profiles", self.analyze_profiles)
|
153 |
+
workflow.add_node("format_response", self.format_response)
|
154 |
+
|
155 |
+
# Add conditional edges
|
156 |
+
workflow.add_conditional_edges(
|
157 |
+
"process_query",
|
158 |
+
self.query_router,
|
159 |
+
{
|
160 |
+
"search": "vector_search",
|
161 |
+
"filter": "filter_by_metadata",
|
162 |
+
"error": END
|
163 |
+
}
|
164 |
+
)
|
165 |
+
|
166 |
+
# Add standard edges
|
167 |
+
workflow.add_edge("vector_search", "web_search")
|
168 |
+
workflow.add_edge("filter_by_metadata", "web_search")
|
169 |
+
workflow.add_edge("web_search", "analyze_profiles")
|
170 |
+
workflow.add_edge("analyze_profiles", "format_response")
|
171 |
+
workflow.add_edge("format_response", END)
|
172 |
+
|
173 |
+
# Set entry point
|
174 |
+
workflow.set_entry_point("process_query")
|
175 |
+
|
176 |
+
return workflow
|
177 |
+
|
178 |
+
async def update_progress(self, message, step, total_steps):
|
179 |
+
"""Update the progress message to show the system is still working"""
|
180 |
+
progress_text = f"β³ {message} (Step {step}/{total_steps})"
|
181 |
+
if self.progress_message is None:
|
182 |
+
self.progress_message = cl.Message(content=progress_text)
|
183 |
+
await self.progress_message.send()
|
184 |
+
else:
|
185 |
+
# Fix: Use update() without content parameter, then set content property
|
186 |
+
await self.progress_message.update()
|
187 |
+
self.progress_message.content = progress_text
|
188 |
+
|
189 |
+
@async_timeout(30) # 30 second timeout for query processing
|
190 |
+
async def process_query(self, state: FounderAnalysisState) -> FounderAnalysisState:
|
191 |
+
"""Process the user query and determine the query type."""
|
192 |
+
query = state["query"]
|
193 |
+
|
194 |
+
# Log the processing step
|
195 |
+
await self.update_progress("Processing your query...", 1, 5)
|
196 |
+
|
197 |
+
# Check if it's a filter command
|
198 |
+
if query.lower().startswith("filter:") or query.lower().startswith("filter "):
|
199 |
+
# Remove the filter prefix and trim whitespace
|
200 |
+
filter_text = query.replace("filter:", "").replace("filter ", "").strip()
|
201 |
+
|
202 |
+
# Check if there's a colon separator for key:value format
|
203 |
+
if ":" in filter_text:
|
204 |
+
parts = filter_text.split(":", 1)
|
205 |
+
filter_key, filter_value = parts
|
206 |
+
|
207 |
+
# Provide a helpful message if the filter value is empty
|
208 |
+
if not filter_value.strip():
|
209 |
+
return {
|
210 |
+
**state,
|
211 |
+
"error": f"Please provide a value to filter by. Example: filter:{filter_key}:value"
|
212 |
+
}
|
213 |
+
|
214 |
+
return {
|
215 |
+
**state,
|
216 |
+
"query_type": "filter",
|
217 |
+
"filter_key": filter_key.strip(),
|
218 |
+
"filter_value": filter_value.strip()
|
219 |
+
}
|
220 |
+
else:
|
221 |
+
# If no specific key is provided, search across all fields
|
222 |
+
filter_value = filter_text
|
223 |
+
|
224 |
+
# Provide a helpful message if the filter value is empty
|
225 |
+
if not filter_value.strip():
|
226 |
+
return {
|
227 |
+
**state,
|
228 |
+
"error": "Please provide a value to filter by. Example: filter:Location:San Francisco"
|
229 |
+
}
|
230 |
+
|
231 |
+
return {
|
232 |
+
**state,
|
233 |
+
"query_type": "filter",
|
234 |
+
"filter_key": "all_fields", # Special value to indicate searching across all fields
|
235 |
+
"filter_value": filter_value.strip()
|
236 |
+
}
|
237 |
+
else:
|
238 |
+
return {**state, "query_type": "search"}
|
239 |
+
|
240 |
+
def query_router(self, state: FounderAnalysisState) -> str:
|
241 |
+
"""Route to the appropriate node based on query type."""
|
242 |
+
if "error" in state and state["error"]:
|
243 |
+
return "error"
|
244 |
+
return state["query_type"]
|
245 |
+
|
246 |
+
@async_timeout(45) # 45 second timeout for vector search
|
247 |
+
async def vector_search(self, state: FounderAnalysisState) -> FounderAnalysisState:
|
248 |
+
"""Search for similar profiles using vector similarity."""
|
249 |
+
query = state["query"]
|
250 |
+
|
251 |
+
# Log the vector search step
|
252 |
+
await self.update_progress("Searching for relevant founder profiles...", 2, 5)
|
253 |
+
|
254 |
+
try:
|
255 |
+
# Convert query to embedding
|
256 |
+
query_embedding = self.embeddings.embed_query(query)
|
257 |
+
|
258 |
+
# Search for similar profiles
|
259 |
+
profiles = self.vector_store.search_profiles(query_embedding, limit=3)
|
260 |
+
|
261 |
+
if not profiles:
|
262 |
+
return {
|
263 |
+
**state,
|
264 |
+
"retrieved_profiles": [],
|
265 |
+
"error": "No matching profiles found."
|
266 |
+
}
|
267 |
+
|
268 |
+
return {**state, "retrieved_profiles": profiles}
|
269 |
+
except Exception as e:
|
270 |
+
return {**state, "error": f"Error during vector search: {str(e)}"}
|
271 |
+
|
272 |
+
@async_timeout(45) # 45 second timeout for metadata filtering
|
273 |
+
async def filter_by_metadata(self, state: FounderAnalysisState) -> FounderAnalysisState:
|
274 |
+
"""Filter profiles by metadata."""
|
275 |
+
filter_key = state["filter_key"]
|
276 |
+
filter_value = state["filter_value"]
|
277 |
+
|
278 |
+
# Log the filtering step
|
279 |
+
if filter_key == "all_fields":
|
280 |
+
await self.update_progress(f"Searching for '{filter_value}' across all profile fields...", 2, 5)
|
281 |
+
else:
|
282 |
+
await self.update_progress(f"Filtering profiles by {filter_key}: '{filter_value}'...", 2, 5)
|
283 |
+
|
284 |
+
try:
|
285 |
+
# Get all profiles first
|
286 |
+
from qdrant_client.http import models as rest
|
287 |
+
|
288 |
+
# Get all profiles from the collection
|
289 |
+
results = self.vector_store.client.scroll(
|
290 |
+
collection_name=COLLECTION_NAME,
|
291 |
+
limit=100 # Adjust this limit based on your expected dataset size
|
292 |
+
)[0]
|
293 |
+
|
294 |
+
all_profiles = [point.payload for point in results]
|
295 |
+
search_value = filter_value.lower()
|
296 |
+
|
297 |
+
# Perform flexible filtering in Python
|
298 |
+
filtered_profiles = []
|
299 |
+
|
300 |
+
# Special case for searching across all fields
|
301 |
+
if filter_key == "all_fields":
|
302 |
+
for profile in all_profiles:
|
303 |
+
# Search across all fields in the profile
|
304 |
+
for key, value in profile.items():
|
305 |
+
if value and search_value in str(value).lower():
|
306 |
+
filtered_profiles.append(profile)
|
307 |
+
break # Found a match, move to next profile
|
308 |
+
else:
|
309 |
+
# Regular field-specific search
|
310 |
+
for profile in all_profiles:
|
311 |
+
# Check if the key exists in the profile
|
312 |
+
if filter_key in profile:
|
313 |
+
profile_value = str(profile[filter_key]).lower()
|
314 |
+
|
315 |
+
# Check for partial match (case-insensitive)
|
316 |
+
if search_value in profile_value:
|
317 |
+
filtered_profiles.append(profile)
|
318 |
+
|
319 |
+
if not filtered_profiles:
|
320 |
+
if filter_key == "all_fields":
|
321 |
+
error_msg = f"No profiles found matching '{filter_value}' in any field"
|
322 |
+
else:
|
323 |
+
error_msg = f"No profiles found matching '{filter_value}' in {filter_key} field"
|
324 |
+
|
325 |
+
return {
|
326 |
+
**state,
|
327 |
+
"retrieved_profiles": [],
|
328 |
+
"error": error_msg
|
329 |
+
}
|
330 |
+
|
331 |
+
return {**state, "retrieved_profiles": filtered_profiles[:3]} # Limit to 3 profiles
|
332 |
+
except Exception as e:
|
333 |
+
return {**state, "error": f"Error during metadata filtering: {str(e)}"}
|
334 |
+
|
335 |
+
@async_timeout(90) # 90 second timeout for web search
|
336 |
+
async def web_search(self, state: FounderAnalysisState) -> FounderAnalysisState:
|
337 |
+
"""Gather additional information from web search."""
|
338 |
+
profiles = state["retrieved_profiles"]
|
339 |
+
|
340 |
+
if not profiles:
|
341 |
+
return {**state, "web_search_results": []}
|
342 |
+
|
343 |
+
await self.update_progress("Gathering additional information from web search...", 3, 5)
|
344 |
+
|
345 |
+
web_search_results = []
|
346 |
+
|
347 |
+
for i, profile in enumerate(profiles):
|
348 |
+
name = profile.get("Full Name", "")
|
349 |
+
position = profile.get("Current Position", "")
|
350 |
+
company = profile.get("Company", "")
|
351 |
+
|
352 |
+
# Update progress for each profile
|
353 |
+
await self.update_progress(f"Searching web for info about {name} ({i+1}/{len(profiles)})...", 3, 5)
|
354 |
+
|
355 |
+
search_query = f"{name} {position} {company}"
|
356 |
+
try:
|
357 |
+
results = self.tavily_client.search(
|
358 |
+
query=search_query,
|
359 |
+
search_depth="advanced"
|
360 |
+
).get("results", [])
|
361 |
+
|
362 |
+
web_search_results.append({
|
363 |
+
"profile_name": name,
|
364 |
+
"search_results": results
|
365 |
+
})
|
366 |
+
|
367 |
+
# Rate limit between API calls
|
368 |
+
if i < len(profiles) - 1:
|
369 |
+
await rate_limit()
|
370 |
+
|
371 |
+
except Exception as e:
|
372 |
+
await cl.Message(content=f"β οΈ Error searching for {name}: {str(e)}").send()
|
373 |
+
|
374 |
+
return {**state, "web_search_results": web_search_results}
|
375 |
+
|
376 |
+
@async_timeout(120) # 2 minute timeout for analysis
|
377 |
+
async def analyze_profiles(self, state: FounderAnalysisState) -> FounderAnalysisState:
|
378 |
+
"""Analyze profiles with additional context."""
|
379 |
+
profiles = state["retrieved_profiles"]
|
380 |
+
web_results = state["web_search_results"]
|
381 |
+
|
382 |
+
if not profiles:
|
383 |
+
return {**state, "analysis_results": []}
|
384 |
+
|
385 |
+
await self.update_progress("Analyzing profiles and generating recommendations...", 4, 5)
|
386 |
+
|
387 |
+
analysis_results = []
|
388 |
+
|
389 |
+
for i, profile in enumerate(profiles):
|
390 |
+
name = profile.get("Full Name", "")
|
391 |
+
|
392 |
+
# Find matching web results
|
393 |
+
additional_info = []
|
394 |
+
for result in web_results:
|
395 |
+
if result["profile_name"] == name:
|
396 |
+
additional_info = result["search_results"]
|
397 |
+
break
|
398 |
+
|
399 |
+
# Update progress for each profile
|
400 |
+
await self.update_progress(f"Analyzing profile for {name} ({i+1}/{len(profiles)})...", 4, 5)
|
401 |
+
|
402 |
+
# Extract social media and online presence
|
403 |
+
linkedin = profile.get("LinkedIn", "")
|
404 |
+
twitter = profile.get("Twitter", "")
|
405 |
+
website = profile.get("Website", "")
|
406 |
+
|
407 |
+
analysis_prompt = f"""
|
408 |
+
Based on the following founder profile and additional information, analyze what types of companies
|
409 |
+
this person would be best suited to found. Consider their experience, skills, background, and online presence.
|
410 |
+
|
411 |
+
Profile: {json.dumps(profile, indent=2)}
|
412 |
+
Additional Information: {json.dumps(additional_info, indent=2)}
|
413 |
+
|
414 |
+
Provide a detailed analysis including:
|
415 |
+
1. Recommended industry sectors based on their expertise and background
|
416 |
+
2. Type of company (B2B, B2C, etc.) that would align with their experience
|
417 |
+
3. Key strengths that would contribute to success as a founder
|
418 |
+
4. Potential challenges to consider based on their profile
|
419 |
+
5. How their network and online presence could benefit their venture
|
420 |
+
6. Specific opportunities or niches they might be well-positioned to address
|
421 |
+
|
422 |
+
Be specific and provide actionable insights based on the information available.
|
423 |
+
"""
|
424 |
+
|
425 |
+
try:
|
426 |
+
response = self.llm.invoke([HumanMessage(content=analysis_prompt)])
|
427 |
+
|
428 |
+
analysis_results.append({
|
429 |
+
"founder_name": name,
|
430 |
+
"analysis": response.content,
|
431 |
+
"profile": profile,
|
432 |
+
"additional_info": additional_info
|
433 |
+
})
|
434 |
+
|
435 |
+
# Rate limit between API calls
|
436 |
+
if i < len(profiles) - 1:
|
437 |
+
await rate_limit()
|
438 |
+
|
439 |
+
except Exception as e:
|
440 |
+
await cl.Message(content=f"β οΈ Error analyzing {name}: {str(e)}").send()
|
441 |
+
|
442 |
+
return {**state, "analysis_results": analysis_results}
|
443 |
+
|
444 |
+
@async_timeout(30) # 30 second timeout for formatting
|
445 |
+
async def format_response(self, state: FounderAnalysisState) -> FounderAnalysisState:
|
446 |
+
"""Format the final response for display."""
|
447 |
+
analysis_results = state["analysis_results"]
|
448 |
+
|
449 |
+
await self.update_progress("Formatting final results...", 5, 5)
|
450 |
+
|
451 |
+
# Clear the progress message
|
452 |
+
self.progress_message = None
|
453 |
+
|
454 |
+
if not analysis_results:
|
455 |
+
if "error" in state and state["error"]:
|
456 |
+
await cl.Message(content=f"β {state['error']}").send()
|
457 |
+
else:
|
458 |
+
await cl.Message(content="β No results to display.").send()
|
459 |
+
return {**state, "final_response": {"status": "error", "message": state.get("error", "No results")}}
|
460 |
+
|
461 |
+
for result in analysis_results:
|
462 |
+
founder_name = result["founder_name"]
|
463 |
+
profile = result["profile"]
|
464 |
+
analysis = result["analysis"]
|
465 |
+
|
466 |
+
# Build profile summary with basic information
|
467 |
+
profile_summary = f"""
|
468 |
+
π― Profile Summary:
|
469 |
+
|
470 |
+
- Name: {profile.get('Full Name', '')}
|
471 |
+
- Current Position: {profile.get('Current Position', '')}
|
472 |
+
- Company: {profile.get('Company', '')}
|
473 |
+
- Location: {profile.get('Location', '')}
|
474 |
+
"""
|
475 |
+
|
476 |
+
# Add LinkedIn profile with proper URL formatting
|
477 |
+
if profile.get('LinkedIn') and profile.get('LinkedIn').strip():
|
478 |
+
linkedin_url = profile.get('LinkedIn')
|
479 |
+
# Make sure the URL has the proper format
|
480 |
+
if not linkedin_url.startswith('http'):
|
481 |
+
linkedin_url = f"https://{linkedin_url}"
|
482 |
+
profile_summary += f"- LinkedIn: {linkedin_url}\n"
|
483 |
+
|
484 |
+
# Add any other social profiles or websites
|
485 |
+
if profile.get('Twitter') and profile.get('Twitter').strip():
|
486 |
+
twitter_url = profile.get('Twitter')
|
487 |
+
if not twitter_url.startswith('http'):
|
488 |
+
twitter_url = f"https://{twitter_url}"
|
489 |
+
profile_summary += f"- Twitter: {twitter_url}\n"
|
490 |
+
|
491 |
+
if profile.get('Website') and profile.get('Website').strip():
|
492 |
+
website_url = profile.get('Website')
|
493 |
+
if not website_url.startswith('http'):
|
494 |
+
website_url = f"https://{website_url}"
|
495 |
+
profile_summary += f"- Website: {website_url}\n"
|
496 |
+
|
497 |
+
# Format the analysis
|
498 |
+
analysis_text = f"""
|
499 |
+
π Analysis:
|
500 |
+
|
501 |
+
{analysis}
|
502 |
+
"""
|
503 |
+
|
504 |
+
# Create elements for structured display using Text instead of Markdown
|
505 |
+
elements = [
|
506 |
+
cl.Text(content=profile_summary),
|
507 |
+
cl.Text(content=analysis_text)
|
508 |
+
]
|
509 |
+
|
510 |
+
await cl.Message(
|
511 |
+
content=f"Analysis for {founder_name}:",
|
512 |
+
elements=elements
|
513 |
+
).send()
|
514 |
+
|
515 |
+
await cl.Message(content="β
Analysis complete!").send()
|
516 |
+
|
517 |
+
return {**state, "final_response": {"status": "success", "results": analysis_results}}
|
518 |
+
|
519 |
+
@async_timeout(120) # 2 minute timeout for loading profiles
|
520 |
+
async def load_profiles(self, file):
|
521 |
+
"""Load and embed founder profiles from uploaded CSV."""
|
522 |
+
# Read CSV file
|
523 |
+
df = pd.read_csv(file)
|
524 |
+
|
525 |
+
# Convert DataFrame rows to list of dictionaries
|
526 |
+
profiles = df.to_dict('records')
|
527 |
+
|
528 |
+
# Create more comprehensive text representations for embedding
|
529 |
+
texts = []
|
530 |
+
for p in profiles:
|
531 |
+
# Build a rich text representation including all available fields
|
532 |
+
text_parts = []
|
533 |
+
|
534 |
+
# Add core identity information
|
535 |
+
if p.get('Full Name'):
|
536 |
+
text_parts.append(f"Name: {p.get('Full Name')}")
|
537 |
+
|
538 |
+
if p.get('Current Position'):
|
539 |
+
text_parts.append(f"Position: {p.get('Current Position')}")
|
540 |
+
|
541 |
+
if p.get('Company'):
|
542 |
+
text_parts.append(f"Company: {p.get('Company')}")
|
543 |
+
|
544 |
+
if p.get('Location'):
|
545 |
+
text_parts.append(f"Location: {p.get('Location')}")
|
546 |
+
|
547 |
+
# Add contact and social media information
|
548 |
+
if p.get('LinkedIn'):
|
549 |
+
text_parts.append(f"LinkedIn: {p.get('LinkedIn')}")
|
550 |
+
|
551 |
+
if p.get('Twitter'):
|
552 |
+
text_parts.append(f"Twitter: {p.get('Twitter')}")
|
553 |
+
|
554 |
+
if p.get('Website'):
|
555 |
+
text_parts.append(f"Website: {p.get('Website')}")
|
556 |
+
|
557 |
+
if p.get('Email'):
|
558 |
+
text_parts.append(f"Email: {p.get('Email')}")
|
559 |
+
|
560 |
+
# Add detailed professional information
|
561 |
+
if p.get('About'):
|
562 |
+
text_parts.append(f"About: {p.get('About')}")
|
563 |
+
|
564 |
+
if p.get('Skills'):
|
565 |
+
text_parts.append(f"Skills: {p.get('Skills')}")
|
566 |
+
|
567 |
+
if p.get('Experience'):
|
568 |
+
text_parts.append(f"Experience: {p.get('Experience')}")
|
569 |
+
|
570 |
+
if p.get('Education'):
|
571 |
+
text_parts.append(f"Education: {p.get('Education')}")
|
572 |
+
|
573 |
+
# Add any industry or sector information
|
574 |
+
if p.get('Industry'):
|
575 |
+
text_parts.append(f"Industry: {p.get('Industry')}")
|
576 |
+
|
577 |
+
if p.get('Sector'):
|
578 |
+
text_parts.append(f"Sector: {p.get('Sector')}")
|
579 |
+
|
580 |
+
# Add any entrepreneurial information
|
581 |
+
if p.get('Previous Startups'):
|
582 |
+
text_parts.append(f"Previous Startups: {p.get('Previous Startups')}")
|
583 |
+
|
584 |
+
if p.get('Funding History'):
|
585 |
+
text_parts.append(f"Funding History: {p.get('Funding History')}")
|
586 |
+
|
587 |
+
# Add any additional fields that might be in the CSV
|
588 |
+
for key, value in p.items():
|
589 |
+
if (key not in ['Full Name', 'Current Position', 'Company', 'Location',
|
590 |
+
'LinkedIn', 'Twitter', 'Website', 'Email',
|
591 |
+
'About', 'Skills', 'Experience', 'Education',
|
592 |
+
'Industry', 'Sector', 'Previous Startups', 'Funding History']
|
593 |
+
and value and str(value).lower() != 'nan'):
|
594 |
+
text_parts.append(f"{key}: {value}")
|
595 |
+
|
596 |
+
# Join all parts with newlines for better separation
|
597 |
+
text = "\n".join(text_parts)
|
598 |
+
texts.append(text)
|
599 |
+
|
600 |
+
# Log the first few profiles to help with debugging
|
601 |
+
if len(texts) <= 3:
|
602 |
+
print(f"Profile {len(texts)} text representation:\n{text}\n")
|
603 |
+
|
604 |
+
# Generate embeddings
|
605 |
+
embeddings = self.embeddings.embed_documents(texts)
|
606 |
+
|
607 |
+
# Store in vector database
|
608 |
+
self.vector_store.upsert_profiles(embeddings, profiles)
|
609 |
+
|
610 |
+
return len(profiles)
|
611 |
+
|
612 |
+
@async_timeout(300) # 5 minute overall timeout for the entire process
|
613 |
+
async def process_message(self, query: str):
|
614 |
+
"""Process a user message through the workflow."""
|
615 |
+
# Reset progress message
|
616 |
+
self.progress_message = None
|
617 |
+
|
618 |
+
# Initialize the state
|
619 |
+
state = FounderAnalysisState(
|
620 |
+
query=query,
|
621 |
+
query_type="",
|
622 |
+
filter_key="",
|
623 |
+
filter_value="",
|
624 |
+
retrieved_profiles=[],
|
625 |
+
web_search_results=[],
|
626 |
+
analysis_results=[],
|
627 |
+
final_response={},
|
628 |
+
error=""
|
629 |
+
)
|
630 |
+
|
631 |
+
try:
|
632 |
+
# Manually execute the workflow nodes in sequence
|
633 |
+
# First process the query
|
634 |
+
state = await self.process_query(state)
|
635 |
+
|
636 |
+
# Route based on query type
|
637 |
+
next_node = self.query_router(state)
|
638 |
+
|
639 |
+
if next_node == "error":
|
640 |
+
await cl.Message(content=f"β {state['error']}").send()
|
641 |
+
return
|
642 |
+
|
643 |
+
# Execute the appropriate search method
|
644 |
+
if next_node == "search":
|
645 |
+
state = await self.vector_search(state)
|
646 |
+
elif next_node == "filter":
|
647 |
+
state = await self.filter_by_metadata(state)
|
648 |
+
|
649 |
+
# Check for errors after search
|
650 |
+
if state.get("error"):
|
651 |
+
await cl.Message(content=f"β {state['error']}").send()
|
652 |
+
return
|
653 |
+
|
654 |
+
# Continue with the rest of the workflow
|
655 |
+
state = await self.web_search(state)
|
656 |
+
state = await self.analyze_profiles(state)
|
657 |
+
state = await self.format_response(state)
|
658 |
+
|
659 |
+
except asyncio.TimeoutError:
|
660 |
+
await cl.Message(content="β The operation timed out. Please try a simpler query or try again later.").send()
|
661 |
+
except Exception as e:
|
662 |
+
await cl.Message(content=f"β Error processing request: {str(e)}").send()
|
663 |
+
|
664 |
+
# Initialize the system
|
665 |
+
system = FounderAnalysisSystem()
|
666 |
+
|
667 |
+
@cl.on_chat_start
|
668 |
+
async def start():
|
669 |
+
"""Initialize the chat session and prompt for CSV upload."""
|
670 |
+
await cl.Message(
|
671 |
+
content="π Welcome to the Founder Analysis System! Please upload your CSV file with founder profiles."
|
672 |
+
).send()
|
673 |
+
|
674 |
+
files = await cl.AskFileMessage(
|
675 |
+
content="Please upload your CSV file",
|
676 |
+
accept=["text/csv"],
|
677 |
+
max_size_mb=10
|
678 |
+
).send()
|
679 |
+
|
680 |
+
if not files:
|
681 |
+
await cl.Message(
|
682 |
+
content="No file was uploaded. Please try again."
|
683 |
+
).send()
|
684 |
+
return
|
685 |
+
|
686 |
+
file = files[0]
|
687 |
+
|
688 |
+
# Show loading message
|
689 |
+
msg = cl.Message(content=f"β³ Processing {file.name}...")
|
690 |
+
await msg.send()
|
691 |
+
|
692 |
+
try:
|
693 |
+
# Load the profiles with timeout
|
694 |
+
num_profiles = await asyncio.wait_for(system.load_profiles(file.path), timeout=120)
|
695 |
+
|
696 |
+
await cl.Message(
|
697 |
+
content=f"β
Successfully loaded {num_profiles} founder profiles!\n\n" +
|
698 |
+
"You can now:\n\n" +
|
699 |
+
"1. **Search for founders by expertise**:\n" +
|
700 |
+
" Example: `AI experts in healthcare`\n\n" +
|
701 |
+
"2. **Filter by specific fields**:\n" +
|
702 |
+
" Example: `filter:Location:San Francisco`\n" +
|
703 |
+
" Example: `filter:Skills:Machine Learning`\n\n" +
|
704 |
+
"3. **Search across all fields**:\n" +
|
705 |
+
" Example: `filter:Stanford`\n" +
|
706 |
+
" Example: `filter blockchain`\n\n" +
|
707 |
+
"4. **Get founder recommendations**:\n" +
|
708 |
+
" Example: `recommend founders for fintech startup`"
|
709 |
+
).send()
|
710 |
+
except asyncio.TimeoutError:
|
711 |
+
await cl.Message(content="β Loading profiles timed out. The CSV file might be too large or complex.").send()
|
712 |
+
except Exception as e:
|
713 |
+
await cl.Message(content=f"β Error loading profiles: {str(e)}").send()
|
714 |
+
|
715 |
+
@cl.on_message
|
716 |
+
async def main(message: cl.Message):
|
717 |
+
"""Handle user messages and provide responses."""
|
718 |
+
await system.process_message(message.content)
|
requirements.txt
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
chainlit>=0.7.700
|
2 |
+
langchain>=0.1.0
|
3 |
+
langchain-openai>=0.0.5
|
4 |
+
langgraph>=0.0.19
|
5 |
+
langchain-experimental>=0.0.49
|
6 |
+
qdrant-client>=1.7.0
|
7 |
+
pandas>=2.0.0
|
8 |
+
python-dotenv>=1.0.0
|
9 |
+
tavily-python>=0.3.0
|
10 |
+
openai>=1.12.0
|
11 |
+
pydantic>=2.0.0
|
12 |
+
numpy>=1.24.0
|
13 |
+
tqdm>=4.65.0
|
14 |
+
typing-extensions>=4.5.0
|
15 |
+
aiohttp>=3.9.0
|
16 |
+
langchain-core>=0.1.0
|