ngiometti commited on
Commit
0885c85
Β·
1 Parent(s): 3de41a7

adding files

Browse files
Files changed (3) hide show
  1. Dockerfile +11 -0
  2. app.py +718 -0
  3. requirements.txt +16 -0
Dockerfile ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.9
2
+ RUN useradd -m -u 1000 user
3
+ USER user
4
+ ENV HOME=/home/user \
5
+ PATH=/home/user/.local/bin:$PATH
6
+ WORKDIR $HOME/app
7
+ COPY --chown=user . $HOME/app
8
+ COPY ./requirements.txt ~/app/requirements.txt
9
+ RUN pip install -r requirements.txt
10
+ COPY . .
11
+ CMD ["chainlit", "run", "app.py", "--port", "7860"]
app.py ADDED
@@ -0,0 +1,718 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import chainlit as cl
3
+ import pandas as pd
4
+ from typing import List, Dict, Any, TypedDict, Callable, Annotated, Literal, Optional, Union, Tuple
5
+ from qdrant_client import QdrantClient
6
+ from qdrant_client.models import Distance, VectorParams, PointStruct
7
+ from langchain_core.messages import HumanMessage, AIMessage, SystemMessage
8
+ from langchain_openai import ChatOpenAI, OpenAIEmbeddings
9
+ from langgraph.graph import StateGraph, END
10
+ from langchain.tools import Tool
11
+ from tavily import TavilyClient
12
+ from dotenv import load_dotenv
13
+ import json
14
+ import asyncio
15
+ import time
16
+ from functools import wraps
17
+ from pydantic import BaseModel, Field
18
+ from langchain_core.runnables import RunnableConfig
19
+ from langchain_core.runnables.utils import Output
20
+ from langchain_core.output_parsers import StrOutputParser
21
+ from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
22
+
23
+ # Load environment variables
24
+ load_dotenv()
25
+
26
+ # Validate API keys
27
+ if not os.getenv("OPENAI_API_KEY"):
28
+ raise ValueError("OPENAI_API_KEY not found in environment variables. Please add it to your .env file.")
29
+ if not os.getenv("TAVILY_API_KEY"):
30
+ raise ValueError("TAVILY_API_KEY not found in environment variables. Please add it to your .env file.")
31
+
32
+ # Configuration
33
+ COLLECTION_NAME = "founders"
34
+ VECTOR_DIM = 1536 # OpenAI embedding dimension
35
+ EMBEDDING_MODEL = "text-embedding-3-small"
36
+ LLM_MODEL = "gpt-4o-mini"
37
+ MAX_RELEVANT_CHUNKS = 3
38
+ SIMILARITY_THRESHOLD = 0.75
39
+ DEFAULT_TIMEOUT = 60 # Default timeout in seconds
40
+ API_RATE_LIMIT_DELAY = 1 # Delay between API calls in seconds
41
+
42
+ # Define state structure for LangGraph
43
+ class FounderAnalysisState(TypedDict):
44
+ query: str
45
+ query_type: str # "search" or "filter"
46
+ filter_key: str
47
+ filter_value: str
48
+ retrieved_profiles: List[Dict[str, Any]]
49
+ web_search_results: List[Dict[str, Any]]
50
+ analysis_results: List[Dict[str, Any]]
51
+ final_response: Dict[str, Any]
52
+ error: str
53
+
54
+ # Decorator for adding timeouts to async functions
55
+ def async_timeout(timeout_seconds=DEFAULT_TIMEOUT):
56
+ def decorator(func):
57
+ @wraps(func)
58
+ async def wrapper(*args, **kwargs):
59
+ try:
60
+ return await asyncio.wait_for(func(*args, **kwargs), timeout=timeout_seconds)
61
+ except asyncio.TimeoutError:
62
+ # Create a meaningful timeout message
63
+ func_name = func.__name__
64
+ await cl.Message(content=f"⏱️ Operation timed out: {func_name} took longer than {timeout_seconds} seconds").send()
65
+ # Return appropriate error state if the function was expecting to return a state
66
+ if "state" in kwargs:
67
+ return {**kwargs["state"], "error": f"Operation timed out after {timeout_seconds} seconds"}
68
+ raise
69
+ return wrapper
70
+ return decorator
71
+
72
+ # Rate limiter for API calls
73
+ async def rate_limit():
74
+ """Simple rate limiter to prevent API throttling"""
75
+ await asyncio.sleep(API_RATE_LIMIT_DELAY)
76
+
77
+ class VectorStore:
78
+ def __init__(self):
79
+ self.client = QdrantClient(":memory:") # In-memory Qdrant instance
80
+ self._create_collection()
81
+
82
+ def _create_collection(self):
83
+ """Create the founders collection if it doesn't exist."""
84
+ self.client.recreate_collection(
85
+ collection_name=COLLECTION_NAME,
86
+ vectors_config=VectorParams(size=VECTOR_DIM, distance=Distance.COSINE)
87
+ )
88
+
89
+ def upsert_profiles(self, embeddings: List[List[float]], metadata: List[Dict[str, Any]]):
90
+ """Upsert founder profiles with their embeddings and metadata."""
91
+ points = [
92
+ PointStruct(
93
+ id=idx,
94
+ vector=embedding,
95
+ payload=metadata[idx]
96
+ )
97
+ for idx, embedding in enumerate(embeddings)
98
+ ]
99
+ self.client.upsert(
100
+ collection_name=COLLECTION_NAME,
101
+ points=points
102
+ )
103
+
104
+ def search_profiles(self, query_vector: List[float], limit: int = 5) -> List[Dict[str, Any]]:
105
+ """Search for similar profiles using the query vector."""
106
+ results = self.client.search(
107
+ collection_name=COLLECTION_NAME,
108
+ query_vector=query_vector,
109
+ limit=limit
110
+ )
111
+ return [hit.payload for hit in results]
112
+
113
+ def get_profile_by_metadata(self, metadata_key: str, metadata_value: Any) -> List[Dict[str, Any]]:
114
+ """Retrieve profiles based on metadata filtering."""
115
+ from qdrant_client.http import models as rest
116
+
117
+ filter_condition = rest.Filter(
118
+ must=[
119
+ rest.FieldCondition(
120
+ key=metadata_key,
121
+ match=rest.MatchValue(value=metadata_value)
122
+ )
123
+ ]
124
+ )
125
+
126
+ results = self.client.scroll(
127
+ collection_name=COLLECTION_NAME,
128
+ scroll_filter=filter_condition
129
+ )[0]
130
+
131
+ return [point.payload for point in results]
132
+
133
+ class FounderAnalysisSystem:
134
+ def __init__(self):
135
+ self.embeddings = OpenAIEmbeddings(model=EMBEDDING_MODEL)
136
+ self.vector_store = VectorStore()
137
+ self.llm = ChatOpenAI(model=LLM_MODEL, timeout=DEFAULT_TIMEOUT)
138
+ self.tavily_client = TavilyClient()
139
+ self.workflow = self._create_workflow()
140
+ self.progress_message = None
141
+
142
+ def _create_workflow(self) -> StateGraph:
143
+ """Create the LangGraph workflow for founder analysis."""
144
+ # Create a new graph
145
+ workflow = StateGraph(FounderAnalysisState)
146
+
147
+ # Add nodes to the graph
148
+ workflow.add_node("process_query", self.process_query)
149
+ workflow.add_node("vector_search", self.vector_search)
150
+ workflow.add_node("filter_by_metadata", self.filter_by_metadata)
151
+ workflow.add_node("web_search", self.web_search)
152
+ workflow.add_node("analyze_profiles", self.analyze_profiles)
153
+ workflow.add_node("format_response", self.format_response)
154
+
155
+ # Add conditional edges
156
+ workflow.add_conditional_edges(
157
+ "process_query",
158
+ self.query_router,
159
+ {
160
+ "search": "vector_search",
161
+ "filter": "filter_by_metadata",
162
+ "error": END
163
+ }
164
+ )
165
+
166
+ # Add standard edges
167
+ workflow.add_edge("vector_search", "web_search")
168
+ workflow.add_edge("filter_by_metadata", "web_search")
169
+ workflow.add_edge("web_search", "analyze_profiles")
170
+ workflow.add_edge("analyze_profiles", "format_response")
171
+ workflow.add_edge("format_response", END)
172
+
173
+ # Set entry point
174
+ workflow.set_entry_point("process_query")
175
+
176
+ return workflow
177
+
178
+ async def update_progress(self, message, step, total_steps):
179
+ """Update the progress message to show the system is still working"""
180
+ progress_text = f"⏳ {message} (Step {step}/{total_steps})"
181
+ if self.progress_message is None:
182
+ self.progress_message = cl.Message(content=progress_text)
183
+ await self.progress_message.send()
184
+ else:
185
+ # Fix: Use update() without content parameter, then set content property
186
+ await self.progress_message.update()
187
+ self.progress_message.content = progress_text
188
+
189
+ @async_timeout(30) # 30 second timeout for query processing
190
+ async def process_query(self, state: FounderAnalysisState) -> FounderAnalysisState:
191
+ """Process the user query and determine the query type."""
192
+ query = state["query"]
193
+
194
+ # Log the processing step
195
+ await self.update_progress("Processing your query...", 1, 5)
196
+
197
+ # Check if it's a filter command
198
+ if query.lower().startswith("filter:") or query.lower().startswith("filter "):
199
+ # Remove the filter prefix and trim whitespace
200
+ filter_text = query.replace("filter:", "").replace("filter ", "").strip()
201
+
202
+ # Check if there's a colon separator for key:value format
203
+ if ":" in filter_text:
204
+ parts = filter_text.split(":", 1)
205
+ filter_key, filter_value = parts
206
+
207
+ # Provide a helpful message if the filter value is empty
208
+ if not filter_value.strip():
209
+ return {
210
+ **state,
211
+ "error": f"Please provide a value to filter by. Example: filter:{filter_key}:value"
212
+ }
213
+
214
+ return {
215
+ **state,
216
+ "query_type": "filter",
217
+ "filter_key": filter_key.strip(),
218
+ "filter_value": filter_value.strip()
219
+ }
220
+ else:
221
+ # If no specific key is provided, search across all fields
222
+ filter_value = filter_text
223
+
224
+ # Provide a helpful message if the filter value is empty
225
+ if not filter_value.strip():
226
+ return {
227
+ **state,
228
+ "error": "Please provide a value to filter by. Example: filter:Location:San Francisco"
229
+ }
230
+
231
+ return {
232
+ **state,
233
+ "query_type": "filter",
234
+ "filter_key": "all_fields", # Special value to indicate searching across all fields
235
+ "filter_value": filter_value.strip()
236
+ }
237
+ else:
238
+ return {**state, "query_type": "search"}
239
+
240
+ def query_router(self, state: FounderAnalysisState) -> str:
241
+ """Route to the appropriate node based on query type."""
242
+ if "error" in state and state["error"]:
243
+ return "error"
244
+ return state["query_type"]
245
+
246
+ @async_timeout(45) # 45 second timeout for vector search
247
+ async def vector_search(self, state: FounderAnalysisState) -> FounderAnalysisState:
248
+ """Search for similar profiles using vector similarity."""
249
+ query = state["query"]
250
+
251
+ # Log the vector search step
252
+ await self.update_progress("Searching for relevant founder profiles...", 2, 5)
253
+
254
+ try:
255
+ # Convert query to embedding
256
+ query_embedding = self.embeddings.embed_query(query)
257
+
258
+ # Search for similar profiles
259
+ profiles = self.vector_store.search_profiles(query_embedding, limit=3)
260
+
261
+ if not profiles:
262
+ return {
263
+ **state,
264
+ "retrieved_profiles": [],
265
+ "error": "No matching profiles found."
266
+ }
267
+
268
+ return {**state, "retrieved_profiles": profiles}
269
+ except Exception as e:
270
+ return {**state, "error": f"Error during vector search: {str(e)}"}
271
+
272
+ @async_timeout(45) # 45 second timeout for metadata filtering
273
+ async def filter_by_metadata(self, state: FounderAnalysisState) -> FounderAnalysisState:
274
+ """Filter profiles by metadata."""
275
+ filter_key = state["filter_key"]
276
+ filter_value = state["filter_value"]
277
+
278
+ # Log the filtering step
279
+ if filter_key == "all_fields":
280
+ await self.update_progress(f"Searching for '{filter_value}' across all profile fields...", 2, 5)
281
+ else:
282
+ await self.update_progress(f"Filtering profiles by {filter_key}: '{filter_value}'...", 2, 5)
283
+
284
+ try:
285
+ # Get all profiles first
286
+ from qdrant_client.http import models as rest
287
+
288
+ # Get all profiles from the collection
289
+ results = self.vector_store.client.scroll(
290
+ collection_name=COLLECTION_NAME,
291
+ limit=100 # Adjust this limit based on your expected dataset size
292
+ )[0]
293
+
294
+ all_profiles = [point.payload for point in results]
295
+ search_value = filter_value.lower()
296
+
297
+ # Perform flexible filtering in Python
298
+ filtered_profiles = []
299
+
300
+ # Special case for searching across all fields
301
+ if filter_key == "all_fields":
302
+ for profile in all_profiles:
303
+ # Search across all fields in the profile
304
+ for key, value in profile.items():
305
+ if value and search_value in str(value).lower():
306
+ filtered_profiles.append(profile)
307
+ break # Found a match, move to next profile
308
+ else:
309
+ # Regular field-specific search
310
+ for profile in all_profiles:
311
+ # Check if the key exists in the profile
312
+ if filter_key in profile:
313
+ profile_value = str(profile[filter_key]).lower()
314
+
315
+ # Check for partial match (case-insensitive)
316
+ if search_value in profile_value:
317
+ filtered_profiles.append(profile)
318
+
319
+ if not filtered_profiles:
320
+ if filter_key == "all_fields":
321
+ error_msg = f"No profiles found matching '{filter_value}' in any field"
322
+ else:
323
+ error_msg = f"No profiles found matching '{filter_value}' in {filter_key} field"
324
+
325
+ return {
326
+ **state,
327
+ "retrieved_profiles": [],
328
+ "error": error_msg
329
+ }
330
+
331
+ return {**state, "retrieved_profiles": filtered_profiles[:3]} # Limit to 3 profiles
332
+ except Exception as e:
333
+ return {**state, "error": f"Error during metadata filtering: {str(e)}"}
334
+
335
+ @async_timeout(90) # 90 second timeout for web search
336
+ async def web_search(self, state: FounderAnalysisState) -> FounderAnalysisState:
337
+ """Gather additional information from web search."""
338
+ profiles = state["retrieved_profiles"]
339
+
340
+ if not profiles:
341
+ return {**state, "web_search_results": []}
342
+
343
+ await self.update_progress("Gathering additional information from web search...", 3, 5)
344
+
345
+ web_search_results = []
346
+
347
+ for i, profile in enumerate(profiles):
348
+ name = profile.get("Full Name", "")
349
+ position = profile.get("Current Position", "")
350
+ company = profile.get("Company", "")
351
+
352
+ # Update progress for each profile
353
+ await self.update_progress(f"Searching web for info about {name} ({i+1}/{len(profiles)})...", 3, 5)
354
+
355
+ search_query = f"{name} {position} {company}"
356
+ try:
357
+ results = self.tavily_client.search(
358
+ query=search_query,
359
+ search_depth="advanced"
360
+ ).get("results", [])
361
+
362
+ web_search_results.append({
363
+ "profile_name": name,
364
+ "search_results": results
365
+ })
366
+
367
+ # Rate limit between API calls
368
+ if i < len(profiles) - 1:
369
+ await rate_limit()
370
+
371
+ except Exception as e:
372
+ await cl.Message(content=f"⚠️ Error searching for {name}: {str(e)}").send()
373
+
374
+ return {**state, "web_search_results": web_search_results}
375
+
376
+ @async_timeout(120) # 2 minute timeout for analysis
377
+ async def analyze_profiles(self, state: FounderAnalysisState) -> FounderAnalysisState:
378
+ """Analyze profiles with additional context."""
379
+ profiles = state["retrieved_profiles"]
380
+ web_results = state["web_search_results"]
381
+
382
+ if not profiles:
383
+ return {**state, "analysis_results": []}
384
+
385
+ await self.update_progress("Analyzing profiles and generating recommendations...", 4, 5)
386
+
387
+ analysis_results = []
388
+
389
+ for i, profile in enumerate(profiles):
390
+ name = profile.get("Full Name", "")
391
+
392
+ # Find matching web results
393
+ additional_info = []
394
+ for result in web_results:
395
+ if result["profile_name"] == name:
396
+ additional_info = result["search_results"]
397
+ break
398
+
399
+ # Update progress for each profile
400
+ await self.update_progress(f"Analyzing profile for {name} ({i+1}/{len(profiles)})...", 4, 5)
401
+
402
+ # Extract social media and online presence
403
+ linkedin = profile.get("LinkedIn", "")
404
+ twitter = profile.get("Twitter", "")
405
+ website = profile.get("Website", "")
406
+
407
+ analysis_prompt = f"""
408
+ Based on the following founder profile and additional information, analyze what types of companies
409
+ this person would be best suited to found. Consider their experience, skills, background, and online presence.
410
+
411
+ Profile: {json.dumps(profile, indent=2)}
412
+ Additional Information: {json.dumps(additional_info, indent=2)}
413
+
414
+ Provide a detailed analysis including:
415
+ 1. Recommended industry sectors based on their expertise and background
416
+ 2. Type of company (B2B, B2C, etc.) that would align with their experience
417
+ 3. Key strengths that would contribute to success as a founder
418
+ 4. Potential challenges to consider based on their profile
419
+ 5. How their network and online presence could benefit their venture
420
+ 6. Specific opportunities or niches they might be well-positioned to address
421
+
422
+ Be specific and provide actionable insights based on the information available.
423
+ """
424
+
425
+ try:
426
+ response = self.llm.invoke([HumanMessage(content=analysis_prompt)])
427
+
428
+ analysis_results.append({
429
+ "founder_name": name,
430
+ "analysis": response.content,
431
+ "profile": profile,
432
+ "additional_info": additional_info
433
+ })
434
+
435
+ # Rate limit between API calls
436
+ if i < len(profiles) - 1:
437
+ await rate_limit()
438
+
439
+ except Exception as e:
440
+ await cl.Message(content=f"⚠️ Error analyzing {name}: {str(e)}").send()
441
+
442
+ return {**state, "analysis_results": analysis_results}
443
+
444
+ @async_timeout(30) # 30 second timeout for formatting
445
+ async def format_response(self, state: FounderAnalysisState) -> FounderAnalysisState:
446
+ """Format the final response for display."""
447
+ analysis_results = state["analysis_results"]
448
+
449
+ await self.update_progress("Formatting final results...", 5, 5)
450
+
451
+ # Clear the progress message
452
+ self.progress_message = None
453
+
454
+ if not analysis_results:
455
+ if "error" in state and state["error"]:
456
+ await cl.Message(content=f"❌ {state['error']}").send()
457
+ else:
458
+ await cl.Message(content="❌ No results to display.").send()
459
+ return {**state, "final_response": {"status": "error", "message": state.get("error", "No results")}}
460
+
461
+ for result in analysis_results:
462
+ founder_name = result["founder_name"]
463
+ profile = result["profile"]
464
+ analysis = result["analysis"]
465
+
466
+ # Build profile summary with basic information
467
+ profile_summary = f"""
468
+ 🎯 Profile Summary:
469
+
470
+ - Name: {profile.get('Full Name', '')}
471
+ - Current Position: {profile.get('Current Position', '')}
472
+ - Company: {profile.get('Company', '')}
473
+ - Location: {profile.get('Location', '')}
474
+ """
475
+
476
+ # Add LinkedIn profile with proper URL formatting
477
+ if profile.get('LinkedIn') and profile.get('LinkedIn').strip():
478
+ linkedin_url = profile.get('LinkedIn')
479
+ # Make sure the URL has the proper format
480
+ if not linkedin_url.startswith('http'):
481
+ linkedin_url = f"https://{linkedin_url}"
482
+ profile_summary += f"- LinkedIn: {linkedin_url}\n"
483
+
484
+ # Add any other social profiles or websites
485
+ if profile.get('Twitter') and profile.get('Twitter').strip():
486
+ twitter_url = profile.get('Twitter')
487
+ if not twitter_url.startswith('http'):
488
+ twitter_url = f"https://{twitter_url}"
489
+ profile_summary += f"- Twitter: {twitter_url}\n"
490
+
491
+ if profile.get('Website') and profile.get('Website').strip():
492
+ website_url = profile.get('Website')
493
+ if not website_url.startswith('http'):
494
+ website_url = f"https://{website_url}"
495
+ profile_summary += f"- Website: {website_url}\n"
496
+
497
+ # Format the analysis
498
+ analysis_text = f"""
499
+ πŸ“Š Analysis:
500
+
501
+ {analysis}
502
+ """
503
+
504
+ # Create elements for structured display using Text instead of Markdown
505
+ elements = [
506
+ cl.Text(content=profile_summary),
507
+ cl.Text(content=analysis_text)
508
+ ]
509
+
510
+ await cl.Message(
511
+ content=f"Analysis for {founder_name}:",
512
+ elements=elements
513
+ ).send()
514
+
515
+ await cl.Message(content="βœ… Analysis complete!").send()
516
+
517
+ return {**state, "final_response": {"status": "success", "results": analysis_results}}
518
+
519
+ @async_timeout(120) # 2 minute timeout for loading profiles
520
+ async def load_profiles(self, file):
521
+ """Load and embed founder profiles from uploaded CSV."""
522
+ # Read CSV file
523
+ df = pd.read_csv(file)
524
+
525
+ # Convert DataFrame rows to list of dictionaries
526
+ profiles = df.to_dict('records')
527
+
528
+ # Create more comprehensive text representations for embedding
529
+ texts = []
530
+ for p in profiles:
531
+ # Build a rich text representation including all available fields
532
+ text_parts = []
533
+
534
+ # Add core identity information
535
+ if p.get('Full Name'):
536
+ text_parts.append(f"Name: {p.get('Full Name')}")
537
+
538
+ if p.get('Current Position'):
539
+ text_parts.append(f"Position: {p.get('Current Position')}")
540
+
541
+ if p.get('Company'):
542
+ text_parts.append(f"Company: {p.get('Company')}")
543
+
544
+ if p.get('Location'):
545
+ text_parts.append(f"Location: {p.get('Location')}")
546
+
547
+ # Add contact and social media information
548
+ if p.get('LinkedIn'):
549
+ text_parts.append(f"LinkedIn: {p.get('LinkedIn')}")
550
+
551
+ if p.get('Twitter'):
552
+ text_parts.append(f"Twitter: {p.get('Twitter')}")
553
+
554
+ if p.get('Website'):
555
+ text_parts.append(f"Website: {p.get('Website')}")
556
+
557
+ if p.get('Email'):
558
+ text_parts.append(f"Email: {p.get('Email')}")
559
+
560
+ # Add detailed professional information
561
+ if p.get('About'):
562
+ text_parts.append(f"About: {p.get('About')}")
563
+
564
+ if p.get('Skills'):
565
+ text_parts.append(f"Skills: {p.get('Skills')}")
566
+
567
+ if p.get('Experience'):
568
+ text_parts.append(f"Experience: {p.get('Experience')}")
569
+
570
+ if p.get('Education'):
571
+ text_parts.append(f"Education: {p.get('Education')}")
572
+
573
+ # Add any industry or sector information
574
+ if p.get('Industry'):
575
+ text_parts.append(f"Industry: {p.get('Industry')}")
576
+
577
+ if p.get('Sector'):
578
+ text_parts.append(f"Sector: {p.get('Sector')}")
579
+
580
+ # Add any entrepreneurial information
581
+ if p.get('Previous Startups'):
582
+ text_parts.append(f"Previous Startups: {p.get('Previous Startups')}")
583
+
584
+ if p.get('Funding History'):
585
+ text_parts.append(f"Funding History: {p.get('Funding History')}")
586
+
587
+ # Add any additional fields that might be in the CSV
588
+ for key, value in p.items():
589
+ if (key not in ['Full Name', 'Current Position', 'Company', 'Location',
590
+ 'LinkedIn', 'Twitter', 'Website', 'Email',
591
+ 'About', 'Skills', 'Experience', 'Education',
592
+ 'Industry', 'Sector', 'Previous Startups', 'Funding History']
593
+ and value and str(value).lower() != 'nan'):
594
+ text_parts.append(f"{key}: {value}")
595
+
596
+ # Join all parts with newlines for better separation
597
+ text = "\n".join(text_parts)
598
+ texts.append(text)
599
+
600
+ # Log the first few profiles to help with debugging
601
+ if len(texts) <= 3:
602
+ print(f"Profile {len(texts)} text representation:\n{text}\n")
603
+
604
+ # Generate embeddings
605
+ embeddings = self.embeddings.embed_documents(texts)
606
+
607
+ # Store in vector database
608
+ self.vector_store.upsert_profiles(embeddings, profiles)
609
+
610
+ return len(profiles)
611
+
612
+ @async_timeout(300) # 5 minute overall timeout for the entire process
613
+ async def process_message(self, query: str):
614
+ """Process a user message through the workflow."""
615
+ # Reset progress message
616
+ self.progress_message = None
617
+
618
+ # Initialize the state
619
+ state = FounderAnalysisState(
620
+ query=query,
621
+ query_type="",
622
+ filter_key="",
623
+ filter_value="",
624
+ retrieved_profiles=[],
625
+ web_search_results=[],
626
+ analysis_results=[],
627
+ final_response={},
628
+ error=""
629
+ )
630
+
631
+ try:
632
+ # Manually execute the workflow nodes in sequence
633
+ # First process the query
634
+ state = await self.process_query(state)
635
+
636
+ # Route based on query type
637
+ next_node = self.query_router(state)
638
+
639
+ if next_node == "error":
640
+ await cl.Message(content=f"❌ {state['error']}").send()
641
+ return
642
+
643
+ # Execute the appropriate search method
644
+ if next_node == "search":
645
+ state = await self.vector_search(state)
646
+ elif next_node == "filter":
647
+ state = await self.filter_by_metadata(state)
648
+
649
+ # Check for errors after search
650
+ if state.get("error"):
651
+ await cl.Message(content=f"❌ {state['error']}").send()
652
+ return
653
+
654
+ # Continue with the rest of the workflow
655
+ state = await self.web_search(state)
656
+ state = await self.analyze_profiles(state)
657
+ state = await self.format_response(state)
658
+
659
+ except asyncio.TimeoutError:
660
+ await cl.Message(content="❌ The operation timed out. Please try a simpler query or try again later.").send()
661
+ except Exception as e:
662
+ await cl.Message(content=f"❌ Error processing request: {str(e)}").send()
663
+
664
+ # Initialize the system
665
+ system = FounderAnalysisSystem()
666
+
667
+ @cl.on_chat_start
668
+ async def start():
669
+ """Initialize the chat session and prompt for CSV upload."""
670
+ await cl.Message(
671
+ content="πŸ‘‹ Welcome to the Founder Analysis System! Please upload your CSV file with founder profiles."
672
+ ).send()
673
+
674
+ files = await cl.AskFileMessage(
675
+ content="Please upload your CSV file",
676
+ accept=["text/csv"],
677
+ max_size_mb=10
678
+ ).send()
679
+
680
+ if not files:
681
+ await cl.Message(
682
+ content="No file was uploaded. Please try again."
683
+ ).send()
684
+ return
685
+
686
+ file = files[0]
687
+
688
+ # Show loading message
689
+ msg = cl.Message(content=f"⏳ Processing {file.name}...")
690
+ await msg.send()
691
+
692
+ try:
693
+ # Load the profiles with timeout
694
+ num_profiles = await asyncio.wait_for(system.load_profiles(file.path), timeout=120)
695
+
696
+ await cl.Message(
697
+ content=f"βœ… Successfully loaded {num_profiles} founder profiles!\n\n" +
698
+ "You can now:\n\n" +
699
+ "1. **Search for founders by expertise**:\n" +
700
+ " Example: `AI experts in healthcare`\n\n" +
701
+ "2. **Filter by specific fields**:\n" +
702
+ " Example: `filter:Location:San Francisco`\n" +
703
+ " Example: `filter:Skills:Machine Learning`\n\n" +
704
+ "3. **Search across all fields**:\n" +
705
+ " Example: `filter:Stanford`\n" +
706
+ " Example: `filter blockchain`\n\n" +
707
+ "4. **Get founder recommendations**:\n" +
708
+ " Example: `recommend founders for fintech startup`"
709
+ ).send()
710
+ except asyncio.TimeoutError:
711
+ await cl.Message(content="❌ Loading profiles timed out. The CSV file might be too large or complex.").send()
712
+ except Exception as e:
713
+ await cl.Message(content=f"❌ Error loading profiles: {str(e)}").send()
714
+
715
+ @cl.on_message
716
+ async def main(message: cl.Message):
717
+ """Handle user messages and provide responses."""
718
+ await system.process_message(message.content)
requirements.txt ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ chainlit>=0.7.700
2
+ langchain>=0.1.0
3
+ langchain-openai>=0.0.5
4
+ langgraph>=0.0.19
5
+ langchain-experimental>=0.0.49
6
+ qdrant-client>=1.7.0
7
+ pandas>=2.0.0
8
+ python-dotenv>=1.0.0
9
+ tavily-python>=0.3.0
10
+ openai>=1.12.0
11
+ pydantic>=2.0.0
12
+ numpy>=1.24.0
13
+ tqdm>=4.65.0
14
+ typing-extensions>=4.5.0
15
+ aiohttp>=3.9.0
16
+ langchain-core>=0.1.0