Upload 71 files
This view is limited to 50 files because it contains too many changes. See raw diff for the full change set.
- .gitattributes +2 -0
- agents/__init__.py +0 -0
- agents/__pycache__/__init__.cpython-311.pyc +0 -0
- agents/__pycache__/__init__.cpython-312.pyc +0 -0
- agents/__pycache__/agent_decision.cpython-311.pyc +0 -0
- agents/__pycache__/shared_rag.cpython-311.pyc +0 -0
- agents/__pycache__/shared_rag.cpython-312.pyc +0 -0
- agents/agent_decision.py +336 -0
- agents/rag_agent/UNUSED_evaluation.py +177 -0
- agents/rag_agent/__init__.py +340 -0
- agents/rag_agent/__pycache__/__init__.cpython-311.pyc +0 -0
- agents/rag_agent/__pycache__/data_ingestion.cpython-311.pyc +0 -0
- agents/rag_agent/__pycache__/document_processor.cpython-311.pyc +0 -0
- agents/rag_agent/__pycache__/query_processor.cpython-311.pyc +0 -0
- agents/rag_agent/__pycache__/reranker.cpython-311.pyc +0 -0
- agents/rag_agent/__pycache__/response_generator.cpython-311.pyc +0 -0
- agents/rag_agent/__pycache__/vector_store.cpython-311.pyc +0 -0
- agents/rag_agent/data_ingestion.py +300 -0
- agents/rag_agent/document_processor.py +507 -0
- agents/rag_agent/query_processor.py +142 -0
- agents/rag_agent/reranker.py +69 -0
- agents/rag_agent/response_generator.py +326 -0
- agents/rag_agent/vector_store.py +298 -0
- agents/shared_rag.py +151 -0
- agents/web_search_processor_agent/__init__.py +25 -0
- agents/web_search_processor_agent/__pycache__/__init__.cpython-311.pyc +0 -0
- agents/web_search_processor_agent/__pycache__/geminai_search.cpython-311.pyc +0 -0
- agents/web_search_processor_agent/__pycache__/pubmed_search.cpython-311.pyc +0 -0
- agents/web_search_processor_agent/__pycache__/tavily_search.cpython-311.pyc +0 -0
- agents/web_search_processor_agent/__pycache__/web_search_agent.cpython-311.pyc +0 -0
- agents/web_search_processor_agent/__pycache__/web_search_processor.cpython-311.pyc +0 -0
- agents/web_search_processor_agent/geminai_search.py +162 -0
- agents/web_search_processor_agent/web_search_agent.py +125 -0
- agents/web_search_processor_agent/web_search_processor.py +116 -0
- chat_sessions/chat_Harsh Narayan_20250802_165208.json +24 -0
- chat_sessions/chat_Harsh Narayan_20250804_163618.json +27 -0
- chat_sessions/chat_string_string.json +8 -0
- config/__init__.py +0 -0
- config/__pycache__/__init__.cpython-311.pyc +0 -0
- config/__pycache__/config.cpython-311.pyc +0 -0
- config/agents.yaml +42 -0
- config/config.py +164 -0
- config/rag.yaml +26 -0
- config/tasks.yaml +79 -0
- crew_ai/README.md +164 -0
- crew_ai/__init__.py +0 -0
- crew_ai/__pycache__/__init__.cpython-311.pyc +0 -0
- crew_ai/__pycache__/chatbot.cpython-311.pyc +0 -0
- crew_ai/__pycache__/config.cpython-311.pyc +0 -0
- crew_ai/__pycache__/data_retriever_util.cpython-311.pyc +0 -0
.gitattributes
CHANGED
@@ -34,3 +34,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 static/treeman.png filter=lfs diff=lfs merge=lfs -text
+knowledge/deathless.pdf filter=lfs diff=lfs merge=lfs -text
+knowledge/know-see.pdf filter=lfs diff=lfs merge=lfs -text
agents/__init__.py
ADDED
File without changes

agents/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (154 Bytes)

agents/__pycache__/__init__.cpython-312.pyc
ADDED
Binary file (142 Bytes)

agents/__pycache__/agent_decision.cpython-311.pyc
ADDED
Binary file (15.7 kB)

agents/__pycache__/shared_rag.cpython-311.pyc
ADDED
Binary file (9.25 kB)

agents/__pycache__/shared_rag.cpython-312.pyc
ADDED
Binary file (8.42 kB)
agents/agent_decision.py
ADDED
@@ -0,0 +1,336 @@
"""
Agent Decision System for Multi-Agent Mental Health Chatbot
Orchestrates RAG, CrewAI, and Web Search agents using LangGraph.
"""
from dotenv import load_dotenv
import json
from typing import TypedDict, List, Any, Optional, Union, Dict
from langchain_core.messages import HumanMessage, AIMessage
from langgraph.graph import StateGraph, END
from langgraph.checkpoint.memory import MemorySaver
from difflib import SequenceMatcher

from agents.web_search_processor_agent import WebSearchProcessorAgent
from config.config import Config
from .shared_rag import shared_rag_instance

# Import CrewAI components
try:
    from crew_ai.chatbot import (
        run_crisis_check,
        run_condition_classification,
        run_user_profile_retrieval,
        run_recommendations
    )
    CREWAI_AVAILABLE = True
    print("✅ CrewAI components imported successfully")
except ImportError as e:
    print(f"⚠️ CrewAI components not available: {e}")
    CREWAI_AVAILABLE = False

load_dotenv()

# Configuration
config = Config()
memory = MemorySaver()

# Simple query cache
QUERY_CACHE = {}
CACHE_SIMILARITY_THRESHOLD = 0.85

class AgentState(TypedDict):
    """State maintained across the workflow."""
    messages: List[Any]
    agent_name: Optional[str]
    current_input: Optional[Union[str, Dict]]
    output: Optional[str]
    retrieval_confidence: float
    user_context: Optional[Dict]
    # CrewAI specific state
    crisis_detected: Optional[bool]
    condition_classified: Optional[str]
    user_profile: Optional[Dict]
    crewai_used: Optional[bool]

def get_cached_response(query: str, user_context: dict) -> Optional[dict]:
    """Check for a cached response to a similar query."""
    query_lower = query.lower()

    for cached_query, cached_data in QUERY_CACHE.items():
        similarity = SequenceMatcher(None, query_lower, cached_query.lower()).ratio()
        if similarity >= CACHE_SIMILARITY_THRESHOLD:
            if (cached_data['emotion'] == user_context.get('emotion') and
                    cached_data['status'] == user_context.get('mental_health_status')):
                print(f"[CACHE] Found similar response (similarity: {similarity:.2f})")
                return cached_data['response']
    return None

def cache_response(query: str, user_context: dict, response: dict):
    """Cache a response for future use, evicting the oldest entry past 100."""
    if len(QUERY_CACHE) > 100:
        oldest_key = next(iter(QUERY_CACHE))
        del QUERY_CACHE[oldest_key]

    QUERY_CACHE[query] = {
        'emotion': user_context.get('emotion'),
        'status': user_context.get('mental_health_status'),
        'response': response
    }

def create_agent_graph():
    """Create and configure the LangGraph for agent orchestration."""

    def rag_agent_processor(state):
        """Process the query using the RAG pipeline."""
        try:
            user_query = state["current_input"]
            user_context = state.get("user_context", {})

            print(f"[RAG_AGENT] Processing: {user_query[:50]}...")
            print(f"[RAG_AGENT] Context - Emotion: {user_context.get('emotion', 'neutral')}, Status: {user_context.get('mental_health_status', 'Unknown')}")

            # Check cache first
            cached_response = get_cached_response(user_query, user_context)
            if cached_response:
                print("[RAG_AGENT] Using cached response")
                return {
                    **state,
                    "output": cached_response.get('response', ''),
                    "agent_name": "RAG_AGENT_CACHED",
                    "retrieval_confidence": cached_response.get('confidence', 0.8),
                    "crewai_used": False
                }

            # Get RAG instance and process query
            rag = shared_rag_instance.get_rag()
            result = rag.process_query(
                query=user_query,
                user_emotion=user_context.get('emotion', 'neutral'),
                mental_health_status=user_context.get('mental_health_status', 'Unknown'),
                user_context=user_context
            )

            confidence = result.get("confidence", 0.0)
            response_text = result.get("response", "I'm here to help you with your mental health concerns.")

            print(f"[RAG_AGENT] Confidence: {confidence}")
            print(f"[RAG_AGENT] Response: {response_text[:100]}...")

            # Cache the response
            cache_response(user_query, user_context, result)

            return {
                **state,
                "output": response_text,
                "agent_name": "RAG_AGENT",
                "retrieval_confidence": confidence,
                "crewai_used": False
            }

        except Exception as e:
            print(f"[RAG_AGENT] Error: {e}")
            return {
                **state,
                "output": "I understand you're reaching out for support. While I'm having some technical difficulties, I want you to know that your feelings are valid and there are people who can help.",
                "agent_name": "RAG_AGENT_FALLBACK",
                "retrieval_confidence": 0.0,
                "crewai_used": False
            }

    def crewai_pipeline_processor(state: AgentState) -> AgentState:
        """Process the query using the CrewAI pipeline when RAG confidence is low."""
        print("[CREWAI_PIPELINE] Processing with CrewAI agents...")

        try:
            if not CREWAI_AVAILABLE:
                print("[CREWAI_PIPELINE] Not available, falling back to web search")
                return fallback_to_web_search(state)

            user_query = str(state["current_input"])
            user_context = state.get("user_context", {})

            print(f"[CREWAI_PIPELINE] Query: {user_query[:50]}...")

            # Use RAG's CrewAI integration if available
            rag = shared_rag_instance.get_rag()
            if hasattr(rag, 'process_query_with_crewai') and rag.crewai_enabled:
                print("[CREWAI_PIPELINE] Using RAG's CrewAI integration")

                result = rag.process_query_with_crewai(user_query, user_context)

                return {
                    **state,
                    "output": result.get("response", "I'm here to support you."),
                    "agent_name": result.get("agent", "CREWAI_ENHANCED_SYSTEM"),
                    "crisis_detected": result.get("is_crisis", False),
                    "condition_classified": result.get("condition", "general"),
                    "crewai_used": True,
                    "retrieval_confidence": result.get("confidence", 0.85)
                }
            else:
                print("[CREWAI_PIPELINE] Using direct CrewAI functions")

                # Direct CrewAI processing
                user_id = user_context.get('user_id', 'anon_user')

                # Crisis check
                crisis_result = run_crisis_check(user_query)
                is_crisis = crisis_result.get("is_crisis", False)

                if is_crisis:
                    crisis_rec = run_recommendations(
                        user_query,
                        user_profile=json.dumps(user_context),
                        condition="Crisis",
                        answers="{}",
                        interpretation="N/A",
                        is_crisis="true"
                    )

                    return {
                        **state,
                        "output": crisis_rec.get("recommendation",
                            "🆘 Please reach out for immediate help. In Bhutan: Emergency Services (112), National Mental Health Program (1717)"),
                        "agent_name": "CREWAI_CRISIS_AGENT",
                        "crisis_detected": True,
                        "crewai_used": True,
                        "retrieval_confidence": 0.95
                    }

                # Normal processing
                try:
                    user_profile = run_user_profile_retrieval(user_query, user_id)
                except Exception:
                    user_profile = {"id": user_id, "name": user_context.get('name', 'User')}

                try:
                    condition_result = run_condition_classification(user_query, json.dumps(user_profile))
                    condition = condition_result.get("condition", "general").lower()
                except Exception:
                    condition = "general"

                final_rec = run_recommendations(
                    user_query,
                    json.dumps(user_profile),
                    condition,
                    json.dumps(user_context.get('assessment_answers', {})),
                    user_context.get('mental_health_status', 'Unknown'),
                    is_crisis="false"
                )

                return {
                    **state,
                    "output": final_rec.get("recommendation",
                        f"Thank you for sharing your concerns. I'm here to support you with {condition} related issues."),
                    "agent_name": "CREWAI_ENHANCED_SYSTEM",
                    "condition_classified": condition,
                    "user_profile": user_profile,
                    "crewai_used": True,
                    "retrieval_confidence": 0.85
                }

        except Exception as e:
            print(f"[CREWAI_PIPELINE] Error: {e}")
            return fallback_to_web_search(state)

    def fallback_to_web_search(state: AgentState) -> AgentState:
        """Fall back to the web search processor."""
        print("[WEB_SEARCH] Processing with web search agent...")

        try:
            query = str(state["current_input"])
            user_context = state.get("user_context", {})

            # Use WebSearchProcessorAgent
            web_agent = WebSearchProcessorAgent()
            response = web_agent.process_web_search_results(
                query=query,
                user_context=user_context
            )

            return {
                **state,
                "output": response,
                "agent_name": "WEB_SEARCH_PROCESSOR_AGENT",
                "crewai_used": False
            }

        except Exception as e:
            print(f"[WEB_SEARCH] Error: {e}")
            return {
                **state,
                "output": "I'm here to support you, though I'm having some technical difficulties. Please know that help is available. For immediate support in Bhutan, contact the National Mental Health Program at 1717.",
                "agent_name": "WEB_SEARCH_FALLBACK",
                "crewai_used": False
            }

    def confidence_based_routing(state: AgentState) -> str:
        """Route based on the RAG confidence score."""
        min_confidence = getattr(config.rag, 'min_retrieval_confidence', 0.7)
        confidence = state.get("retrieval_confidence", 0.0)

        print(f"[ROUTING] Confidence: {confidence:.2f}, Threshold: {min_confidence}")

        if confidence < min_confidence:
            if CREWAI_AVAILABLE:
                print("[ROUTING] Low confidence, routing to CrewAI...")
                return "CREWAI_PIPELINE"
            else:
                print("[ROUTING] Low confidence, routing to Web Search...")
                return "WEB_SEARCH_PROCESSOR_AGENT"

        print("[ROUTING] High confidence, finalizing...")
        return "finalize_response"

    def finalize_response(state: AgentState) -> AgentState:
        """Finalize the response by appending it to the message history."""
        output = state.get("output", "")

        if output:
            messages = state.get("messages", [])
            messages.append(AIMessage(content=str(output)))

            agent_name = state.get("agent_name", "Unknown")
            crewai_used = state.get("crewai_used", False)

            print(f"[FINALIZE] Response from {agent_name}, CrewAI: {crewai_used}")

            return {
                **state,
                "messages": messages
            }

        return state

    # Create workflow graph
    workflow = StateGraph(AgentState)

    # Add nodes
    workflow.add_node("RAG_AGENT", rag_agent_processor)
    workflow.add_node("CREWAI_PIPELINE", crewai_pipeline_processor)
    workflow.add_node("WEB_SEARCH_PROCESSOR_AGENT", fallback_to_web_search)
    workflow.add_node("finalize_response", finalize_response)

    # Define edges
    workflow.set_entry_point("RAG_AGENT")
    workflow.add_conditional_edges("RAG_AGENT", confidence_based_routing)
    workflow.add_edge("CREWAI_PIPELINE", "finalize_response")
    workflow.add_edge("WEB_SEARCH_PROCESSOR_AGENT", "finalize_response")
    workflow.add_edge("finalize_response", END)

    return workflow.compile(checkpointer=memory)

def init_agent_state() -> AgentState:
    """Initialize agent state with default values."""
    return {
        "messages": [],
        "agent_name": None,
        "current_input": None,
        "output": None,
        "retrieval_confidence": 0.0,
        "user_context": None,
        "crisis_detected": None,
        "condition_classified": None,
        "user_profile": None,
        "crewai_used": None
    }
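For orientation, here is a minimal sketch of how the compiled graph above could be driven from a caller. The thread_id value and the user-context fields are illustrative assumptions, not part of this commit; MemorySaver checkpointing does require a thread_id in the invoke config.

# Hypothetical driver for create_agent_graph(); the session ID and
# context values below are assumptions for illustration.
from langchain_core.messages import HumanMessage

from agents.agent_decision import create_agent_graph, init_agent_state

graph = create_agent_graph()

state = init_agent_state()
state["current_input"] = "I've been feeling anxious and can't sleep."
state["user_context"] = {
    "user_id": "demo_user",                 # assumed ID
    "emotion": "anxious",
    "mental_health_status": "At Risk",
}
state["messages"] = [HumanMessage(content=state["current_input"])]

# The MemorySaver checkpointer needs a thread_id to key the session.
result = graph.invoke(state, config={"configurable": {"thread_id": "demo-session"}})
print(result["agent_name"], result["retrieval_confidence"])
print(result["output"])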
agents/rag_agent/UNUSED_evaluation.py
ADDED
@@ -0,0 +1,177 @@
import logging
from typing import List, Dict, Any, Optional
import re
import json
from collections import Counter

class RAGEvaluator:
    """
    Evaluates the performance of the RAG system and tracks metrics.
    """
    def __init__(self, config):
        """
        Initialize the evaluator.
        Args:
            config: Configuration object
        """
        self.logger = logging.getLogger(__name__)
        self.metrics = {
            "queries_processed": 0,
            "successful_retrievals": 0,
            "failed_retrievals": 0,
            "avg_retrieval_time": 0,
            "avg_response_time": 0,
            "avg_confidence_score": 0,
            "feedback_scores": []
        }
        self.save_path = getattr(config.rag, "metrics_save_path", "rag_metrics.json")

    def log_retrieval(self, query: str, retrieved_docs: List[Dict[str, Any]],
                      retrieval_time: float, success: bool = True):
        """
        Log metrics for a retrieval operation.
        """
        self.metrics["queries_processed"] += 1

        if success and retrieved_docs:
            self.metrics["successful_retrievals"] += 1
            prev_avg = self.metrics["avg_retrieval_time"]
            prev_count = self.metrics["queries_processed"] - 1
            self.metrics["avg_retrieval_time"] = (prev_avg * prev_count + retrieval_time) / self.metrics["queries_processed"]
            scores = [doc.get("score", 0) for doc in retrieved_docs]
            avg_score = sum(scores) / len(scores) if scores else 0
            self.logger.info(f"Query: '{query}' | Docs: {len(retrieved_docs)} | Avg Score: {avg_score:.4f}")
        else:
            self.metrics["failed_retrievals"] += 1
            self.logger.warning(f"Failed retrieval for query: '{query}'")

    def log_response(self, query: str, response: Dict[str, Any], response_time: float):
        """
        Log metrics for a response generation operation.
        """
        prev_avg = self.metrics["avg_response_time"]
        prev_count = self.metrics["queries_processed"] - 1
        self.metrics["avg_response_time"] = (prev_avg * prev_count + response_time) / self.metrics["queries_processed"]

        confidence = response.get("confidence", 0)
        prev_avg = self.metrics["avg_confidence_score"]
        self.metrics["avg_confidence_score"] = (prev_avg * prev_count + confidence) / self.metrics["queries_processed"]

        self.logger.info(f"Generated response for query: '{query}' | Confidence: {confidence:.4f}")

    def log_user_feedback(self, query: str, response: Dict[str, Any], feedback_score: int):
        """
        Log user feedback on responses.
        """
        self.metrics["feedback_scores"].append({
            "query": query,
            "response": response.get("response", ""),
            "score": feedback_score
        })
        self.logger.info(f"Received feedback for query: '{query}' | Score: {feedback_score}")

    def evaluate_response_quality(self, query: str, retrieved_docs: List[Dict[str, Any]],
                                  response: Dict[str, Any]) -> Dict[str, Any]:
        """
        Evaluate the quality of the response based on retrieved documents.
        """
        retrieval_precision = self._calculate_precision(query, retrieved_docs)
        answer_relevance = self._calculate_relevance(query, response, retrieved_docs)
        metrics = {
            "retrieval_precision": retrieval_precision,
            "answer_relevance": answer_relevance,
            "hallucination_risk": self._estimate_hallucination_risk(response, retrieved_docs),
            "answer_completeness": self._calculate_completeness(response, retrieved_docs)
        }
        return metrics

    def _calculate_precision(self, query: str, docs: List[Dict[str, Any]]) -> float:
        """
        Calculate precision of retrieved documents (simplified).
        """
        if not docs:
            return 0.0
        scores = [doc.get("score", 0) for doc in docs]
        return sum(scores) / len(scores) if scores else 0.0

    def _calculate_relevance(self, query: str, response: Dict[str, Any], docs: List[Dict[str, Any]]) -> float:
        """
        Calculate relevance of the response to the query.
        """
        if not docs or not response:
            return 0.0
        response_text = response.get("response", "").lower()
        query_words = set(query.lower().split())
        stopwords = {"a", "an", "the", "and", "or", "but", "in", "on", "at", "to", "for", "with", "by"}
        query_words = query_words - stopwords
        word_count = sum(1 for word in query_words if word in response_text)
        return word_count / len(query_words) if query_words else 0.0

    def _estimate_hallucination_risk(self, response: Dict[str, Any], docs: List[Dict[str, Any]]) -> float:
        """
        Estimate risk of hallucination in the response.
        """
        if not docs or not response:
            return 1.0
        all_doc_content = " ".join([doc["content"].lower() for doc in docs])
        response_text = response.get("response", "").lower()
        factual_statements = re.findall(r'[^.!?]*[.!?]', response_text)
        unsupported = 0
        total = len(factual_statements)
        for statement in factual_statements:
            has_number = bool(re.search(r'\d+', statement))
            has_medical_term = bool(re.search(r'(?i)(disease|syndrome|treatment|medication|therapy|drug|dosage|diagnosis)', statement))
            if (has_number or has_medical_term) and not self._is_supported(statement, all_doc_content):
                unsupported += 1
        return unsupported / total if total > 0 else 0.5

    def _is_supported(self, statement: str, doc_content: str) -> bool:
        """
        Check if a statement is supported by document content.
        """
        keywords = statement.lower().split()
        keywords = [w for w in keywords if len(w) > 4]
        if not keywords:
            return True
        found = sum(1 for word in keywords if word in doc_content)
        return (found / len(keywords)) >= 0.6

    def _calculate_completeness(self, response: Dict[str, Any], docs: List[Dict[str, Any]]) -> float:
        """
        Calculate completeness of the response.
        """
        response_text = response.get("response", "")
        word_count = len(response_text.split())
        length_score = min(word_count / 150, 1.0)
        has_introduction = bool(re.search(r'^[A-Z][^.!?]{10,}[.!?]', response_text))
        has_conclusion = bool(re.search(r'(?i)(in conclusion|to summarize|overall|in summary)', response_text))
        structure_score = (has_introduction + has_conclusion) / 2
        return (length_score * 0.7) + (structure_score * 0.3)

    def save_metrics(self):
        """Save current metrics to disk."""
        try:
            with open(self.save_path, 'w') as f:
                json.dump(self.metrics, f, indent=2)
            self.logger.info(f"Metrics saved to {self.save_path}")
        except Exception as e:
            self.logger.error(f"Failed to save metrics: {e}")

    def get_metrics_summary(self) -> Dict[str, Any]:
        """
        Get a summary of current metrics.
        """
        retrieval_success_rate = 0
        if self.metrics["queries_processed"] > 0:
            retrieval_success_rate = self.metrics["successful_retrievals"] / self.metrics["queries_processed"]
        feedback_distribution = Counter(item["score"] for item in self.metrics["feedback_scores"])
        avg_feedback = sum(item["score"] for item in self.metrics["feedback_scores"]) / len(self.metrics["feedback_scores"]) if self.metrics["feedback_scores"] else 0
        return {
            "queries_processed": self.metrics["queries_processed"],
            "retrieval_success_rate": retrieval_success_rate,
            "avg_retrieval_time": self.metrics["avg_retrieval_time"],
            "avg_response_time": self.metrics["avg_response_time"],
            "avg_confidence_score": self.metrics["avg_confidence_score"],
            "feedback_distribution": feedback_distribution,
            "avg_feedback_score": avg_feedback
        }
agents/rag_agent/__init__.py
ADDED
@@ -0,0 +1,340 @@
from pathlib import Path
from typing import Dict, Any, List, Optional
from sentence_transformers import SentenceTransformer
import logging
from config.config import Config

from .vector_store import QdrantRetriever
from .document_processor import MedicalDocumentProcessor
from .query_processor import QueryProcessor
from .reranker import Reranker
from .response_generator import ResponseGenerator
from .data_ingestion import MedicalDataIngestion
import json

from dotenv import load_dotenv
load_dotenv()


class MedicalRAG:
    """
    Medical Retrieval-Augmented Generation system that integrates all components.
    """
    def __init__(self, config: Config, llm, embedding_model):
        self.logger = logging.getLogger(__name__)
        self.logger.setLevel(logging.INFO)

        self.config = config
        self.llm = llm
        self.embedding_model = embedding_model or SentenceTransformer("all-MiniLM-L6-v2")

        if not self.embedding_model:
            raise ValueError("Embedding model is required for document processing")

        # Get chunking parameters from config
        self.chunk_size = getattr(config.rag, 'chunk_size', 256)
        self.chunk_overlap = getattr(config.rag, 'chunk_overlap', 50)
        self.chunking_strategy = getattr(config.rag, 'chunking_strategy', 'hybrid')

        # Ensure processed docs directory exists
        self.processed_docs_dir = Path(getattr(config.rag, 'processed_docs_dir', 'processed_docs'))
        self.processed_docs_dir.mkdir(exist_ok=True)

        # Initialize CrewAI integration
        self.crewai_enabled = True
        try:
            self._initialize_crewai_agents()
            self.logger.info("✅ CrewAI agents initialized successfully")
        except Exception as e:
            self.logger.warning(f"⚠️ CrewAI agents not available: {e}")
            self.crewai_enabled = False

        # Initialize core components
        try:
            self.retriever = QdrantRetriever(config)
            self.document_processor = MedicalDocumentProcessor(config, self.embedding_model)
            self.query_processor = QueryProcessor(config, self.embedding_model)
            self.reranker = Reranker(config)
            self.response_generator = ResponseGenerator(config, llm)
            self.data_ingestion = MedicalDataIngestion()

            self.logger.info(f"✅ MedicalRAG initialized - Embedding dim: {getattr(config.rag, 'embedding_dim', 'unknown')}")

        except Exception as e:
            self.logger.error(f"❌ Error initializing MedicalRAG components: {e}")
            raise

    def _initialize_crewai_agents(self):
        """Initialize CrewAI agents from the crew_ai module."""
        from crew_ai.chatbot import (
            run_crisis_check,
            run_condition_classification,
            run_user_profile_retrieval,
            run_recommendations
        )

        # Store CrewAI functions
        self.run_crisis_check = run_crisis_check
        self.run_condition_classification = run_condition_classification
        self.run_user_profile_retrieval = run_user_profile_retrieval
        self.run_recommendations = run_recommendations

    def ingest_knowledge_folder(self, folder_path: str) -> Dict[str, Any]:
        """
        Ingest all documents from a knowledge folder.
        """
        folder = Path(folder_path)
        if not folder.exists():
            self.logger.error(f"Knowledge folder not found: {folder_path}")
            return {"error": f"Folder not found: {folder_path}"}

        ingestion_results = {
            "total_files": 0,
            "successful": 0,
            "failed": 0,
            "files": [],
            "total_chunks": 0
        }

        # Get all supported files
        supported_extensions = ['.txt', '.pdf', '.md', '.json', '.csv']
        files = []
        for ext in supported_extensions:
            files.extend(folder.glob(f'*{ext}'))

        ingestion_results["total_files"] = len(files)

        if not files:
            self.logger.warning(f"No supported files found in {folder_path}")
            return ingestion_results

        self.logger.info(f"Found {len(files)} files to ingest")

        for file_path in files:
            try:
                self.logger.info(f"Ingesting {file_path.name}...")

                # Load and process document
                documents = self.data_ingestion.load_document(str(file_path))
                if not documents:
                    self.logger.warning(f"No content extracted from {file_path.name}")
                    ingestion_results["failed"] += 1
                    continue

                # Create chunks
                chunks = self.document_processor.process_documents(documents)
                self.logger.info(f"Created {len(chunks)} chunks from {file_path.name}")

                # Store in vector database
                stored_count = self.retriever.add_documents(chunks)

                ingestion_results["successful"] += 1
                ingestion_results["total_chunks"] += len(chunks)
                ingestion_results["files"].append({
                    "name": file_path.name,
                    "status": "success",
                    "chunks": len(chunks),
                    "stored": stored_count
                })

            except Exception as e:
                self.logger.error(f"Error ingesting {file_path.name}: {str(e)}")
                ingestion_results["failed"] += 1
                ingestion_results["files"].append({
                    "name": file_path.name,
                    "status": "error",
                    "error": str(e)
                })

        self.logger.info(f"Ingestion complete: {ingestion_results['successful']}/{ingestion_results['total_files']} files processed")
        return ingestion_results

    def process_query(self, query: str, user_emotion: Optional[str] = None,
                      mental_health_status: Optional[str] = None,
                      user_context: Optional[Dict] = None, **kwargs) -> Dict[str, Any]:
        """Process a query using the RAG pipeline with emotion and health status awareness."""
        try:
            # Extract and validate parameters
            user_emotion = user_emotion or 'neutral'
            mental_health_status = mental_health_status or 'Unknown'
            user_context = user_context or {}

            self.logger.info(f"[RAG] Processing query: {query[:50]}...")
            self.logger.info(f"[RAG] User emotion: {user_emotion}, Status: {mental_health_status}")

            # Process query metadata (NOT a recursive call)
            query_metadata = self.query_processor.process_query(query)
            enhanced_query = query_metadata.get('expanded_query', query)

            print(f"[RAG] Enhanced query: {enhanced_query}")

            # Generate embedding and retrieve documents
            query_embedding = self.embedding_model.encode(enhanced_query)
            retrieved_docs = self.retriever.retrieve(
                query_embedding,
                top_k=getattr(self.config.rag, 'top_k', 5),
                metadata_filter=query_metadata.get('filters', {})
            )

            print(f"[RAG] Retrieved {len(retrieved_docs)} documents")

            # Debug first document
            if retrieved_docs:
                doc = retrieved_docs[0]
                if hasattr(doc, 'page_content'):
                    print(f"[RAG] Top doc: {doc.page_content[:100]}...")
                else:
                    print(f"[RAG] Top doc: {str(doc)[:100]}...")
            else:
                print("[RAG] ⚠️ No documents retrieved!")

            # Rerank if enabled
            if self.reranker and retrieved_docs:
                print("[RAG] Reranking documents...")
                reranked_docs = self.reranker.rerank(query, retrieved_docs)
            else:
                reranked_docs = retrieved_docs

            # Generate response
            response_data = self.response_generator.generate_response(
                query=query,
                retrieved_docs=reranked_docs,
                user_emotion=user_emotion,
                mental_health_status=mental_health_status,
                user_context=user_context
            )

            # Calculate final confidence
            confidence = response_data.get("confidence", 0.5)

            # Boost confidence for personal emotional queries
            if user_emotion not in ['neutral', 'neutral/unsure'] and any(
                phrase in query.lower() for phrase in ["i am", "i feel", "i'm", "my", "me"]
            ):
                confidence_boost = 0.1
                confidence = min(confidence + confidence_boost, 1.0)
                self.logger.info(f"[RAG] Boosted confidence by {confidence_boost} for personal query")

            response_data["confidence"] = confidence

            print(f"[RAG] Final confidence: {confidence}")
            print(f"[RAG] Response: {response_data.get('response', '')[:100]}...")

            return response_data

        except Exception as e:
            self.logger.error(f"[RAG] Error processing query: {e}")
            import traceback
            traceback.print_exc()
            return self._generate_error_response(str(e))

    def process_query_with_crewai(self, query: str, user_context: dict = None) -> Dict[str, Any]:
        """Enhanced query processing using CrewAI agents."""
        try:
            # Guard against a missing context dict before any .get() calls
            user_context = user_context or {}

            if not self.crewai_enabled:
                self.logger.info("[CrewAI] Not enabled, falling back to regular RAG")
                return self.process_query(
                    query,
                    user_emotion=user_context.get('emotion', 'neutral'),
                    mental_health_status=user_context.get('mental_health_status', 'Unknown'),
                    user_context=user_context
                )

            self.logger.info(f"[CrewAI] Processing with agents: {query[:50]}...")

            # Step 1: Crisis Detection
            crisis_result = self.run_crisis_check(query)
            is_crisis = crisis_result.get("is_crisis", False)

            if is_crisis:
                self.logger.warning("[CrewAI] 🚨 Crisis detected")
                crisis_rec = self.run_recommendations(
                    query,
                    user_profile=json.dumps(user_context),
                    condition="Crisis",
                    answers="{}",
                    interpretation="N/A",
                    is_crisis="true"
                )
                return {
                    "response": crisis_rec.get("recommendation",
                        "🆘 Please contact emergency services immediately: 112 or National Mental Health Program: 1717"),
                    "confidence": 0.95,
                    "method": "crewai_crisis",
                    "agent": "Crisis Detection Agent",
                    "is_crisis": True,
                    "condition": "crisis",
                    "sources": []
                }

            # Step 2: Get user profile
            user_id = user_context.get('user_id', 'anon_user')
            try:
                user_profile = self.run_user_profile_retrieval(query, user_id)
            except Exception:
                user_profile = {
                    "id": user_id,
                    "name": user_context.get('name', 'User'),
                    "preferences": "General mental health support"
                }

            # Step 3: Classify condition
            try:
                condition_result = self.run_condition_classification(query, json.dumps(user_profile))
                condition = condition_result.get("condition", "general").lower()
            except Exception:
                condition = "general"

            self.logger.info(f"[CrewAI] Classified condition: {condition}")

            # Step 4: Get RAG results for context
            rag_result = self.process_query(
                query,
                user_emotion=user_context.get('emotion', 'neutral'),
                mental_health_status=user_context.get('mental_health_status', 'Unknown'),
                user_context=user_context
            )

            # Step 5: Generate final recommendation
            final_rec = self.run_recommendations(
                query,
                json.dumps(user_profile),
                condition,
                json.dumps(user_context.get('assessment_answers', {})),
                user_context.get('mental_health_status', 'Unknown'),
                is_crisis="false"
            )

            # Combine RAG and CrewAI insights
            combined_response = final_rec.get("recommendation", rag_result.get("response", ""))

            return {
                "response": combined_response,
                "confidence": max(rag_result.get("confidence", 0.5), 0.85),
                "method": "crewai_enhanced",
                "agent": "CrewAI Enhanced System",
                "sources": rag_result.get("sources", []),
                "condition": condition,
                "is_crisis": False,
                "rag_confidence": rag_result.get("confidence", 0.5)
            }

        except Exception as e:
            self.logger.error(f"[CrewAI] Processing failed: {e}")
            # Fallback to regular RAG
            return self.process_query(
                query,
                user_emotion=user_context.get('emotion', 'neutral'),
                mental_health_status=user_context.get('mental_health_status', 'Unknown'),
                user_context=user_context
            )

    def _generate_error_response(self, error_message: str) -> Dict[str, Any]:
        """Generate a fallback error response when RAG processing fails."""
        return {
            "response": "I apologize, but I'm experiencing some technical difficulties. For immediate mental health support in Bhutan, please contact:\n\n• National Mental Health Program: 1717 (24/7)\n• Emergency Services: 112\n\nYour mental health matters, and help is available.",
            "confidence": 0.0,
            "sources": [],
            "method": "error_fallback",
            "error": error_message
        }
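To see the pieces in context, a sketch of constructing MedicalRAG and running one query. The ChatOpenAI model choice is an assumption (the constructor accepts whatever LLM the ResponseGenerator can use), while knowledge/ is the folder this commit tracks via LFS and the embedder matches the class's own default:

# Sketch under assumptions: gpt-4o-mini is a placeholder LLM choice.
from langchain_openai import ChatOpenAI
from sentence_transformers import SentenceTransformer

from agents.rag_agent import MedicalRAG
from config.config import Config

config = Config()
llm = ChatOpenAI(model="gpt-4o-mini")
embedder = SentenceTransformer("all-MiniLM-L6-v2")

rag = MedicalRAG(config, llm, embedder)

# One-time ingestion of the knowledge folder added in this commit.
print(rag.ingest_knowledge_folder("knowledge"))

result = rag.process_query(
    "How can I manage panic attacks?",
    user_emotion="anxious",
    mental_health_status="Moderate",
    user_context={"user_id": "demo_user"},
)
print(result["confidence"], result["response"][:200])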
agents/rag_agent/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (17.2 kB)

agents/rag_agent/__pycache__/data_ingestion.cpython-311.pyc
ADDED
Binary file (14.2 kB)

agents/rag_agent/__pycache__/document_processor.cpython-311.pyc
ADDED
Binary file (27.3 kB)

agents/rag_agent/__pycache__/query_processor.cpython-311.pyc
ADDED
Binary file (8.58 kB)

agents/rag_agent/__pycache__/reranker.cpython-311.pyc
ADDED
Binary file (4.32 kB)

agents/rag_agent/__pycache__/response_generator.cpython-311.pyc
ADDED
Binary file (20.6 kB)

agents/rag_agent/__pycache__/vector_store.cpython-311.pyc
ADDED
Binary file (15 kB)
agents/rag_agent/data_ingestion.py
ADDED
|
@@ -0,0 +1,300 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import json
|
| 3 |
+
import logging
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
import pandas as pd
|
| 6 |
+
from typing import List, Dict, Any, Optional, Union
|
| 7 |
+
from PyPDF2 import PdfReader
|
| 8 |
+
|
| 9 |
+
from langchain_community.document_loaders import (
|
| 10 |
+
TextLoader,
|
| 11 |
+
PyPDFLoader,
|
| 12 |
+
CSVLoader,
|
| 13 |
+
JSONLoader,
|
| 14 |
+
UnstructuredMarkdownLoader
|
| 15 |
+
)
|
| 16 |
+
from langchain_core.documents import Document
|
| 17 |
+
|
| 18 |
+
# Set up logging
|
| 19 |
+
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
| 20 |
+
logger = logging.getLogger(__name__)
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
class MedicalDataIngestion:
|
| 24 |
+
"""
|
| 25 |
+
Handles loading of various medical document formats.
|
| 26 |
+
"""
|
| 27 |
+
def __init__(self, config_path: Optional[str] = None):
|
| 28 |
+
"""
|
| 29 |
+
Initialize the data ingestion pipeline.
|
| 30 |
+
|
| 31 |
+
Args:
|
| 32 |
+
config_path: Optional path to configuration file
|
| 33 |
+
"""
|
| 34 |
+
self.logger = logging.getLogger(__name__)
|
| 35 |
+
|
| 36 |
+
# Initialize loaders mapping
|
| 37 |
+
self.loaders = {
|
| 38 |
+
'.txt': self._load_text,
|
| 39 |
+
'.pdf': self._load_pdf,
|
| 40 |
+
'.csv': self._load_csv,
|
| 41 |
+
'.json': self._load_json,
|
| 42 |
+
'.md': self._load_markdown
|
| 43 |
+
}
|
| 44 |
+
|
| 45 |
+
# Initialize stats
|
| 46 |
+
self.stats = {
|
| 47 |
+
"files_processed": 0,
|
| 48 |
+
"documents_ingested": 0,
|
| 49 |
+
"errors": 0
|
| 50 |
+
}
|
| 51 |
+
|
| 52 |
+
logger.info("MedicalDataIngestion initialized")
|
| 53 |
+
|
| 54 |
+
def load_document(self, file_path: str) -> List[Document]:
|
| 55 |
+
"""
|
| 56 |
+
Load a document based on its file type.
|
| 57 |
+
|
| 58 |
+
Args:
|
| 59 |
+
file_path: Path to the document file
|
| 60 |
+
|
| 61 |
+
Returns:
|
| 62 |
+
List of Document objects
|
| 63 |
+
"""
|
| 64 |
+
path = Path(file_path)
|
| 65 |
+
if not path.exists():
|
| 66 |
+
self.logger.error(f"File not found: {file_path}")
|
| 67 |
+
return []
|
| 68 |
+
|
| 69 |
+
suffix = path.suffix.lower()
|
| 70 |
+
if suffix not in self.loaders:
|
| 71 |
+
self.logger.warning(f"Unsupported file type: {suffix}")
|
| 72 |
+
return []
|
| 73 |
+
|
| 74 |
+
try:
|
| 75 |
+
return self.loaders[suffix](file_path)
|
| 76 |
+
except Exception as e:
|
| 77 |
+
self.logger.error(f"Error loading {file_path}: {str(e)}")
|
| 78 |
+
return []
|
| 79 |
+
|
| 80 |
+
def _load_text(self, file_path: str) -> List[Document]:
|
| 81 |
+
"""Load text files."""
|
| 82 |
+
try:
|
| 83 |
+
loader = TextLoader(file_path, encoding='utf-8')
|
| 84 |
+
return loader.load()
|
| 85 |
+
except Exception as e:
|
| 86 |
+
# Fallback to manual loading
|
| 87 |
+
try:
|
| 88 |
+
with open(file_path, 'r', encoding='utf-8') as f:
|
| 89 |
+
content = f.read()
|
| 90 |
+
doc = Document(
|
| 91 |
+
page_content=content,
|
| 92 |
+
metadata={"source": file_path, "type": "text"}
|
| 93 |
+
)
|
| 94 |
+
return [doc]
|
| 95 |
+
except Exception as e2:
|
| 96 |
+
self.logger.error(f"Error loading text file: {e2}")
|
| 97 |
+
return []
|
| 98 |
+
|
| 99 |
+
def _load_pdf(self, file_path: str) -> List[Document]:
|
| 100 |
+
"""Load PDF files."""
|
| 101 |
+
try:
|
| 102 |
+
loader = PyPDFLoader(file_path)
|
| 103 |
+
return loader.load()
|
| 104 |
+
except Exception as e:
|
| 105 |
+
# Fallback to PyPDF2
|
| 106 |
+
try:
|
| 107 |
+
reader = PdfReader(file_path)
|
| 108 |
+
documents = []
|
| 109 |
+
|
| 110 |
+
for page_num, page in enumerate(reader.pages):
|
| 111 |
+
page_text = page.extract_text()
|
| 112 |
+
if page_text:
|
| 113 |
+
doc = Document(
|
| 114 |
+
page_content=page_text,
|
| 115 |
+
metadata={
|
| 116 |
+
"source": file_path,
|
| 117 |
+
"type": "pdf",
|
| 118 |
+
"page": page_num + 1
|
| 119 |
+
}
|
| 120 |
+
)
|
| 121 |
+
documents.append(doc)
|
| 122 |
+
|
| 123 |
+
return documents
|
| 124 |
+
except Exception as e2:
|
| 125 |
+
self.logger.error(f"Error loading PDF file: {e2}")
|
| 126 |
+
return []
|
| 127 |
+
|
| 128 |
+
def _load_csv(self, file_path: str) -> List[Document]:
|
| 129 |
+
"""Load CSV files."""
|
| 130 |
+
try:
|
| 131 |
+
loader = CSVLoader(file_path)
|
| 132 |
+
return loader.load()
|
| 133 |
+
except Exception as e:
|
| 134 |
+
# Fallback to pandas
|
| 135 |
+
try:
|
| 136 |
+
df = pd.read_csv(file_path)
|
| 137 |
+
documents = []
|
| 138 |
+
|
| 139 |
+
# Find the column with the most text content
|
| 140 |
+
            text_column = self._identify_content_column(df)

            for _, row in df.iterrows():
                content = str(row[text_column])
                metadata = {
                    "source": file_path,
                    "type": "csv"
                }

                # Add other columns as metadata
                for col in df.columns:
                    if col != text_column and not pd.isna(row[col]):
                        metadata[col] = str(row[col])

                doc = Document(
                    page_content=content,
                    metadata=metadata
                )
                documents.append(doc)

            return documents
        except Exception as e2:
            self.logger.error(f"Error loading CSV file: {e2}")
            return []

    def _load_json(self, file_path: str) -> List[Document]:
        """Load JSON files."""
        try:
            # For JSON files, we'll create documents from each entry
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)

            documents = []
            if isinstance(data, list):
                for idx, item in enumerate(data):
                    content = json.dumps(item, indent=2)
                    doc = Document(
                        page_content=content,
                        metadata={
                            "source": file_path,
                            "type": "json",
                            "index": idx
                        }
                    )
                    documents.append(doc)
            else:
                content = json.dumps(data, indent=2)
                doc = Document(
                    page_content=content,
                    metadata={"source": file_path, "type": "json"}
                )
                documents.append(doc)

            return documents
        except Exception as e:
            self.logger.error(f"Error loading JSON file: {e}")
            return []

    def _load_markdown(self, file_path: str) -> List[Document]:
        """Load Markdown files."""
        try:
            loader = UnstructuredMarkdownLoader(file_path)
            return loader.load()
        except Exception:
            # Fallback to simple text loading
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    content = f.read()
                doc = Document(
                    page_content=content,
                    metadata={"source": file_path, "type": "markdown"}
                )
                return [doc]
            except Exception as e2:
                self.logger.error(f"Error loading Markdown file: {e2}")
                return []

    def load_directory(self, directory_path: str) -> List[Document]:
        """
        Load all supported documents from a directory.

        Args:
            directory_path: Path to the directory

        Returns:
            List of all loaded documents
        """
        dir_path = Path(directory_path)
        if not dir_path.exists():
            self.logger.error(f"Directory not found: {directory_path}")
            return []

        all_documents = []
        for file_path in dir_path.iterdir():
            if file_path.is_file() and file_path.suffix.lower() in self.loaders:
                self.logger.info(f"Loading {file_path.name}")
                documents = self.load_document(str(file_path))
                all_documents.extend(documents)

        return all_documents

    def _identify_content_column(self, df: pd.DataFrame) -> str:
        """
        Identify which column in a DataFrame contains the main content.

        Args:
            df: Pandas DataFrame

        Returns:
            Name of the content column
        """
        # Look for columns with these names
        content_column_names = ["content", "text", "description", "abstract", "body"]

        for name in content_column_names:
            if name in df.columns:
                return name

        # If no standard content column is found, look for the column with the longest strings
        avg_lengths = {}
        for col in df.columns:
            if df[col].dtype == 'object':  # Only check string columns
                # Calculate the average string length
                avg_length = df[col].astype(str).apply(len).mean()
                avg_lengths[col] = avg_length

        if avg_lengths:
            # Return the column with the longest average string length
            return max(avg_lengths.items(), key=lambda x: x[1])[0]

        # Fall back to the first column
        return df.columns[0]

    def _identify_json_content_field(self, item: Dict) -> Optional[str]:
        """
        Identify which field in a JSON object contains the main content.

        Args:
            item: Dictionary representing a JSON object

        Returns:
            Name of the content field, or None if not found
        """
        # Look for fields with these names
        content_field_names = ["content", "text", "description", "abstract", "body"]

        for name in content_field_names:
            if name in item and isinstance(item[name], str):
                return name

        # If no standard content field is found, look for the field with the longest string
        text_fields = {}
        for key, value in item.items():
            if isinstance(value, str) and len(value) > 50:
                text_fields[key] = len(value)

        if text_fields:
            # Return the field with the longest text
            return max(text_fields.items(), key=lambda x: x[1])[0]

        return None
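
A minimal usage sketch for the ingestion loader above (hypothetical: the class name `MedicalDocumentLoader` and the `config` object are illustrative stand-ins, not names confirmed by this diff):

# Hypothetical driver for the loader defined above.
loader = MedicalDocumentLoader(config)          # class/config names assumed
docs = loader.load_directory("data/knowledge")  # path is illustrative
print(f"Loaded {len(docs)} documents")
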
agents/rag_agent/document_processor.py
ADDED
@@ -0,0 +1,507 @@
import re
import uuid
import logging
from typing import List, Dict, Any, Optional, Tuple
from pathlib import Path
import hashlib
from datetime import datetime
import nltk
from nltk.tokenize import sent_tokenize
import numpy as np
from langchain.schema import Document

class MedicalDocumentProcessor:
    """
    Processes ingested medical/mental health documents: chunking, embedding, and metadata enrichment.
    """
    def __init__(self, config, embedding_model):
        self.logger = logging.getLogger(__name__)
        required_attrs = ["chunk_size", "chunk_overlap", "processed_docs_dir"]
        for attr in required_attrs:
            if not hasattr(config.rag, attr):
                raise ValueError(f"Missing '{attr}' in config.rag. Please add it to config/rag.yaml.")

        self.embedding_model = embedding_model
        self.chunk_size = config.rag.chunk_size
        self.chunk_overlap = config.rag.chunk_overlap
        self.processed_docs_dir = Path(config.rag.processed_docs_dir)
        self.processed_docs_dir.mkdir(parents=True, exist_ok=True)
        self.chunking_strategy = getattr(config.rag, "chunking_strategy", "hybrid")
        self.logger.info(f"Using chunking strategy: {self.chunking_strategy}")

        # Mental health-specific document type patterns
        self.document_type_patterns = {
            "mental_health_tip": re.compile(r"(?i)(stress|anxiety|depression|coping|therapy|mindfulness|prevention|tip|advice|support)"),
            "clinical_note": re.compile(r"(?i)(chief complaint|history of present illness|hpi|past medical history|pmh|medications|assessment|plan|review of systems|ros|physical examination|lab results|imaging|impression|followup)"),
            "patient_record": re.compile(r"(?i)(patient information|demographics|vital signs|allergies|immunizations|family history|social history|surgical history|problem list)"),
            "treatment_guidelines": re.compile(r"(?i)(recommendations|guidelines|protocols|indications|contraindications|dosage|administration|monitoring|special populations)"),
            "pharmacology": re.compile(r"(?i)(mechanism of action|pharmacokinetics|pharmacodynamics|dosing|adverse effects|warnings|interactions|storage|pregnancy considerations)"),
            "general_medical": re.compile(r"(?i)(medical|health|wellness|nutrition|exercise|lifestyle|prevention|diagnosis|treatment|symptom|condition)"),
        }

        self.section_headers = [
            r"^(stress|anxiety|depression|coping|therapy|mindfulness|tip|advice|support)",
            r"^(chief complaint|history of present illness|hpi|past medical history|pmh|medications|assessment|plan|review of systems|ros|physical examination|lab results|imaging|impression|followup)",
            r"^(patient information|demographics|vital signs|allergies|immunizations|family history|social history|surgical history|problem list)",
            r"^(recommendations|guidelines|protocols|indications|contraindications|dosage|administration|monitoring|special populations)",
            r"^(mechanism of action|pharmacokinetics|pharmacodynamics|dosing|adverse effects|warnings|interactions|storage|pregnancy considerations)",
            r"^(medical|health|wellness|nutrition|exercise|lifestyle|prevention|diagnosis|treatment|symptom|condition)",
        ]
        filtered_headers = [header for header in self.section_headers if header.strip()]
        self.section_pattern = re.compile(f"({'|'.join(filtered_headers)})", re.IGNORECASE)

        # Mental health entity patterns
        self.medical_entity_categories = {
            "mental_health": r"(stress|anxiety|depression|suicide|coping|therapy|counseling|mindfulness|panic|self[- ]harm|hopelessness|support group|resilience|burnout|well-being|mental health)",
            "symptom": r"(headache|fatigue|insomnia|mood swings|irritability|concentration issues|memory problems|appetite changes|sleep disturbances|social withdrawal)",
            "treatment": r"(medication|therapy|cognitive behavioral therapy|CBT|dialectical behavior therapy|DBT|exposure therapy|medication management|psychiatric evaluation|support group|mindfulness training)",
            "diagnosis": r"(bipolar disorder|schizophrenia|post-traumatic stress disorder|PTSD|obsessive[- ]compulsive disorder|OCD|generalized anxiety disorder|GAD|major depressive disorder|MDD|panic disorder|social anxiety disorder|SAD)",
            "risk_factor": r"(genetic predisposition|family history|trauma|substance abuse|chronic illness|stressful life events|social isolation|poor coping skills|low resilience|lack of support)",
            "intervention": r"(cognitive restructuring|exposure therapy|mindfulness meditation|relaxation techniques|stress management|problem-solving skills|assertiveness training|social skills training|self-care strategies|crisis intervention)",
        }
        all_patterns = [f"(?P<{cat}>{pat})" for cat, pat in self.medical_entity_categories.items()]
        self.medical_entity_pattern = re.compile("|".join(all_patterns), re.IGNORECASE)

    def process_documents(self, documents: List[Document]) -> List[Document]:
        """Process and chunk documents."""
        all_chunks = []

        for doc in documents:
            # Get content and metadata
            content = doc.page_content if hasattr(doc, 'page_content') else str(doc)
            metadata = doc.metadata if hasattr(doc, 'metadata') else {}

            # Create chunks based on the configured strategy
            if self.chunking_strategy == "hybrid":
                chunks = self._hybrid_chunking(content, metadata)
            elif self.chunking_strategy == "semantic":
                chunks = self._semantic_chunking(content, metadata)
            else:
                chunks = self._fixed_chunking(content, metadata)

            # Add embeddings to chunks that do not have one yet
            for chunk in chunks:
                if not chunk.metadata.get('embedding'):
                    embedding = self.embedding_model.encode(chunk.page_content)
                    chunk.metadata['embedding'] = embedding.tolist()

            all_chunks.extend(chunks)

        self.logger.info(f"Processed {len(all_chunks)} chunks from {len(documents)} documents")
        return all_chunks

    def _detect_document_type(self, text: str) -> str:
        """
        Detect the type of medical document based on content patterns.

        Args:
            text: Document text

        Returns:
            Document type string
        """
        type_scores = {}

        # Check each document type pattern
        for doc_type, pattern in self.document_type_patterns.items():
            matches = pattern.findall(text)
            type_scores[doc_type] = len(matches)

        # Return the document type with the highest number of matches
        if max(type_scores.values(), default=0) > 0:
            return max(type_scores.items(), key=lambda x: x[1])[0]

        # Default to general if there is no clear type
        return "general_medical"

    def _split_into_paragraphs(self, text: str, section_name: str) -> List[Tuple[str, str, str]]:
        """
        Split text into paragraph-level chunks.

        Args:
            text: Text to split
            section_name: Name of the section

        Returns:
            List of (chunk_text, section_name, level) tuples
        """
        paragraphs = re.split(r'\n\s*\n', text)
        chunks = []

        for para in paragraphs:
            if not para.strip():
                continue

            # Check whether the paragraph is too large
            if len(para.split()) > self.chunk_size:
                # Split it further into sentences
                sentences = sent_tokenize(para)
                current_chunk = []
                current_length = 0

                for sentence in sentences:
                    sentence_length = len(sentence.split())

                    if current_length + sentence_length > self.chunk_size and current_chunk:
                        # Flush the current chunk
                        chunk_text = " ".join(current_chunk)
                        chunks.append((chunk_text, section_name, "paragraph"))
                        current_chunk = []
                        current_length = 0

                    current_chunk.append(sentence)
                    current_length += sentence_length

                # Add the final chunk if not empty
                if current_chunk:
                    chunk_text = " ".join(current_chunk)
                    chunks.append((chunk_text, section_name, "paragraph"))
            else:
                chunks.append((para.strip(), section_name, "paragraph"))

        return chunks

    def _create_sliding_window_chunks(self, text: str) -> List[Tuple[str, str, str]]:
        """
        Create overlapping chunks using a sliding window approach.

        Args:
            text: Document text

        Returns:
            List of (chunk_text, section_name, level) tuples
        """
        sentences = sent_tokenize(text)
        chunks = []

        # If there are very few sentences, return the text as one chunk
        if len(sentences) <= 3:
            return [(text, "full_document", "document")]

        # Calculate the stride (number of sentences to slide the window),
        # assuming roughly 20 words per sentence
        stride = max(1, (self.chunk_size - self.chunk_overlap) // 20)

        # Create chunks with a sliding window
        for i in range(0, len(sentences), stride):
            # Determine the end index for the current window
            window_size = min(i + max(3, self.chunk_size // 20), len(sentences))

            # Get the text for the current window
            window_text = " ".join(sentences[i:window_size])

            # Detect the current section if possible
            section_match = self.section_pattern.search(window_text)
            section_name = section_match.group(0) if section_match else "sliding_window"

            chunks.append((window_text, section_name, "sliding"))

        return chunks

    def _create_recursive_chunks(self, text: str) -> List[Tuple[str, str, str]]:
        """
        Create hierarchical chunks at different levels of granularity.

        Args:
            text: Document text

        Returns:
            List of (chunk_text, section_name, level) tuples
        """
        chunks = []

        # Level 1: document-level chunk (if not too large)
        if len(text.split()) <= self.chunk_size * 2:
            chunks.append((text, "full_document", "document"))

        # Level 2: section-level chunks
        section_matches = list(self.section_pattern.finditer(text))

        if section_matches:
            for i in range(len(section_matches)):
                start_pos = section_matches[i].start()
                section_name = text[section_matches[i].start():section_matches[i].end()].strip()

                # Determine where the section ends
                if i < len(section_matches) - 1:
                    end_pos = section_matches[i + 1].start()
                else:
                    end_pos = len(text)

                section_text = text[start_pos:end_pos].strip()

                # Add the section as a chunk
                if section_text and len(section_text.split()) <= self.chunk_size:
                    chunks.append((section_text, section_name, "section"))

                # Level 3: paragraph-level chunks
                paragraphs = re.split(r'\n\s*\n', section_text)

                for para in paragraphs:
                    if para.strip() and len(para.split()) <= self.chunk_size:
                        chunks.append((para.strip(), section_name, "paragraph"))

                        # Level 4: sentence-level chunks for important sentences
                        if self._contains_important_entities(para):
                            sentences = sent_tokenize(para)
                            for sentence in sentences:
                                if self._contains_important_entities(sentence):
                                    chunks.append((sentence.strip(), section_name, "sentence"))
        else:
            # No clear sections; fall back to paragraphs
            paragraphs = re.split(r'\n\s*\n', text)

            for para in paragraphs:
                if para.strip() and len(para.split()) <= self.chunk_size:
                    chunks.append((para.strip(), "paragraph", "paragraph"))

        return chunks

    def _embed_chunks(self, chunks: List[Document]) -> List[Document]:
        """Add embeddings to chunks."""
        for chunk in chunks:
            if chunk.page_content:
                embedding = self.embedding_model.encode(chunk.page_content)
                chunk.metadata['embedding'] = embedding.tolist()
        return chunks

    def _hybrid_chunking(self, content: str, metadata: Dict) -> List[Document]:
        """Hybrid chunking combining semantic and fixed-size approaches."""
        # First apply semantic chunking
        semantic_chunks = self._semantic_chunking(content, metadata)

        # Then apply fixed-size chunking to large semantic chunks
        final_chunks = []
        for chunk in semantic_chunks:
            if len(chunk.page_content) > self.chunk_size * 2:
                # Break down large chunks
                sub_chunks = self._fixed_chunking(chunk.page_content, chunk.metadata)
                final_chunks.extend(sub_chunks)
            else:
                final_chunks.append(chunk)

        # Add embeddings to all chunks
        final_chunks = self._embed_chunks(final_chunks)

        return final_chunks

    def _semantic_chunking(self, content: str, metadata: Dict) -> List[Document]:
        """Chunk based on semantic boundaries (paragraphs, sections)."""
        # Split on double newlines for paragraphs
        paragraphs = content.split('\n\n')
        chunks = []
        current_chunk = ""

        for para in paragraphs:
            if len(current_chunk) + len(para) < self.chunk_size:
                current_chunk += para + "\n\n"
            else:
                if current_chunk:
                    chunks.append(Document(
                        page_content=current_chunk.strip(),
                        metadata=metadata.copy()
                    ))
                current_chunk = para + "\n\n"

        if current_chunk:
            chunks.append(Document(
                page_content=current_chunk.strip(),
                metadata=metadata.copy()
            ))

        return chunks

    def _fixed_chunking(self, content: str, metadata: Dict) -> List[Document]:
        """Fixed-size chunking with overlap."""
        chunks = []
        start = 0

        while start < len(content):
            end = start + self.chunk_size
            chunk_text = content[start:end]

            # Ensure we don't cut in the middle of a word
            if end < len(content) and not content[end].isspace():
                # Find the last space before the end
                last_space = chunk_text.rfind(' ')
                if last_space > 0:
                    end = start + last_space
                    chunk_text = content[start:end]

            chunks.append(Document(
                page_content=chunk_text.strip(),
                metadata=metadata.copy()
            ))

            # Advance the window; the max() guard prevents an infinite loop when
            # the word-boundary adjustment shrinks the chunk below the overlap.
            start = max(end - self.chunk_overlap, start + 1)

        return chunks
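
    # Illustrative arithmetic for the fixed-size pass above: with
    # chunk_size=1000 and chunk_overlap=200 (character counts here), a
    # 2,400-character text yields windows starting near 0, 800, and 1600,
    # because each iteration sets start = end - chunk_overlap and therefore
    # re-reads roughly the last 200 characters of the previous chunk.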

    def _chunk_by_sentences(self, text: str, section_name: str, chunk_size: int) -> List[Tuple[str, str, str]]:
        """
        Create chunks by grouping sentences while respecting chunk size.

        Args:
            text: Text to chunk
            section_name: Name of the section
            chunk_size: Maximum chunk size in words

        Returns:
            List of (chunk_text, section_name, level) tuples
        """
        sentences = sent_tokenize(text)
        chunks = []
        current_chunk = []
        current_length = 0

        for sentence in sentences:
            sentence_words = sentence.split()
            sentence_length = len(sentence_words)

            # If adding this sentence exceeds the chunk size and we already have content
            if current_length + sentence_length > chunk_size and current_chunk:
                # Save the current chunk
                chunk_text = " ".join(current_chunk)
                chunks.append((chunk_text, section_name, "sentences"))

                # Start a new chunk with overlap, keeping the last sentences
                # so the boundary doesn't split mid-thought
                overlap_sentences = min(2, len(current_chunk))
                current_chunk = current_chunk[-overlap_sentences:]
                current_length = len(" ".join(current_chunk).split())

            # Add the sentence to the current chunk
            current_chunk.append(sentence)
            current_length += sentence_length

        # Add the final chunk if not empty
        if current_chunk:
            chunk_text = " ".join(current_chunk)
            chunks.append((chunk_text, section_name, "sentences"))

        return chunks

    def _contains_important_entities(self, text: str) -> bool:
        """
        Check whether text contains important medical entities.

        Args:
            text: Text to check

        Returns:
            Boolean indicating presence of important entities
        """
        entities = self._extract_medical_entities(text)
        return len(entities) > 0

    def _calculate_chunk_importance(self, text: str, position: int, total_chunks: int) -> float:
        """
        Calculate an importance score for a chunk based on several factors.

        Args:
            text: Chunk text
            position: Position in the document
            total_chunks: Total number of chunks

        Returns:
            Importance score between 0 and 1
        """
        # Extract entities and count them
        entities = self._extract_medical_entities(text)
        entity_count = len(entities)

        # Calculate entity density (entities per ~100 words)
        word_count = len(text.split())
        entity_density = entity_count / max(1, word_count / 100)

        # Position importance: first and last chunks often contain key information
        position_score = 0.0
        if position == 0 or position == total_chunks - 1:
            position_score = 0.2
        elif position < total_chunks * 0.2 or position > total_chunks * 0.8:
            position_score = 0.1

        # Check for important keywords
        keyword_score = 0.0
        important_keywords = ["significant", "important", "critical", "essential", "key",
                              "finding", "diagnosis", "recommend", "conclude", "summary"]
        for keyword in important_keywords:
            if re.search(r"\b" + re.escape(keyword) + r"\b", text, re.IGNORECASE):
                keyword_score += 0.05
        keyword_score = min(0.2, keyword_score)

        # Combine the scores
        importance_score = min(1.0, 0.3 * entity_density + position_score + keyword_score)

        return importance_score

    def _extract_medical_entities(self, text: str) -> Dict[str, List[str]]:
        """
        Extract medical entities from text by category.

        Args:
            text: Input text

        Returns:
            Dictionary of categorized medical entities
        """
        categorized_entities = {}

        for category, pattern in self.medical_entity_categories.items():
            category_pattern = re.compile(pattern, re.IGNORECASE)
            matches = set(m.group(0).lower() for m in category_pattern.finditer(text))
            if matches:
                categorized_entities[category] = list(matches)

        return categorized_entities

    def _save_processed_chunks(self, doc_id: str, chunks: List[Dict[str, Any]]):
        """
        Save processed chunks to disk for potential reuse.

        Args:
            doc_id: Document identifier
            chunks: List of processed chunks
        """
        try:
            import json

            # Create the filename
            filename = f"{doc_id}_processed.json"
            filepath = self.processed_docs_dir / filename

            # Save chunks without embeddings (to save space)
            chunks_without_embeddings = []
            for chunk in chunks:
                chunk_copy = chunk.copy()
                # Drop the embedding; it is large and can be regenerated
                chunk_copy.pop("embedding", None)
                chunks_without_embeddings.append(chunk_copy)

            with open(filepath, 'w') as f:
                json.dump(chunks_without_embeddings, f)

            self.logger.info(f"Saved processed chunks to {filepath}")
        except Exception as e:
            self.logger.warning(f"Failed to save processed chunks: {e}")

    def batch_process_documents(self, documents: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Process a batch of documents.

        Args:
            documents: List of dictionaries with 'content' and 'metadata' keys

        Returns:
            List of processed document chunks with embeddings
        """
        all_processed_chunks = []

        for doc in documents:
            try:
                # Wrap the raw dict in a Document so it goes through the
                # standard chunking/embedding pipeline defined above.
                processed_chunks = self.process_documents(
                    [Document(page_content=doc["content"], metadata=doc["metadata"])]
                )
                all_processed_chunks.extend(processed_chunks)
            except Exception as e:
                self.logger.error(f"Error processing document: {e}")
                # Continue with the next document
                continue

        return all_processed_chunks
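
A minimal usage sketch for the processor above (hypothetical: `config` must expose the `config.rag.*` attributes checked in `__init__`; the SentenceTransformer checkpoint is an assumed choice, and any embedder with an `encode` method returning an array fits what the class calls):

from sentence_transformers import SentenceTransformer
from langchain.schema import Document

embedder = SentenceTransformer("all-MiniLM-L6-v2")        # assumed embedding model
processor = MedicalDocumentProcessor(config, embedder)    # config is a stand-in
chunks = processor.process_documents([
    Document(page_content="Mindfulness practice can reduce stress over time.",
             metadata={"source": "tips.md", "type": "markdown"})
])
print(len(chunks), "chunks; first has embedding:", "embedding" in chunks[0].metadata)
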
agents/rag_agent/query_processor.py
ADDED
@@ -0,0 +1,142 @@
import logging
import re
import uuid
from typing import List, Dict, Any, Optional, Tuple, Union
from datetime import datetime

class QueryProcessor:
    """
    Processor for mental health queries with entity extraction and specialty detection.
    """
    def __init__(self, config, embedding_model):
        self.logger = logging.getLogger(__name__)
        self.embedding_model = embedding_model
        self.config = config

        # Mental health entity patterns only
        self.medical_entity_categories = {
            "mental_health": r"(stress|anxiety|depression|suicide|coping|therapy|counseling|mindfulness|panic|self[- ]harm|hopelessness|support group|resilience|burnout|well-being|mental health)",
            "symptom": r"(insomnia|mood swings|irritability|concentration issues|memory problems|appetite changes|sleep disturbances|social withdrawal|fatigue|hopelessness)",
            "treatment": r"(medication|therapy|cognitive behavioral therapy|CBT|dialectical behavior therapy|DBT|exposure therapy|medication management|psychiatric evaluation|support group|mindfulness training)",
            "diagnosis": r"(bipolar disorder|schizophrenia|post-traumatic stress disorder|PTSD|obsessive[- ]compulsive disorder|OCD|generalized anxiety disorder|GAD|major depressive disorder|MDD|panic disorder|social anxiety disorder|SAD)",
            "risk_factor": r"(genetic predisposition|family history|trauma|substance abuse|chronic illness|stressful life events|social isolation|poor coping skills|low resilience|lack of support)",
            "intervention": r"(cognitive restructuring|exposure therapy|mindfulness meditation|relaxation techniques|stress management|problem-solving skills|assertiveness training|social skills training|self-care strategies|crisis intervention)"
        }

        # Mental health specialty keywords only
        self.specialty_keywords = {
            "psychiatry": [
                "mental health", "depression", "anxiety", "psychiatric", "disorder",
                "schizophrenia", "bipolar", "therapy", "behavioral", "psychological",
                "stress", "counseling", "mindfulness", "panic", "self-harm", "support group",
                "resilience", "burnout", "well-being"
            ]
        }

    def process_query(self, query: str) -> Dict[str, Any]:
        """Process a query to extract metadata and intent; this is not the full RAG pipeline."""
        try:
            # Extract medical entities
            entities = self._extract_medical_entities(query)

            # Determine the query intent
            intent = self._determine_query_intent(query)

            # Normalize the query to handle typos and variations
            normalized_query = self._normalize_query(query)

            # Determine the medical specialty
            specialty = self._detect_specialty(query)

            # Create metadata/filters for retrieval
            filters = {
                'query_id': str(uuid.uuid4()),
                'timestamp': datetime.now().isoformat(),
                'query_intent': intent,
                'medical_entities': entities,
                'specialty': specialty
            }

            self.logger.info(f"Processed query with filters: {filters}")

            return {
                'filters': filters,
                'intent': intent,
                'entities': entities,
                'expanded_query': self._expand_query(query)
            }

        except Exception as e:
            self.logger.error(f"Error processing query: {e}")
            return self._get_default_metadata()

    def _normalize_query(self, query: str) -> str:
        """Normalize a query to handle common typos and variations."""
        # Convert to lowercase
        normalized = query.lower()

        # Fix common typos and abbreviations
        typo_corrections = {
            r'\brad\b': 'road',  # rad -> road
            r'\bdepressed\b': 'depression',
            r'\bim\b': "i'm",
            r'\bu\b': 'you',
            r'\bur\b': 'your',
        }

        for pattern, replacement in typo_corrections.items():
            normalized = re.sub(pattern, replacement, normalized)

        return normalized

    def _expand_query(self, query: str) -> str:
        """Expand a query with related terms."""
        expansions = {
            "depression": "depression mood sadness hopelessness",
            "anxiety": "anxiety worry nervousness panic",
            "stress": "stress pressure tension burnout",
            "therapy": "therapy counseling psychotherapy CBT DBT",
            "suicide": "suicide self-harm hopelessness crisis",
            "support": "support group counseling help"
        }
        expanded = query
        for term, expansion in expansions.items():
            if re.search(r"\b" + re.escape(term) + r"\b", query.lower()):
                expanded = f"{expanded} {expansion}"
        return expanded
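
    # For example, _expand_query("I can't handle this stress") returns
    # "I can't handle this stress stress pressure tension burnout",
    # since "stress" matches an entry in the expansion table above.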

    def _extract_medical_entities(self, text: str) -> Dict[str, List[str]]:
        """Extract categorized medical entities from text."""
        categorized_entities = {}
        for category, pattern in self.medical_entity_categories.items():
            category_pattern = re.compile(pattern, re.IGNORECASE)
            matches = set(m.group(0).lower() for m in category_pattern.finditer(text))
            if matches:
                categorized_entities[category] = list(matches)
        return categorized_entities

    def _detect_specialty(self, text: str) -> Optional[str]:
        """Detect the medical specialty addressed by the text."""
        text_lower = text.lower()
        for specialty, keywords in self.specialty_keywords.items():
            for keyword in keywords:
                if re.search(r"\b" + re.escape(keyword.lower()) + r"\b", text_lower):
                    return specialty
        return None

    def _determine_query_intent(self, text: str) -> str:
        """Determine the intent of the query."""
        text_lower = text.lower()
        if re.search(r"\b(what is|define|explain|describe|meaning of)\b", text_lower):
            return "definition"
        elif re.search(r"\b(treat|therapy|medication|cure|manage|drug|prescription)\b", text_lower):
            return "treatment"
        elif re.search(r"\b(diagnose|diagnostic|symptom|sign|identify|determine)\b", text_lower):
            return "diagnosis"
        elif re.search(r"\b(prevent|preventive|avoid|risk factor|reduction)\b", text_lower):
            return "prevention"
        return "general_information"

    def _get_default_metadata(self) -> Dict[str, Any]:
        """Neutral fallback used when query processing fails (minimal assumed implementation)."""
        # Mirrors the structure returned by process_query on success.
        return {
            'filters': {
                'query_id': str(uuid.uuid4()),
                'timestamp': datetime.now().isoformat(),
                'query_intent': 'general_information',
                'medical_entities': {},
                'specialty': None
            },
            'intent': 'general_information',
            'entities': {},
            'expanded_query': ''
        }
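
A minimal usage sketch for the query processor above (`config` and `embedding_model` are stored but never used by `process_query` in the code shown, so `None` suffices for a quick check):

qp = QueryProcessor(config=None, embedding_model=None)
result = qp.process_query("What is generalized anxiety disorder and how is it treated?")
# result['intent']   -> "definition" (the "what is" pattern matches first)
# result['entities'] -> includes a "diagnosis" match for "generalized anxiety disorder"
# result['filters']['specialty'] -> "psychiatry"
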
agents/rag_agent/reranker.py
ADDED
@@ -0,0 +1,69 @@
import logging
from typing import List, Dict, Any
from sentence_transformers import CrossEncoder
from langchain_core.documents import Document

class Reranker:
    """
    Reranks retrieved documents using a cross-encoder model for more accurate results.
    """
    def __init__(self, config):
        """
        Initialize the reranker with configuration.

        Args:
            config: Configuration object containing reranker settings
        """
        self.logger = logging.getLogger(__name__)
        try:
            if not hasattr(config.rag, "reranker_model"):
                raise ValueError("Missing 'reranker_model' in config.rag. Please add it to config/rag.yaml.")
            self.model_name = config.rag.reranker_model
            self.logger.info(f"Loading reranker model: {self.model_name}")
            self.model = CrossEncoder(self.model_name)
            self.top_k = getattr(config.rag, "reranker_top_k", 5)
        except Exception as e:
            self.logger.error(f"Error loading reranker model: {e}")
            raise

    def rerank(self, query: str, documents: List[Document], top_k: int = 5) -> List[Document]:
        """
        Rerank documents based on relevance to the query.

        Args:
            query: The user query
            documents: List of Document objects to rerank
            top_k: Number of top documents to return

        Returns:
            List of reranked Document objects
        """
        if not documents:
            return []

        try:
            # Create (query, document_content) pairs for the cross-encoder
            pairs = []
            for doc in documents:
                # Support both langchain Documents and plain strings
                content = doc.page_content if hasattr(doc, 'page_content') else str(doc)
                pairs.append([query, content])

            # Get relevance scores from the cross-encoder
            scores = self.model.predict(pairs)

            # Combine documents with their scores
            doc_scores = list(zip(documents, scores))

            # Sort by score (descending)
            doc_scores.sort(key=lambda x: x[1], reverse=True)

            # Return the top_k documents
            reranked_docs = [doc for doc, score in doc_scores[:top_k]]

            self.logger.info(f"Reranked {len(documents)} documents, returning top {len(reranked_docs)}")
            return reranked_docs

        except Exception as e:
            self.logger.error(f"Error during reranking: {e}")
            self.logger.warning("Falling back to original ranking")
            return documents[:top_k]
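
A minimal usage sketch for the reranker above (the CrossEncoder checkpoint name is an assumed example, not read from this repo's rag.yaml; SimpleNamespace stands in for the real config object):

from types import SimpleNamespace
from langchain_core.documents import Document

cfg = SimpleNamespace(rag=SimpleNamespace(reranker_model="cross-encoder/ms-marco-MiniLM-L-6-v2"))
reranker = Reranker(cfg)
docs = [Document(page_content=t) for t in
        ["Breathing exercises can calm acute anxiety.", "A short history of aspirin."]]
top = reranker.rerank("how do I calm anxiety", docs, top_k=1)  # should keep the breathing doc
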
agents/rag_agent/response_generator.py
ADDED
@@ -0,0 +1,326 @@
# File: agents/rag_agent/response_generator.py

import logging
from typing import List, Dict, Any, Optional
from langchain_core.documents import Document

class ResponseGenerator:
    """
    Generates structured responses with empathy, solution, and recommendations using the RAG pipeline.
    """
    def __init__(self, config=None, llm=None):
        """Initialize ResponseGenerator with an optional config and LLM."""
        self.logger = logging.getLogger(__name__)
        self.config = config
        self.llm = llm

        # Set defaults, from config when provided
        if config and hasattr(config, 'rag'):
            self.max_context_length = getattr(config.rag, "max_context_length", 2048)
            self.include_sources = getattr(config.rag, "include_sources", True)
        else:
            self.max_context_length = 2048
            self.include_sources = True

    def generate_response(self, query: str, retrieved_docs: List[Any],
                          chat_history: Optional[str] = None,
                          user_emotion: Optional[str] = None,
                          mental_health_status: Optional[str] = None,
                          user_context: Optional[Dict] = None) -> Dict[str, Any]:
        """Generate a structured response using the RAG pipeline, with a guaranteed structure."""

        try:
            print(f"[ResponseGenerator] Processing: {query[:50]}...")
            print(f"[ResponseGenerator] Emotion: {user_emotion}, Status: {mental_health_status}")

            # Extract sources from the documents
            sources = self._extract_sources(retrieved_docs)

            # Build context from the retrieved documents
            context = self._build_context_from_docs(retrieved_docs)

            # Get user info
            emotion = user_emotion or "neutral"
            status = mental_health_status or "Unknown"
            message_count = user_context.get('message_count', 1) if user_context else 1

            # Try the RAG-enhanced structured response first
            try:
                print("[ResponseGenerator] Generating RAG-enhanced structured response...")
                response_text = self._generate_rag_structured_response(
                    query, context, emotion, status, message_count
                )

                # Verify that the structure is present
                if self._verify_response_structure(response_text):
                    print("[ResponseGenerator] RAG response has complete structure")
                else:
                    print("[ResponseGenerator] RAG response missing structure, enhancing...")
                    response_text = self._enhance_with_guaranteed_structure(
                        response_text, query, emotion, status
                    )

            except Exception as llm_error:
                print(f"[ResponseGenerator] LLM generation failed: {llm_error}")
                print("[ResponseGenerator] Using guaranteed structured fallback...")
                response_text = self._build_structured_response(query, emotion, status)

            confidence = self._calculate_confidence(sources)

            print(f"[ResponseGenerator] Final response: {response_text[:100]}...")

            return {
                "response": response_text,
                "sources": sources,
                "confidence": confidence
            }

        except Exception as e:
            self.logger.error(f"Error generating response: {e}")
            return self._generate_guaranteed_structure(query, user_emotion, mental_health_status, user_context)

    def _build_context_from_docs(self, retrieved_docs: List[Any]) -> str:
        """Build context from the RAG pipeline's retrieved documents."""
        context_parts = []

        for i, doc in enumerate(retrieved_docs[:3]):
            content = ""
            if hasattr(doc, 'page_content'):
                content = doc.page_content
            elif isinstance(doc, dict):
                content = doc.get('content', doc.get('page_content', ''))
            else:
                content = str(doc)

            if content:
                # Truncate for the context window
                truncated_content = content[:400] + "..." if len(content) > 400 else content
                context_parts.append(f"[Document {i+1}]\n{truncated_content}")

        return "\n\n".join(context_parts) if context_parts else "No specific context available."

    def _generate_rag_structured_response(self, query: str, context: str, emotion: str, status: str, message_count: int) -> str:
        """Generate a response using RAG context with a structured prompt."""

        if not self.llm:
            # Fall back if no LLM is provided
            return self._build_structured_response(query, emotion, status)

        structured_prompt = f"""You are a compassionate mental health support assistant. Using the provided context, create a response with EXACTLY 3 sections:

USER QUERY: "{query}"
USER EMOTION: {emotion}
MENTAL HEALTH STATUS: {status}
MESSAGE COUNT: {message_count}

CONTEXT FROM RAG PIPELINE:
{context}

CRITICAL: Your response MUST have ALL THREE sections in this order:

1. EMPATHY/ACKNOWLEDGEMENT (Start with "I understand..." or "I hear..." or "I can see..."):
   - Acknowledge their specific feelings from the query
   - Validate their experience
   - Show understanding and support

2. SOLUTION/INFORMATION (Include words like "can help", "try", "practice", "research shows"):
   - Use the context to provide relevant information about their concern
   - Explain what might be happening or why they feel this way
   - Offer evidence-based insights or coping strategies

3. RECOMMENDATIONS (Include words like "recommend", "consider", "suggest"):
   - Give concrete next steps based on their status ({status})
   - Suggest professional help if needed
   - Provide specific actions they can take

Use the RAG context to make your response more informative and specific. Keep it 6-9 sentences total. Be warm and conversational, not clinical.

Response:"""

        try:
            response = self.llm.invoke(structured_prompt)
            return response.content if hasattr(response, 'content') else str(response)
        except Exception as e:
            print(f"Error invoking LLM: {e}")
            return self._build_structured_response(query, emotion, status)

    def _verify_response_structure(self, response_text: str) -> bool:
        """Verify that the response has all three required sections."""

        # Check for empathy keywords
        has_empathy = any(word in response_text.lower() for word in [
            'understand', 'hear', 'see', 'sorry', 'valid', 'difficult', 'acknowledge'
        ])

        # Check for solution keywords
        has_solution = any(word in response_text.lower() for word in [
            'try', 'practice', 'can help', 'technique', 'strategy', 'approach',
            'research shows', 'studies', 'evidence'
        ])

        # Check for recommendation keywords
        has_recommendations = any(word in response_text.lower() for word in [
            'recommend', 'consider', 'suggest', 'professional', 'counselor',
            'therapist', 'healthcare'
        ])

        print(f"[Structure Check] Empathy: {has_empathy}, Solution: {has_solution}, Recommendations: {has_recommendations}")

        return has_empathy and has_solution and has_recommendations

    def _enhance_with_guaranteed_structure(self, partial_response: str, query: str, emotion: str, status: str) -> str:
        """Enhance a partial response to ensure complete structure."""

        # Analyze what's missing and add it
        has_empathy = any(word in partial_response.lower() for word in ['understand', 'hear', 'sorry', 'valid'])
        has_solution = any(word in partial_response.lower() for word in ['try', 'practice', 'can help', 'strategy'])
        has_recommendations = any(word in partial_response.lower() for word in ['recommend', 'consider', 'suggest'])

        enhanced_parts = []

        # Add empathy if missing
        if not has_empathy:
            empathy = self._generate_empathy_section(query, emotion)
            enhanced_parts.append(empathy)

        # Add the existing response
        enhanced_parts.append(partial_response)

        # Add a solution if missing
        if not has_solution:
            solution = self._generate_solution_section(query, emotion)
            enhanced_parts.append(solution)

        # Add recommendations if missing
        if not has_recommendations:
            recommendations = self._generate_recommendations_section(query, status)
            enhanced_parts.append(recommendations)

        return " ".join(enhanced_parts)

    def _generate_empathy_section(self, query: str, emotion: str) -> str:
        """Generate the empathy section based on the query."""
        query_lower = query.lower()

        if "stress" in query_lower and ("school" in query_lower or "work" in query_lower):
            return "I understand that you're feeling overwhelmed by academic/work pressure, and these feelings are completely valid."
        elif "anxiety" in query_lower:
            return "I hear that anxiety is making things really challenging for you right now."
        elif "sad" in query_lower or "depressed" in query_lower:
            return "I can see that you're going through a difficult time with these heavy feelings."
        else:
            return f"I understand that you're dealing with {emotion} feelings, and I want you to know your experience is valid."

    def _generate_solution_section(self, query: str, emotion: str) -> str:
        """Generate the solution section based on the query."""
        query_lower = query.lower()

        if "stress" in query_lower and "school" in query_lower:
            return "Academic stress can be managed through time management techniques and breaking large tasks into smaller, manageable steps."
        elif "anxiety" in query_lower:
            return "Anxiety can be helped through breathing techniques and grounding exercises that activate your body's relaxation response."
        else:
            return "There are proven strategies that can help you manage these feelings and improve your well-being over time."

    def _generate_recommendations_section(self, query: str, status: str) -> str:
        """Generate recommendations based on status and query."""

        if status == "Severe":
            return "I strongly recommend reaching out to a mental health professional immediately, and consider calling 988 if you need crisis support."
        elif "school" in query.lower():
            return "Consider speaking with a school counselor and practicing stress-reduction techniques like regular breaks and exercise."
        else:
            return "I recommend considering professional support and incorporating daily stress-reduction activities into your routine."

    def _extract_sources(self, retrieved_docs: List[Any]) -> List[Dict]:
        """Extract sources from the retrieved documents."""
        sources = []

        for i, doc in enumerate(retrieved_docs[:3]):
            if hasattr(doc, 'page_content'):
                content = doc.page_content
                metadata = doc.metadata if hasattr(doc, 'metadata') else {}
                source = metadata.get('source', f'Document {i+1}')
                score = metadata.get('score', 0.5)
            elif isinstance(doc, dict):
                content = doc.get('content', doc.get('page_content', ''))
                metadata = doc.get('metadata', {})
                source = metadata.get('source', doc.get('source', f'Document {i+1}'))
                score = doc.get('score', metadata.get('score', 0.5))
            else:
                content = str(doc)
                source = f'Document {i+1}'
                score = 0.5

            if content:
                sources.append({
                    "source": source,
                    "score": float(score),
                    "snippet": content[:100] + "..." if len(content) > 100 else content
                })

        return sources

    def _build_structured_response(self, query: str, emotion: str, status: str) -> str:
        """Build a guaranteed structured response (fallback method)."""

        query_lower = query.lower()

        print(f"[StructureBuilder] Building guaranteed structure for: {query_lower[:30]}...")

        # 1. EMPATHY/ACKNOWLEDGMENT
        if "sad" in query_lower and ("depressed" in query_lower or "depression" in query_lower):
            empathy = "I understand you're going through a really difficult time with sadness and depression. These feelings can be overwhelming and exhausting, and I want you to know that reaching out shows real strength."
        elif "stress" in query_lower and ("school" in query_lower or "work" in query_lower):
            empathy = "I hear that you're feeling really stressed about your school/work responsibilities. Academic and work pressure can be overwhelming, and it's completely valid to feel this way."
        elif "anxiety" in query_lower or "anxious" in query_lower:
            empathy = "I understand that anxiety can feel incredibly overwhelming and scary. What you're experiencing is very real, and your struggle with this is completely valid."
        else:
            empathy = f"I hear that you're dealing with {emotion} feelings, and I want you to know that what you're experiencing is valid and understandable."

        # 2. SOLUTION/INFORMATION
        if "stress" in query_lower and ("school" in query_lower or "work" in query_lower):
            solution = "Academic and work stress can be managed through time management techniques, breaking large tasks into smaller steps, and practicing stress-reduction activities. Research shows that regular breaks and boundary-setting can help you regain control."
        elif "anxiety" in query_lower or "anxious" in query_lower:
            solution = "Anxiety is highly treatable through various approaches including breathing techniques, grounding exercises, and cognitive strategies. Practice deep breathing (inhale for 4, hold for 4, exhale for 6) to help activate your body's relaxation response."
        elif "sad" in query_lower and ("depressed" in query_lower or "depression" in query_lower):
            solution = "Depression involves complex brain chemistry changes that affect mood, energy, and motivation. Research shows that combining professional support with self-care practices can help improve symptoms over time."
        else:
            solution = "There are proven strategies and techniques that can help you manage these feelings and improve your overall mental well-being through consistent practice and the right support."

        # 3. RECOMMENDATIONS
        if status == "Severe":
            recommendations = "I strongly recommend reaching out to a mental health professional immediately for proper assessment and support. You can also call the crisis helpline at 988 if you need immediate assistance."
        elif "school" in query_lower or "work" in query_lower:
            recommendations = "Consider speaking with a counselor about stress management, practice setting boundaries with your workload, and explore stress-reduction activities like regular exercise or meditation that fit your schedule."
        else:
            recommendations = "Consider speaking with a mental health professional for personalized guidance and support. You might also try incorporating stress-reduction activities like deep breathing exercises, regular physical activity, or journaling into your routine."

        final_response = f"{empathy} {solution} {recommendations}"
        print(f"[StructureBuilder] Built guaranteed response with {len(final_response)} characters")

        return final_response

    def _calculate_confidence(self, sources: List[Dict[str, Any]]) -> float:
        """Calculate confidence based on the sources."""
        if not sources:
            return 0.4

        scores = [s.get('score', 0) for s in sources[:3]]
        avg_score = sum(scores) / len(scores) if scores else 0.0

        if len(sources) >= 3 and avg_score > 0.5:
            return min(avg_score * 1.2, 1.0)

        return max(avg_score, 0.4)
| 317 |
+
def _generate_guaranteed_structure(self, query: str, emotion: str, status: str, user_context: Dict) -> Dict[str, Any]:
|
| 318 |
+
"""Generate fallback response with guaranteed structure."""
|
| 319 |
+
|
| 320 |
+
structured_response = self._build_structured_response(query, emotion or "concerned", status or "Unknown")
|
| 321 |
+
|
| 322 |
+
return {
|
| 323 |
+
"response": structured_response,
|
| 324 |
+
"sources": [],
|
| 325 |
+
"confidence": 0.4
|
| 326 |
+
}
|
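The fallback builder above is deterministic, so callers can rely on its output shape. A minimal usage sketch, assuming `generator` is an instance of the response-generator class these methods belong to (the instance name is hypothetical; only the method and its return keys appear in the diff):

    result = generator._generate_guaranteed_structure(
        query="I feel anxious about work",
        emotion="anxious",
        status="Mild",
        user_context={},
    )
    print(result["response"])    # empathy + solution + recommendations text
    print(result["confidence"])  # fixed fallback confidence of 0.4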
agents/rag_agent/vector_store.py
ADDED
@@ -0,0 +1,298 @@
from typing import List, Dict, Any, Optional, Union
import logging
import uuid
import numpy as np
from qdrant_client import QdrantClient
from qdrant_client.http import models as qdrant_models
from qdrant_client.http.exceptions import UnexpectedResponse

# Import Document if available, otherwise define a minimal stub
try:
    from langchain.schema import Document
except ImportError:
    class Document:
        def __init__(self, page_content, metadata):
            self.page_content = page_content
            self.metadata = metadata

class QdrantRetriever:
    """
    Handles storage and retrieval of medical documents using Qdrant vector database.
    """
    def __init__(self, config):
        """
        Initialize the Qdrant retriever with configuration.
        Args:
            config: Configuration object containing Qdrant settings
        """
        self.logger = logging.getLogger(__name__)
        self.collection_name = config.rag.collection_name
        self.embedding_dim = config.rag.embedding_dim
        self.distance_metric = config.rag.distance_metric

        # Force in-memory mode for now
        self.client = QdrantClient(":memory:")

        # # Initialize Qdrant client
        # if getattr(config.rag, "use_local", True):
        #     self.client = QdrantClient(
        #         path=config.rag.local_path
        #     )
        # else:
        #     self.client = QdrantClient(
        #         url=getattr(config.rag, "url", None),
        #         api_key=getattr(config.rag, "api_key", None),
        #     )

        # Ensure collection exists
        self._ensure_collection()

    def _ensure_collection(self):
        """Create collection if it doesn't exist."""
        try:
            collections = self.client.get_collections().collections
            collection_names = [collection.name for collection in collections]
            if self.collection_name not in collection_names:
                self.logger.info(f"Creating new collection: {self.collection_name}")
                self.client.create_collection(
                    collection_name=self.collection_name,
                    vectors_config=qdrant_models.VectorParams(
                        size=self.embedding_dim,
                        distance=self.distance_metric,
                    ),
                    optimizers_config=qdrant_models.OptimizersConfigDiff(
                        indexing_threshold=10000,
                    ),
                )
                self.logger.info(f"Collection {self.collection_name} created successfully")
        except Exception as e:
            self.logger.error(f"Error creating collection: {e}")
            raise

    def upsert_documents(self, documents: List[Dict[str, Any]]):
        """
        Insert or update documents in the vector database.
        Args:
            documents: List of document dictionaries containing:
                - id: Unique identifier
                - embedding: Vector embedding
                - metadata: Document metadata
                - content: Document content
        """
        try:
            points = []
            for doc in documents:
                points.append(
                    qdrant_models.PointStruct(
                        id=doc["id"],
                        vector=doc["embedding"],
                        payload={
                            "content": doc["content"],
                            "source": doc["metadata"].get("source", ""),
                            "specialty": doc["metadata"].get("specialty", ""),
                            "section": doc["metadata"].get("section", ""),
                            "publication_date": doc["metadata"].get("publication_date", ""),
                            "medical_entities": doc["metadata"].get("medical_entities", []),
                            "chunk_number": doc["metadata"].get("chunk_number", 0),
                            "total_chunks": doc["metadata"].get("total_chunks", 1),
                        }
                    )
                )
            self.client.upsert(
                collection_name=self.collection_name,
                points=points,
                wait=True
            )
            self.logger.info(f"Successfully upserted {len(documents)} documents")
        except Exception as e:
            self.logger.error(f"Error upserting documents: {e}")
            raise

    # Update the retrieve method to properly return Document objects:
    def retrieve(self, query_embedding: np.ndarray, top_k: int = 5, **kwargs) -> List[Document]:
        """
        Retrieve similar documents based on query embedding.

        Args:
            query_embedding: Query vector
            top_k: Number of results to return
            **kwargs: Additional parameters (for compatibility)

        Returns:
            List of Document objects
        """
        try:
            # Handle the case where query_embedding might be passed as a dict
            if isinstance(query_embedding, dict):
                # If it's a dict, it might be from query_processor;
                # extract the actual embedding.
                if 'embedding' in query_embedding:
                    query_embedding = query_embedding['embedding']
                else:
                    self.logger.error(f"Invalid query_embedding format: {type(query_embedding)}")
                    return []

            # Ensure query_embedding is a list
            if isinstance(query_embedding, np.ndarray):
                query_vector = query_embedding.tolist()
            else:
                query_vector = list(query_embedding)

            # Search in Qdrant
            search_results = self.client.search(
                collection_name=self.collection_name,
                query_vector=query_vector,
                limit=top_k
            )

            # Convert to Document objects
            documents = []
            for result in search_results:
                # Extract content from payload
                content = result.payload.get('content', '')

                # Create metadata including the score
                metadata = {k: v for k, v in result.payload.items() if k != 'content'}
                metadata['score'] = result.score
                metadata['id'] = str(result.id)

                # Create Document object
                doc = Document(
                    page_content=content,
                    metadata=metadata
                )
                documents.append(doc)

            self.logger.info(f"Retrieved {len(documents)} documents for query")
            return documents

        except Exception as e:
            self.logger.error(f"Error retrieving documents: {e}")
            import traceback
            self.logger.error(f"Traceback: {traceback.format_exc()}")
            return []

    def delete_documents(self, document_ids: List[Union[str, int]]):
        """
        Delete documents from the vector database by their IDs.
        Args:
            document_ids: List of document IDs to delete
        """
        try:
            self.client.delete(
                collection_name=self.collection_name,
                points_selector=qdrant_models.PointIdsList(
                    points=document_ids
                ),
                wait=True
            )
            self.logger.info(f"Successfully deleted {len(document_ids)} documents")
        except Exception as e:
            self.logger.error(f"Error deleting documents: {e}")
            raise

    def wipe_collection(self):
        """Completely remove and recreate the collection for a fresh start."""
        try:
            self.client.delete_collection(self.collection_name)
            self._ensure_collection()
            self.logger.info(f"Collection {self.collection_name} wiped and recreated")
        except Exception as e:
            self.logger.error(f"Error wiping collection: {e}")
            raise

    def get_collection_info(self) -> Dict:
        """Get information about the collection."""
        try:
            collection = self.client.get_collection(self.collection_name)
            return {
                "name": self.collection_name,
                "vectors_count": collection.vectors_count,
                "points_count": collection.points_count,
                "status": collection.status,
                "config": {
                    "size": collection.config.params.vectors.size,
                    "distance": collection.config.params.vectors.distance
                }
            }
        except Exception as e:
            self.logger.error(f"Error getting collection info: {e}")
            return {"error": str(e), "vectors_count": 0}

    def get_collection_stats(self) -> Dict[str, Any]:
        """
        Retrieve statistics of the collection.
        Returns:
            Dictionary containing collection statistics.
        """
        try:
            stats = self.client.get_collection(self.collection_name)
            self.logger.info(f"Collection stats retrieved successfully: {stats}")
            return stats.model_dump()
        except Exception as e:
            self.logger.error(f"Error getting collection stats: {e}")
            raise

    def add_documents(self, documents: List[Document]) -> int:
        """
        Add documents to the vector store.

        Args:
            documents: List of documents with embeddings

        Returns:
            Number of documents added
        """
        if not documents:
            return 0

        points = []
        for idx, doc in enumerate(documents):
            if not hasattr(doc, 'metadata') or 'embedding' not in doc.metadata:
                self.logger.warning(f"Document {idx} missing embedding, skipping")
                continue

            point_id = str(uuid.uuid4())
            embedding = doc.metadata['embedding']

            # Remove embedding from metadata before storing
            metadata = {k: v for k, v in doc.metadata.items() if k != 'embedding'}
            metadata['content'] = doc.page_content

            points.append(
                qdrant_models.PointStruct(
                    id=point_id,
                    vector=embedding,
                    payload=metadata
                )
            )

        if points:
            self.client.upsert(
                collection_name=self.collection_name,
                points=points
            )
            self.logger.info(f"Added {len(points)} documents to vector store")

        return len(points)

    def clear_collection(self):
        """Clear all documents from the collection."""
        try:
            # Delete and recreate the collection
            self.client.delete_collection(self.collection_name)
            self._ensure_collection()
            self.logger.info(f"Collection {self.collection_name} cleared successfully")
        except Exception as e:
            self.logger.error(f"Error clearing collection: {e}")
            raise
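QdrantRetriever reads only three fields from `config.rag`, so it can be exercised in isolation against the in-memory client. A minimal sketch under that assumption (the `SimpleNamespace` stub and the random vectors are illustrative, not part of the repo):

    import types
    import numpy as np

    # Stub only the config fields the constructor actually reads (illustrative)
    cfg = types.SimpleNamespace(rag=types.SimpleNamespace(
        collection_name="mental_health_docs",
        embedding_dim=384,
        distance_metric="Cosine",
    ))

    retriever = QdrantRetriever(cfg)
    retriever.upsert_documents([{
        "id": 1,
        "embedding": np.random.rand(384).tolist(),
        "metadata": {"source": "demo.pdf"},
        "content": "Deep breathing can reduce acute anxiety.",
    }])
    docs = retriever.retrieve(np.random.rand(384), top_k=1)
    print(docs[0].page_content, docs[0].metadata["score"])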
agents/shared_rag.py
ADDED
@@ -0,0 +1,151 @@
import logging
import threading
import time
from sentence_transformers import SentenceTransformer
from langchain_google_genai import ChatGoogleGenerativeAI
from config.config import Config
from agents.rag_agent import MedicalRAG

logger = logging.getLogger(__name__)

class LazyEmbeddingModel:
    """Lazy loading wrapper for SentenceTransformer model"""
    def __init__(self, model_name="all-MiniLM-L6-v2"):
        self.model_name = model_name
        self._model = None

    def _ensure_loaded(self):
        if self._model is None:
            logger.info(f"📊 Loading embedding model: {self.model_name}...")
            self._model = SentenceTransformer(self.model_name)
            logger.info("✅ Embedding model loaded successfully")
        return self._model

    def encode(self, *args, **kwargs):
        model = self._ensure_loaded()
        return model.encode(*args, **kwargs)

    def __getattr__(self, name):
        # Delegate all other attributes to the actual model
        model = self._ensure_loaded()
        return getattr(model, name)

class SharedRAG:
    """Singleton RAG instance for sharing across FastAPI and Flask backends."""

    _instance = None
    _lock = threading.Lock()
    _initialized = False
    _initialization_time = None

    def __new__(cls):
        if cls._instance is None:
            with cls._lock:
                if cls._instance is None:
                    cls._instance = super(SharedRAG, cls).__new__(cls)
        return cls._instance

    def __init__(self):
        if not self._initialized:
            current_time = time.time()
            logger.info("🚀 Initializing shared RAG instance...")

            try:
                # Initialize config
                self.config = Config()

                # Initialize models (lazy loading for memory optimization)
                logger.info("📊 Embedding model will be loaded when needed...")
                self.embedding_model = LazyEmbeddingModel("all-MiniLM-L6-v2")

                logger.info("🤖 Loading LLM...")
                self.llm = ChatGoogleGenerativeAI(
                    model="gemini-2.0-flash",
                    temperature=0.1,
                    max_tokens=1024
                )

                # Initialize RAG (will get embedding model when needed)
                logger.info("📚 Initializing MedicalRAG...")
                # Pass the lazy embedding model
                self.rag = MedicalRAG(self.config, self.llm, self.embedding_model)

                # Ensure knowledge is ingested
                self._ensure_knowledge_ingested()

                SharedRAG._initialized = True
                SharedRAG._initialization_time = current_time
                logger.info(f"✅ Shared RAG instance ready in {time.time() - current_time:.2f}s")

            except Exception as e:
                logger.error(f"❌ Error initializing SharedRAG: {str(e)}")
                raise e

    def get_embedding_model(self):
        """Get the embedding model (lazy loading wrapper)"""
        return self.embedding_model

    def _ensure_knowledge_ingested(self):
        """Ensure knowledge base is populated."""
        try:
            # Check if collection has documents
            collection_info = self.rag.retriever.get_collection_info()
            existing_vectors = collection_info.get('points_count', 0)

            logger.info(f"📋 Collection status: {existing_vectors} documents")

            if existing_vectors == 0:
                logger.info("📥 Empty collection, ingesting knowledge...")
                result = self.rag.ingest_knowledge_folder("knowledge")
                logger.info(f"✅ Ingestion complete: {result.get('successful', 0)} files processed")
            else:
                # Test retrieval with a simple query using lazy-loaded embedding model
                logger.info("✅ Knowledge base ready (testing embedding on-demand)")
                # The embedding model will load automatically when needed

        except Exception as e:
            logger.error(f"❌ Error checking/ingesting knowledge: {e}")
            import traceback
            logger.error(traceback.format_exc())

    def get_rag(self):
        """Get the RAG instance."""
        current_time = time.time()

        # Check if recently initialized (within 30 seconds)
        if (self._initialized and
                self._initialization_time and
                (current_time - self._initialization_time) < 30):
            logger.info(f"⚡ RAG ready ({current_time - self._initialization_time:.1f}s ago)")

        return self.rag

    def reingest_knowledge(self):
        """Force re-ingestion of knowledge base."""
        try:
            logger.info("🔄 Force reingesting knowledge...")
            self.rag.retriever.clear_collection()
            result = self.rag.ingest_knowledge_folder("knowledge")
            logger.info(f"✅ Reingestion complete: {result}")
            return result
        except Exception as e:
            logger.error(f"❌ Error reingesting knowledge: {e}")
            raise

    def get_status(self):
        """Get system status for debugging."""
        try:
            collection_info = self.rag.retriever.get_collection_info()
            return {
                "initialized": self._initialized,
                "initialization_time": self._initialization_time,
                "collection_points": collection_info.get('points_count', 0),
                "crewai_enabled": getattr(self.rag, 'crewai_enabled', False),
                "embedding_model": str(type(self.embedding_model)),
                "llm_model": str(type(self.llm))
            }
        except Exception as e:
            return {"error": str(e)}

# Create singleton instance
shared_rag_instance = SharedRAG()
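Because the module creates `shared_rag_instance` at import time, any process that imports it shares one initialized RAG. A brief sketch of the intended pattern (only `get_rag` and `get_status` are defined above; anything beyond them is an assumption):

    from agents.shared_rag import shared_rag_instance

    rag = shared_rag_instance.get_rag()       # reuses the singleton
    print(shared_rag_instance.get_status())   # e.g. collection_points, llm_model

One design consequence: the first import pays the full initialization cost (LLM client, collection check, possible ingestion), so importing early at server startup keeps request latency predictable.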
agents/web_search_processor_agent/__init__.py
ADDED
@@ -0,0 +1,25 @@
from typing import List, Dict, Any, Optional
from .web_search_processor import WebSearchProcessor

class WebSearchProcessorAgent:
    """
    Agent responsible for processing web search results with structured responses.
    """

    def __init__(self, config=None):
        self.web_search_processor = WebSearchProcessor(config)

    def process_web_search_results(self, query: str, chat_history: Optional[List[Dict[str, str]]] = None,
                                   user_context: Optional[Dict] = None) -> str:
        """
        Process web search results and return structured response.

        Args:
            query: User query
            chat_history: Previous conversation history
            user_context: User context (emotion, mental health status, etc.)

        Returns:
            Structured empathy + solution + recommendations response
        """
        return self.web_search_processor.process_query(query, user_context)
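A short usage sketch of this facade; the `user_context` keys mirror the ones `WebSearchProcessor.process_query` reads later in this diff (the example values are illustrative):

    agent = WebSearchProcessorAgent()
    reply = agent.process_web_search_results(
        "I can't sleep and feel exhausted",
        user_context={"emotion": "tired", "mental_health_status": "Mild", "name": "Dorji"},
    )
    print(reply)  # empathy, solution, and recommendations paragraphs

Note that `chat_history` is accepted for interface compatibility but is not forwarded to the processor.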
agents/web_search_processor_agent/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (1.87 kB).
agents/web_search_processor_agent/__pycache__/geminai_search.cpython-311.pyc
ADDED
Binary file (7.94 kB).
agents/web_search_processor_agent/__pycache__/pubmed_search.cpython-311.pyc
ADDED
Binary file (7.87 kB).
agents/web_search_processor_agent/__pycache__/tavily_search.cpython-311.pyc
ADDED
Binary file (2.48 kB).
agents/web_search_processor_agent/__pycache__/web_search_agent.cpython-311.pyc
ADDED
Binary file (6.62 kB).
agents/web_search_processor_agent/__pycache__/web_search_processor.cpython-311.pyc
ADDED
Binary file (11.7 kB).
agents/web_search_processor_agent/geminai_search.py
ADDED
@@ -0,0 +1,162 @@
import os
from typing import List, Dict, Optional
import google.generativeai as genai

class GeminiSearchAgent:
    """
    Searches for mental health information using Google's Gemini AI.
    """
    def __init__(self):
        """Initialize the Gemini search agent."""
        # Get API key from environment or use the one from your main.py
        api_key = os.environ.get("GOOGLE_API_KEY", "AIzaSyDzBTzKt211XwMurywdk5HFCnFeeFxcRJ0")
        genai.configure(api_key=api_key)

        # Use gemini-1.5-flash which is currently available
        self.model = genai.GenerativeModel('gemini-1.5-flash')

        # List available models (for debugging)
        try:
            models = genai.list_models()
            print("Available models:")
            for model in models:
                if 'generateContent' in model.supported_generation_methods:
                    print(f"  - {model.name}")
        except Exception:
            pass

    def search_mental_health(self, query: str, max_results: int = 5) -> List[Dict[str, str]]:
        """
        Search for mental health information using Gemini AI.

        Args:
            query: Search query string
            max_results: Maximum number of results to return

        Returns:
            List of dictionaries containing search results
        """
        try:
            # Enhance query for mental health context
            enhanced_query = f"""
            As a mental health information assistant, provide reliable information about: {query}

            Focus on:
            1. Evidence-based mental health resources
            2. Professional medical sources and recent research
            3. Practical advice and coping strategies
            4. Treatment options and therapeutic approaches

            Please provide specific, actionable information that would be helpful for someone seeking mental health support.
            Format the response with clear sections if applicable.
            """

            # Generate content
            response = self.model.generate_content(enhanced_query)

            # Extract the response text
            main_response = response.text if response.text else "No response generated"

            # Format as a list of results
            results = [{
                "title": "Mental Health Information",
                "content": main_response,
                "source": "Google Gemini AI",
                "type": "ai_summary"
            }]

            return results

        except Exception as e:
            print(f"Error details: {str(e)}")
            return [{
                "title": "Error",
                "content": f"Error retrieving information: {str(e)}",
                "source": "Error",
                "type": "error"
            }]

    def search_specific_topics(self, query: str, topics: List[str]) -> Dict[str, str]:
        """
        Search for specific mental health topics.

        Args:
            query: Base query
            topics: List of specific topics to include

        Returns:
            Dictionary with topic-specific information
        """
        results = {}

        for topic in topics:
            specific_query = f"""
            Provide brief, evidence-based information about: {query} specifically regarding {topic}.
            Keep the response concise but informative, focusing on practical advice and current best practices.
            """
            try:
                response = self.model.generate_content(specific_query)
                content = response.text if response.text else "No information available"
                results[topic] = content

            except Exception as e:
                results[topic] = f"Error: {str(e)}"

        return results


# For backward compatibility with existing code
class PubmedSearchAgent(GeminiSearchAgent):
    """Alias for GeminiSearchAgent to maintain compatibility."""

    def search_pubmed(self, query: str, max_results: int = 5) -> List[Dict[str, str]]:
        """
        Wrapper method to maintain compatibility with existing code.
        Redirects to Gemini search.
        """
        # Get results from Gemini
        gemini_results = self.search_mental_health(query, max_results)

        # Transform to expected format
        formatted_results = []
        for i, result in enumerate(gemini_results):
            formatted_results.append({
                "title": result.get("title", "Information from Gemini AI"),
                "abstract": result.get("content", "")[:500] + "..." if len(result.get("content", "")) > 500 else result.get("content", ""),
                "url": "",  # No direct URL with Gemini
                "authors": "Google Gemini AI",
                "publication_date": "Current",
                "full_content": result.get("content", ""),
            })

        return formatted_results


# Test function
if __name__ == "__main__":
    # Test Gemini search
    agent = GeminiSearchAgent()

    print("\nTesting Gemini Search Agent...")
    print("-" * 60)

    # Test query
    query = "cognitive behavioral therapy for depression"
    results = agent.search_mental_health(query)

    for result in results:
        print(f"\nTitle: {result['title']}")
        print(f"Source: {result['source']}")
        print(f"Content: {result['content'][:500]}...")

    # Test specific topics
    print("\n" + "=" * 60)
    print("Testing specific topics...")
    topics_results = agent.search_specific_topics(
        "anxiety management",
        ["breathing exercises", "medication options", "lifestyle changes"]
    )

    for topic, content in topics_results.items():
        print(f"\n{topic.upper()}:")
        print(content[:300] + "...")
agents/web_search_processor_agent/web_search_agent.py
ADDED
@@ -0,0 +1,125 @@
from typing import Dict, List
import sys
import os

# Add the parent directory to the path for imports
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))

# Import both Gemini and PubMed agents
try:
    from .geminai_search import GeminiSearchAgent, PubmedSearchAgent
except ImportError:
    from agents.web_search_processor_agent.geminai_search import GeminiSearchAgent, PubmedSearchAgent

class WebSearchAgent:
    """
    Agent responsible for retrieving real-time medical information from web sources.
    Uses Gemini AI for general queries and maintains PubMed compatibility.
    """

    def __init__(self, config=None):
        """Initialize the web search agent."""
        # Initialize both search agents
        self.gemini_agent = GeminiSearchAgent()
        self.pubmed_search_agent = PubmedSearchAgent()  # This is actually a GeminiSearchAgent too
        self.config = config

    def search(self, query: str) -> str:
        """
        Perform searches using Gemini AI.

        Args:
            query: Search query string

        Returns:
            Formatted search results
        """
        print(f"[WebSearchAgent] Searching with Gemini for: {query}")

        try:
            # Use Gemini search for better results
            results = self.gemini_agent.search_mental_health(query)

            # Format the results
            formatted_results = self._format_gemini_results(results)

            return formatted_results

        except Exception as e:
            print(f"[WebSearchAgent] Error during search: {str(e)}")
            # Fallback to basic response
            return self._get_fallback_response(query)

    def _format_gemini_results(self, results: List[Dict[str, str]]) -> str:
        """Format Gemini results for display."""
        if not results:
            return "No relevant information found."

        # Check for error results
        if results[0].get("type") == "error":
            return results[0].get("content", "Error occurred during search")

        formatted_output = []

        for result in results:
            if result.get("type") == "ai_summary":
                # Format AI-generated content
                content = result.get("content", "")
                formatted_output.append(content)
            else:
                # Format other types of results
                title = result.get("title", "Information")
                content = result.get("content", "")
                source = result.get("source", "Unknown")

                formatted_output.append(f"**{title}**\n*Source: {source}*\n\n{content}")

        return "\n\n".join(formatted_output)

    def _get_fallback_response(self, query: str) -> str:
        """Provide a fallback response when search fails."""
        return f"""I apologize, but I couldn't retrieve specific information about "{query}" at this moment.

Here are some general suggestions:
1. Consider consulting with a mental health professional
2. Visit reputable mental health websites like NIMH or WHO
3. Speak with your healthcare provider for personalized advice

Would you like to rephrase your question or ask about something else?"""

    def search_mental_health_specific(self, query: str) -> str:
        """
        Search specifically for mental health related content.

        Args:
            query: Search query string

        Returns:
            Formatted search results with mental health focus
        """
        # The GeminiSearchAgent already adds mental health context
        return self.search(query)

    def search_with_topics(self, query: str, topics: List[str]) -> str:
        """
        Search for specific topics using Gemini.

        Args:
            query: Base query
            topics: List of specific topics

        Returns:
            Formatted results by topic
        """
        try:
            results = self.gemini_agent.search_specific_topics(query, topics)

            formatted = [f"**Information about {query}:**\n"]
            for topic, content in results.items():
                formatted.append(f"\n### {topic.title()}\n{content}")

            return "\n".join(formatted)

        except Exception as e:
            print(f"[WebSearchAgent] Error in topic search: {str(e)}")
            return self._get_fallback_response(query)
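All three public entry points funnel through the Gemini agent; a brief sketch of calling them (live output depends on the Gemini response, so this is illustrative only):

    agent = WebSearchAgent()
    print(agent.search("coping strategies for exam stress")[:300])
    print(agent.search_with_topics("panic attacks", ["grounding", "breathing exercises"]))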
agents/web_search_processor_agent/web_search_processor.py
ADDED
@@ -0,0 +1,116 @@
import os
from typing import Dict, List, Optional

try:
    from .web_search_agent import WebSearchAgent
except ImportError:
    from agents.web_search_processor_agent.web_search_agent import WebSearchAgent

class WebSearchProcessor:
    """
    Processes web search results using structured empathy + solution + recommendations format.
    """

    def __init__(self, config=None):
        self.config = config
        try:
            self.web_search_agent = WebSearchAgent()
        except Exception as e:
            print(f"⚠️ Web search agent not available: {e}")
            self.web_search_agent = None

    def process_query(self, query: str, user_context: Optional[Dict] = None) -> str:
        """
        Process query with structured empathy + solution + recommendations format.

        Args:
            query: User query
            user_context: User context including emotion, mental_health_status, name

        Returns:
            Structured response string
        """
        try:
            query_lower = query.lower()
            user_context = user_context or {}

            emotion = user_context.get("emotion", "neutral")
            mental_health_status = user_context.get("mental_health_status", "Unknown")
            user_name = user_context.get("name", "there")

            # Generate structured response based on query content
            if any(word in query_lower for word in ["depressed", "depression", "sad"]):
                empathy = f"I can hear that you're going through a really difficult time with sadness and depression, {user_name}. Those feelings can be overwhelming and exhausting, and I want you to know that what you're experiencing is completely valid."
                solution = "Depression often involves changes in brain chemistry that affect mood, energy, and motivation. Professional treatment through therapy and/or medication has proven very effective for most people experiencing depression."
                recommendations = "I strongly recommend reaching out to a mental health professional who can provide proper assessment and treatment. In the meantime, try to maintain basic self-care routines, connect with supportive people in your life, and consider contacting the National Mental Health Program at 1717 if you need immediate support."

            elif any(word in query_lower for word in ["anxious", "anxiety", "worried", "panic"]):
                empathy = f"I understand that anxiety can feel incredibly overwhelming and scary, {user_name}. Those racing thoughts and physical sensations are very real and can be exhausting to deal with."
                solution = "Anxiety is one of the most treatable mental health conditions. Techniques like deep breathing, grounding exercises, and cognitive behavioral therapy have strong research support for managing anxiety symptoms."
                recommendations = "Try practicing 4-7-8 breathing (inhale 4, hold 7, exhale 8), limit caffeine intake, and consider speaking with a counselor who specializes in anxiety treatment. Regular exercise and mindfulness can also significantly help."

            elif any(word in query_lower for word in ["stress", "overwhelm", "pressure"]):
                empathy = f"It sounds like you're dealing with a lot of stress right now, {user_name}. That pressure can really take a toll on your mental and physical well-being."
                solution = "Stress management involves both addressing the source of stress and building your capacity to handle pressure. Identifying specific stressors and developing coping strategies can make a significant difference."
                recommendations = "Try breaking down overwhelming tasks into smaller steps, practice saying no to additional commitments, and schedule regular breaks. Consider stress-reduction techniques like meditation, exercise, or talking to a therapist."

            elif any(word in query_lower for word in ["lonely", "alone", "isolated"]):
                empathy = f"I hear that you're feeling lonely, {user_name}. Loneliness can be very difficult to experience, and you're showing strength by reaching out."
                solution = "Loneliness is a common human experience that can be addressed through building meaningful connections and developing a support network."
                recommendations = "Consider joining community groups, volunteering, or participating in activities you enjoy. Online support groups can also provide connection. If loneliness persists, talking to a counselor can help develop strategies for building relationships."

            elif any(word in query_lower for word in ["sleep", "tired", "exhausted", "insomnia"]):
                empathy = f"It sounds like you're having difficulties with sleep or feeling tired, {user_name}. Sleep issues can significantly impact mental health and daily functioning."
                solution = "Sleep problems often have both physical and mental health components. Good sleep hygiene and addressing underlying stress or anxiety can improve sleep quality."
                recommendations = "Try maintaining a consistent sleep schedule, limiting screen time before bed, and creating a relaxing bedtime routine. Avoid caffeine late in the day. If sleep problems persist, consider consulting a healthcare provider."

            else:
                # Generic structured response
                empathy = f"Thank you for reaching out and sharing what you're going through, {user_name}. I can sense that you're dealing with some challenges right now, and I want you to know that your experience matters."
                solution = "There are evidence-based strategies and resources available that can help you manage these feelings and improve your mental health over time."
                recommendations = "Consider speaking with a mental health professional for personalized support, practice daily self-care activities, and don't hesitate to reach out to trusted friends or family members for additional support."

            # Combine the structured response
            structured_response = f"{empathy}\n\n{solution}\n\n{recommendations}"

            # Add crisis resources for severe cases
            if (mental_health_status in ["Severe", "Crisis"] or
                    any(word in query_lower for word in ["suicide", "kill myself", "want to die", "hurt myself"])):
                crisis_addition = "\n\n**🆘 Immediate Support Available:** If you're having thoughts of self-harm, please contact the National Mental Health Program at 1717 (24/7) or Emergency Services at 112 immediately."
                structured_response += crisis_addition

            return structured_response

        except Exception as e:
            print(f"[WebSearchProcessor] Error: {e}")
            return f"I'm here to support you, {(user_context or {}).get('name', 'there')}, though I'm having some technical difficulties. Please know that whatever you're going through, there are people and resources available to help. For immediate support in Bhutan, contact the National Mental Health Program at 1717."

    def process_web_results(self, query: str, chat_history: Optional[List[Dict[str, str]]] = None) -> str:
        """
        Legacy method for compatibility.
        """
        # Build context from chat history
        user_context = {}
        if chat_history and len(chat_history) > 0:
            # Extract user context from recent messages
            recent_messages = chat_history[-3:] if len(chat_history) > 3 else chat_history
            for msg in recent_messages:
                if msg.get("role") == "user":
                    content = msg.get('content', '').lower()
                    # Simple emotion detection
                    if any(word in content for word in ['sad', 'depressed', 'down']):
                        user_context['emotion'] = 'sad'
                    elif any(word in content for word in ['anxious', 'worried', 'panic']):
                        user_context['emotion'] = 'anxious'
                    elif any(word in content for word in ['angry', 'frustrated', 'mad']):
                        user_context['emotion'] = 'angry'

        return self.process_query(query, user_context)

# Convenience function for direct use
def search_mental_health_info(query: str, user_context: Optional[Dict] = None) -> str:
    """
    Direct function to search for mental health information.
    """
    processor = WebSearchProcessor()
    return processor.process_query(query, user_context)
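The module-level convenience function is the simplest entry point, at the cost of constructing a fresh WebSearchProcessor (and its WebSearchAgent) on every call; a long-running service would hold one processor instead. Illustrative call:

    text = search_mental_health_info(
        "I feel overwhelmed by pressure at work",
        user_context={"name": "Karma", "mental_health_status": "Moderate"},
    )
    print(text)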
chat_sessions/chat_Harsh Narayan_20250802_165208.json
ADDED
@@ -0,0 +1,24 @@
{
  "session_id": "20250802_165208",
  "user_name": "Harsh Narayan",
  "messages": [
    {
      "content": "Now I'm upset.",
      "role": "user",
      "timestamp": "2025-08-02T16:52:08.426564",
      "topics": []
    },
    {
      "agent": "RAG_AGENT",
      "content": "I understand you're feeling upset right now. It's completely valid to experience a range of emotions, and I'm here to listen without judgment. It takes courage to acknowledge when you're feeling this way.\n\nResearch shows that sometimes upsetting feelings can arise from internal thoughts and judgments, as one source mentions getting angry and critical with oneself. It can help to practice simply listening to the voice of your thoughts without immediately judging them or trying to shut them down. This can create some space between you and the upsetting feelings.\n\nI recommend considering what might have triggered this feeling and if you can identify any specific thoughts or situations connected to it. If these feelings persist or become overwhelming, I suggest reaching out to a mental health professional for further support. Remember, taking care of your mental well-being is essential.\n",
      "role": "assistant",
      "timestamp": "2025-08-02T16:52:12.085676",
      "topics": [
        "depression"
      ]
    }
  ],
  "metadata": {
    "topics": []
  }
}
chat_sessions/chat_Harsh Narayan_20250804_163618.json
ADDED
@@ -0,0 +1,27 @@
{
  "session_id": "20250804_163618",
  "user_name": "Harsh Narayan",
  "messages": [
    {
      "content": "I am failing sad, what should I do?",
      "role": "user",
      "timestamp": "2025-08-04T16:36:18.921585",
      "topics": [
        "depression"
      ]
    },
    {
      "agent": "RAG_AGENT",
      "content": "I understand you're feeling sad about failing, and that's a really tough feeling to experience. It's completely valid to feel down when things don't go as planned. I'm here to offer support.\n\nRemembering the opposite of your negative feelings can help; if things feel unpleasant, try to focus on peace and relaxation, letting go of the need to do anything. Research shows that being mindful of when you get carried away by negative thoughts and stopping yourself can also be beneficial. Living each day with the intention of bringing joy and kindness can also help shift your perspective.\n\nI recommend considering practicing these techniques when you feel overwhelmed. Since I don't know your full situation, I suggest reaching out to a mental health professional for personalized support and guidance. They can help you explore these feelings further and develop coping strategies.\n",
      "role": "assistant",
      "timestamp": "2025-08-04T16:36:21.540544",
      "topics": [
        "depression",
        "coping"
      ]
    }
  ],
  "metadata": {
    "topics": []
  }
}
chat_sessions/chat_string_string.json
ADDED
@@ -0,0 +1,8 @@
{
  "session_id": "string",
  "user_name": "string",
  "messages": [
    {}
  ],
  "metadata": {}
}
config/__init__.py
ADDED
File without changes
config/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (186 Bytes).
config/__pycache__/config.cpython-311.pyc
ADDED
Binary file (8.95 kB).
config/agents.yaml
ADDED
@@ -0,0 +1,42 @@
emotion_detector:
  role: Emotion Detector
  goal: Analyze user input to determine their emotional state.
  backstory: You are an empathetic AI skilled at identifying emotions like sadness, anxiety, or happiness from text.
  memory: true
  verbose: false

suggestion_agent:
  role: Suggestion Provider
  goal: Offer 1-2 brief and actionable suggestions to support the user's mental well-being.
  backstory: You are a compassionate advisor who shares short mental health tips like breathing exercises or journaling, but also offers to gently guide the user through them if they seem open to it.
  memory: true
  verbose: false

safety_agent:
  role: Safety Monitor
  goal: Check user input for signs of crisis or self-harm and respond appropriately.
  backstory: You are a vigilant guardian trained to detect red flags like mentions of self-harm, prioritizing user safety.
  memory: true
  verbose: false

orchestrator:
  role: Workflow Orchestrator
  goal: Coordinate agent tasks to ensure a cohesive and empathetic response, prioritizing safety if needed.
  backstory: You are a strategic coordinator ensuring the chatbot responds with clarity and empathy, tailoring response length naturally.
  memory: true
  verbose: false

rag_retriever:
  role: Knowledge Retriever
  goal: Retrieve relevant mental health tips from the text corpus based on the user's emotional state.
  backstory: You are a librarian who finds concise, relevant resources using a custom RAG pipeline.
  memory: true
  verbose: false

rag_reader:
  role: Knowledge Summarizer
  goal: Summarize retrieved tips clearly and briefly for the user's emotional state.
  backstory: You are an editor who distills helpful advice into brief, clear summaries.
  memory: true
  verbose: false
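These agent definitions are plain YAML keyed by agent name, so any component can load them with `yaml.safe_load`; a minimal sketch (the relative path assumes the repo root as the working directory):

    import yaml

    with open("config/agents.yaml") as f:
        agents = yaml.safe_load(f)

    print(agents["safety_agent"]["goal"])
    # Check user input for signs of crisis or self-harm and respond appropriately.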
config/config.py
ADDED
@@ -0,0 +1,164 @@
"""
Configuration loader for the Mental Health Chatbot
"""

import os
import yaml
from dataclasses import dataclass
from typing import Any, Dict, Optional
from pathlib import Path

from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from sentence_transformers import SentenceTransformer


@dataclass
class RAGConfig:
    """Configuration for RAG agent"""
    def __init__(self, config_dict: Dict[str, Any]):
        self.config_dict = config_dict
        self.llm = ChatGoogleGenerativeAI(
            model="gemini-1.5-flash",
            temperature=0.3,
            google_api_key=os.environ.get("GOOGLE_API_KEY")
        )

        # Load from YAML if available
        if 'rag' in config_dict:
            rag_config = config_dict['rag']
            self.embedding_dim = rag_config.get('embedding_dim', 384)

            # Lazy load SentenceTransformer for memory optimization
            self.embedding_model = None
            self.embedding_model_name = "all-MiniLM-L6-v2"

            self.collection_name = rag_config.get('collection_name', 'mental_health_docs')
            self.chunk_size = rag_config.get('chunk_size', 256)
            self.chunk_overlap = rag_config.get('chunk_overlap', 32)
            self.reranker_model = rag_config.get('reranker_model', 'cross-encoder/ms-marco-MiniLM-L-6-v2')
            self.reranker_top_k = rag_config.get('reranker_top_k', 5)
            self.max_context_length = rag_config.get('max_context_length', 2048)
            self.include_sources = rag_config.get('include_sources', True)
            self.use_local = rag_config.get('use_local', True)
            self.url = rag_config.get('url', 'http://localhost:6333')
            self.distance_metric = rag_config.get('distance_metric', 'Cosine')  # Changed to 'Cosine'
            self.min_retrieval_confidence = rag_config.get('min_retrieval_confidence', 0.85)

            # Add missing attributes
            self.processed_docs_dir = rag_config.get('processed_docs_dir', 'processed_docs')
            self.knowledge_dir = rag_config.get('knowledge_dir', 'knowledge')
        else:
            # Default values if no YAML config
            self.embedding_dim = 384
            self.embedding_model = None
            self.embedding_model_name = "all-MiniLM-L6-v2"
            self.collection_name = 'mental_health_docs'
            self.chunk_size = 256
            self.chunk_overlap = 32
            self.reranker_model = 'cross-encoder/ms-marco-MiniLM-L-6-v2'
            self.reranker_top_k = 5
            self.max_context_length = 2048
            self.include_sources = True
            self.use_local = True
            self.url = 'http://localhost:6333'
            self.distance_metric = 'Cosine'  # Changed to 'Cosine'
            self.min_retrieval_confidence = 0.85
            self.processed_docs_dir = 'processed_docs'
            self.knowledge_dir = 'knowledge'

        self.context_limit = 4

    def get_embedding_model(self):
        """Lazy load the embedding model only when needed"""
        if self.embedding_model is None:
            try:
                from sentence_transformers import SentenceTransformer
                self.embedding_model = SentenceTransformer(self.embedding_model_name)
                print(f"✅ Embedding model loaded: {self.embedding_model_name}")
            except Exception as e:
                print(f"⚠️ Failed to load embedding model: {e}")
                return None
        return self.embedding_model


@dataclass
class ConversationConfig:
    """Configuration for Conversation agent"""
    def __init__(self):
        self.llm = ChatGoogleGenerativeAI(
            model="gemini-1.5-flash",
            temperature=0.7,
            google_api_key=os.environ.get("GOOGLE_API_KEY")
        )


@dataclass
class WebSearchConfig:
    """Configuration for Web Search agent"""
    def __init__(self):
        self.context_limit = 4
        self.llm = ChatGoogleGenerativeAI(
            model="gemini-1.5-flash",
            temperature=0.5,
            google_api_key=os.environ.get("GOOGLE_API_KEY")
        )
        # Add Tavily API key configuration
        self.tavily_api_key = os.environ.get("TAVILY_API_KEY", "tvly-your-api-key-here")


@dataclass
class AgentDecisionConfig:
    """Configuration for Agent Decision system"""
    def __init__(self):
        self.llm = ChatGoogleGenerativeAI(
            model="gemini-2.0-flash",
            temperature=0,
            google_api_key=os.environ.get("GOOGLE_API_KEY")
        )


class Config:
    """Main configuration class that loads from YAML files"""

    def __init__(self):
        # Set API keys
        os.environ["GOOGLE_API_KEY"] = "AIzaSyDzBTzKt211XwMurywdk5HFCnFeeFxcRJ0"
        os.environ["TAVILY_API_KEY"] = "tvly-your-api-key-here"  # You need to replace this

        # Load YAML configurations
        self.config_dict = self._load_yaml_configs()

        # Initialize configurations
        self.rag = RAGConfig(self.config_dict)
        self.conversation = ConversationConfig()
        self.web_search = WebSearchConfig()
        self.agent_decision = AgentDecisionConfig()
|
| 137 |
+
|
| 138 |
+
# General settings
|
| 139 |
+
self.max_conversation_history = 20
|
| 140 |
+
|
| 141 |
+
def _load_yaml_configs(self) -> Dict[str, Any]:
|
| 142 |
+
"""Load all YAML configuration files"""
|
| 143 |
+
config_dict = {}
|
| 144 |
+
config_dir = Path(__file__).parent
|
| 145 |
+
|
| 146 |
+
# Load each YAML file
|
| 147 |
+
yaml_files = ['agents.yaml', 'rag.yaml', 'tasks.yaml']
|
| 148 |
+
for yaml_file in yaml_files:
|
| 149 |
+
file_path = config_dir / yaml_file
|
| 150 |
+
if file_path.exists():
|
| 151 |
+
with open(file_path, 'r') as f:
|
| 152 |
+
data = yaml.safe_load(f)
|
| 153 |
+
if data:
|
| 154 |
+
config_dict.update(data)
|
| 155 |
+
|
| 156 |
+
return config_dict
|
| 157 |
+
|
| 158 |
+
def get_agent_config(self, agent_name: str) -> Dict[str, Any]:
|
| 159 |
+
"""Get configuration for a specific agent"""
|
| 160 |
+
return self.config_dict.get(agent_name, {})
|
| 161 |
+
|
| 162 |
+
def get_task_config(self, task_name: str) -> Dict[str, Any]:
|
| 163 |
+
"""Get configuration for a specific task"""
|
| 164 |
+
return self.config_dict.get(task_name, {})
|
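A minimal usage sketch of this loader, assuming the YAML files in this upload sit next to config.py (the printed values follow from rag.yaml below):

```python
# Minimal usage sketch for config/config.py (paths as in this repo).
from config.config import Config

cfg = Config()                          # loads agents.yaml, rag.yaml, tasks.yaml
print(cfg.rag.collection_name)          # 'mental_health_docs'
print(cfg.rag.reranker_top_k)           # 3: rag.yaml overrides the code default of 5
model = cfg.rag.get_embedding_model()   # SentenceTransformer, loaded lazily on first call
```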
config/rag.yaml
ADDED
@@ -0,0 +1,26 @@
+rag:
+  collection_name: "mental_health_docs"
+  embedding_model: "all-MiniLM-L6-v2"
+  embedding_dim: 384
+  chunk_size: 256
+  chunk_overlap: 32
+  processed_docs_dir: "processed_docs"
+  knowledge_dir: "knowledge"
+
+  # Qdrant settings
+  use_local: true
+  url: "http://localhost:6333"
+  distance_metric: "Cosine"
+
+  # Optimized reranker settings for speed
+  reranker_model: "cross-encoder/ms-marco-MiniLM-L-6-v2"
+  reranker_top_k: 3  # Reduced from 5 to 3 for faster processing
+
+  # Response generation
+  max_context_length: 1024  # Reduced from 2048 for faster processing
+  response_format_instructions: ""
+  include_sources: true
+  min_retrieval_confidence: 0.5  # Increased from 0.4 for better quality/speed trade-off
+  top_k: 3  # Reduced from 5 for faster retrieval
+  max_context_docs: 3  # Reduced for faster processing
+
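The `use_local` and `url` keys point at a Qdrant backend. A minimal sketch of how they might be consumed with qdrant-client; the on-disk path "local_qdrant" is an assumption, not a value from this repo:

```python
# Sketch: consuming the Qdrant settings above (qdrant-client API).
from qdrant_client import QdrantClient

cfg = {"use_local": True, "url": "http://localhost:6333"}
client = (QdrantClient(path="local_qdrant")  # embedded, on-disk mode
          if cfg["use_local"]
          else QdrantClient(url=cfg["url"]))  # remote Qdrant server
print(client.get_collections())
```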
config/tasks.yaml
ADDED
@@ -0,0 +1,79 @@
+emotion_detection_task:
+  description: >
+    Analyze the user's input: "{user_input}"
+    Identify their primary emotion (e.g., sadness, anxiety, neutral) and provide a short emotional summary.
+  expected_output: >
+    A short emotion label, e.g., "User feels anxious."
+  output_key: emotion_summary
+  agent: emotion_detector
+
+safety_check_task:
+  description: >
+    Review the user's message and conversation context.
+    Determine whether it contains signs of emotional distress or crisis using your mental health knowledge base.
+    Respond in JSON format:
+    {
+      "distress": true/false,
+      "trigger": "quoted text from user input if applicable"
+    }
+  expected_output: >
+    Structured JSON with distress flag and trigger phrase if any.
+  output_key: safety_result
+  agent: safety_agent
+  context:
+    - emotion_detection_task
+
+rag_retrieval_task:
+  description: >
+    Based on the user's emotional state, retrieve concise mental health tips from the corpus.
+  expected_output: >
+    1–2 short tips, e.g., "Take a walk in nature. Try deep breathing exercises."
+  output_key: retrieved_tips
+  agent: rag_retriever
+  context:
+    - emotion_detection_task
+
+rag_reading_task:
+  description: >
+    Summarize the tips retrieved in a helpful and brief tone.
+  expected_output: >
+    A 1-2 line practical summary of the tips.
+  output_key: summarized_tips
+  agent: rag_reader
+  context:
+    - emotion_detection_task
+    - rag_retrieval_task
+
+suggestion_task:
+  description: >
+    Give 1-2 practical, empathetic suggestions based on emotional state and retrieved tips.
+    If appropriate, gently offer further help such as:
+    "Would you like me to walk you through a breathing exercise?" or
+    "Shall I guide you with some mindfulness steps?"
+  expected_output: >
+    1-2 short actionable suggestions (1-2 lines), optionally followed by a gentle offer for guided support.
+    E.g., "Try a short breathing exercise. Take a break from screens. Would you like me to walk you through one?"
+  output_key: suggestions
+  agent: suggestion_agent
+  context:
+    - emotion_detection_task
+    - safety_check_task
+    - rag_retrieval_task
+    - rag_reading_task
+
+orchestrate_task:
+  description: >
+    Respond to: "{user_input}" with an adaptive tone and length.
+    - For greetings or neutral input: be short and friendly.
+    - For moderate emotions (e.g., anxiety, stress): respond with 5-6 concise lines.
+    - For crisis-level input: give a longer, supportive message with clear help suggestions.
+  expected_output: >
+    A concise, human-like response adapted to the emotional tone and urgency.
+  output_key: final_response
+  agent: orchestrator
+  context:
+    - emotion_detection_task
+    - safety_check_task
+    - rag_retrieval_task
+    - rag_reading_task
+    - suggestion_task
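A minimal sketch of turning these entries into CrewAI `Task` objects. The `agents` mapping is assumed to exist (agent name to `Agent` object); note that `output_key` is this project's own convention, not a CrewAI field, so it is skipped here.

```python
# Minimal sketch: building crewai Task objects from tasks.yaml.
import yaml
from crewai import Task

def build_tasks(path, agents):
    with open(path) as f:
        specs = yaml.safe_load(f)
    built = {}
    for name, spec in specs.items():  # YAML order matches dependency order here
        built[name] = Task(
            description=spec["description"],
            expected_output=spec["expected_output"],
            agent=agents[spec["agent"]],
            context=[built[c] for c in spec.get("context", [])],
        )
    return built
```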
crew_ai/README.md
ADDED
@@ -0,0 +1,164 @@
+# DrukCare
+A skeleton for CrewAI agents for providing mental health assistance.
+## Tagline: Empowering Mental Well-being with Intelligent and Culturally Sensitive Support.
+
+# 1. About
+DrukCare AI is an intelligent chatbot application designed to provide empathetic and personalized mental health assistance, specifically tailored for the context of Bhutan. Leveraging the CrewAI framework, this system orchestrates a team of specialized AI agents to guide users through various stages of support, from crisis detection and profile collection to dynamic mental health assessments and personalized recommendations.
+
+The project aims to offer accessible, initial mental health guidance, respecting user privacy and cultural nuances, while adhering to ethical guidelines.
+
+# 2. Features
+
+1. Crisis Detection: Identifies suicidal ideation or mental health emergencies.
+2. Condition Classification: Detects depression, anxiety, substance use, or general mental health concerns.
+3. Interactive Assessments: Conducts PHQ-9, GAD-7, and DAST-10 questionnaires, plus additional questionnaires for alcohol use and bipolar disorder.
+4. Personalized Recommendations: Offers suggestions including helplines, therapy options, mindfulness, and Buddhist meditation practices.
+5. RAG-Driven Retrieval: Retrieves relevant documents using a vector store (PostgreSQL pgvector) hosted on Supabase.
+6. User Profile Integration: Adapts recommendations using user history and preferences.
+
+# 3. Workflow
+
+DrukCare AI operates as a sequential CrewAI process, ensuring a structured and coherent user interaction flow (a minimal wiring sketch follows the diagram below):
+
+1. User inputs a query.
+2. Crisis detection checks for emergencies.
+3. User profile is retrieved or mocked.
+4. Condition classifier suggests a condition and assessment.
+5. User confirms and takes the questionnaire.
+6. Score is calculated and interpreted.
+7. Final recommendation is generated (optionally enhanced via RAG).
+
+
+
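A minimal sketch of that sequential wiring with the CrewAI API; the agent and task below are illustrative stand-ins, not the repo's actual objects:

```python
# Minimal sketch of the sequential flow (illustrative agent/task definitions).
from crewai import Agent, Crew, Process, Task

detector = Agent(
    role="Crisis Detection Specialist",
    goal="Flag crisis signals in the user's query.",
    backstory="You triage incoming messages for safety.",
)
detect = Task(
    description="Classify '{user_query}' as crisis or no-crisis.",
    expected_output="crisis or no-crisis",
    agent=detector,
)

crew = Crew(agents=[detector], tasks=[detect], process=Process.sequential)
result = crew.kickoff(inputs={"user_query": "I have been feeling anxious lately."})
```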
+## Crisis Detection:
+
+Input: User's initial query.
+
+Action: The Crisis Detection Specialist agent analyzes the input for emergency signs, using a trained Hugging Face model as a tool to classify the query as crisis or no-crisis.
+
+Output: Crisis or No-crisis
+
+## User Profile Collection:
+
+Input: User's query and status from Crisis Detection.
+
+Action: Queries the user profile from the PostgreSQL database.
+
+Output: A structured user profile in JSON format.
+
+## Knowledge Retrieval & Query Refinement (RAG):
+
+Input: User's initial query and collected user profile.
+
+Action: The Knowledge Base Manager & Query Refiner agent interprets the user's intent, formulating specific keywords for the vector database. It then retrieves relevant mental health information and identifies a potential condition (e.g., 'depression', 'anxiety'). A retrieval sketch is shown below.
+
+Output: Relevant mental health recommendations and the identified condition.
+
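A minimal sketch of that retrieval step, assuming a pgvector table named `documents` with `content` and `embedding` columns; this repo's actual schema may differ:

```python
# Sketch: embed the refined query, then run a pgvector similarity search.
# Table/column names ("documents", "content", "embedding") are assumptions.
import os
import psycopg2
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
vec = model.encode("coping strategies for anxiety")
literal = "[" + ",".join(str(x) for x in vec) + "]"  # pgvector text format

conn = psycopg2.connect(os.environ["SUPABASE_DB_URI"])
with conn.cursor() as cur:
    cur.execute(
        "SELECT content FROM documents ORDER BY embedding <=> %s::vector LIMIT 3",
        (literal,),
    )
    docs = [row[0] for row in cur.fetchall()]
```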
+## Conditional Assessment:
+
+Input: Identified condition from the mental condition classifier crew.
+
+Action: The Mental Health Assessment Specialist agent determines if an assessment is relevant (e.g., PHQ-9 for depression, GAD-7 for anxiety). If relevant, it seeks explicit user consent. If consent is given, it administers the questionnaire step by step.
+
+Output: Assessment status (completed, skipped, denied) and results (score, interpretation) if completed.
+
+## Personalized Recommendation:
+
+Input: Original user query, collected user profile, RAG results, and assessment results.
+
+Action: The Personalized Recommendation Engine synthesizes all gathered information to generate highly personalized, empathetic, and actionable mental health recommendations, culturally adapted for Bhutan.
+
+Output: The final comprehensive recommendation to the user.
+
+# 4. Architecture/Components
+The application is built using the CrewAI framework, comprising Agents, Tasks, and Tools.
+
+## 4.1. Agents
+
+1. **CrisisDetectionAgent:** Detects urgent crisis signals in user queries.
+2. **Mental Condition Classifier Agent:** Identifies the likely mental condition.
+3. **Data Retriever Agent:** Fetches user demographic and mental health background.
+4. **RecommendationAgent:** Synthesizes all information and provides actionable recommendations.
+5. **RAG Agent:** Retrieves external documents to augment the response context (RAG).
+
+## 4.2. Tasks
+
+1. *Crisis detection task:* The crisis condition is detected using the 'crisis classifier' tool. This task is performed by the Crisis Detection Agent; its input is the 'user_query'.
+2. *Mental condition classifier task:* Classifies the mental health condition from the 'user_query' (e.g., anxiety, depression, substance abuse) and recommends a questionnaire (for example, if the detected condition is 'anxiety', the 'GAD-7' questionnaire is recommended). The questionnaire is optional for the user. This task is performed by the Mental Condition Classifier Agent.
+3. *Data retriever task:* Retrieves the user profile from the database given the 'user_profile_id'. This task is performed by the Data Retriever Agent, which has access to a tool for this purpose.
+4. *RAG task:* First extracts keywords from the user query, refines the query if needed, and fetches documents from the vector DB using semantic search.
+5. *Recommendation task:* Provides a comprehensive recommendation based on the user profile, detected condition, assessment results, and the documents retrieved from the vector database. It also has a tool to fetch helplines for crisis situations and for severe mental health conditions. The agent performing this task is the Recommendation Agent.
+
+## 4.3. Crews
+
+1. **Crisis_management_crew** (crisis detection agent): Runs crisis detection logic.
+2. **Mental_condition_crew** (mental condition classifier agent): Classifies the user's mental health condition.
+3. **Data_retrieval_crew** (data retriever agent): Retrieves and formats user profile data.
+4. **Recommendation_crew** (rag agent + recommendation agent): Generates final recommendations using inputs from other agents and RAG.
+
+
+## 4.4 Tools
+
+1. *Bhutanese Helplines:* Provides a predefined list of mental health helplines relevant to Bhutan.
+
+2. *Vector Database Operations:* Retrieval from the vector database for mental health recommendations. Uses the 'sentence-transformers/all-MiniLM-L6-v2' model for embedding queries.
+
+3. *Data Retrieval:* Fetches user profiles from the PostgreSQL database, which currently contains some dummy profiles. The features include: age, gender, city_region, profession, marital status, previous mental diagnosis, ethnicity.
+
+4. *Crisis Classifier:* Uses the 'sentinet/suicidality' model to detect the crisis condition (see the sketch after this list). Note: the 'lmsdmn/crisis-detection-model' has been fine-tuned on a task-specific dataset; using it would require defining a new tool.
+
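A minimal sketch of such a classifier tool via the Hugging Face `transformers` pipeline; the exact label strings returned by 'sentinet/suicidality' are an assumption based on typical binary classifiers and should be checked against its model card:

```python
# Sketch of the crisis classifier tool; label names are an assumption.
from transformers import pipeline

classifier = pipeline("text-classification", model="sentinet/suicidality")

def detect_crisis(user_query: str) -> str:
    result = classifier(user_query)[0]  # e.g. {'label': 'LABEL_1', 'score': 0.98}
    return "crisis" if result["label"] == "LABEL_1" else "no-crisis"
```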
+# 5. Usage
+
+1. Create a Python virtual environment.
+
+2. Install the dependencies from requirements.txt by running the command below in your terminal:
+
+pip install -r requirements.txt
+
+3. Then, run the command below in your terminal:
+
+python chatbot.py
+
+The console output will show the detailed steps of how agents interact, how tools are used, and how the final recommendations are generated for each simulated user input.
+
+## 5.1. LLM API Key Setup
+Crucially, DrukCare AI relies on a Language Model (LLM) to function.
+
+You need to set up your LLM provider's API key. For example, if you are using OpenAI:
+
+1. Obtain an API key from your chosen LLM provider (e.g., OpenAI API Keys).
+
+2. Set it as an environment variable:
+
+export OPENAI_API_KEY="YOUR_API_KEY_HERE"  # On macOS/Linux
+Or for Windows (in Command Prompt):
+set OPENAI_API_KEY="YOUR_API_KEY_HERE"
+In PowerShell:
+$env:OPENAI_API_KEY="YOUR_API_KEY_HERE"
+
+Alternatively, you can hardcode it in your script (for local testing only; not recommended for production):
+
+os.environ["OPENAI_API_KEY"] = "YOUR_API_KEY_HERE"
+
+3. Make sure your selected LLM matches the model_name you are using.
+
+## 5.2 Setting up connection to Supabase
+
+Set up the SUPABASE_DB_URI in your environment or in a .env file. The SUPABASE_DB_URI can be requested from Alaa or Adelia.
+
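A minimal sketch of reading that variable from a .env file, assuming the python-dotenv package is installed:

```python
# Sketch: loading the Supabase connection string from a .env file.
import os
from dotenv import load_dotenv

load_dotenv()  # reads .env in the working directory
db_uri = os.environ["SUPABASE_DB_URI"]
```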
+## 5.3 LangSmith setup for Tracing and Monitoring
+
+Prerequisites:
+
+1. LangSmith Account and Project: Ensure you have a LangSmith account and a project where your LLM runs are being traced. Setting the LANGSMITH_TRACING=true and LANGSMITH_PROJECT environment variables usually handles this (see the sketch after this list).
+2. API Key: Have your LANGSMITH_API_KEY ready.
+3. LLM Provider API Key: Since your evaluator will be an LLM, you'll need the API key for the LLM provider you plan to use for the evaluation (e.g., OPENAI_API_KEY, ANTHROPIC_API_KEY). You'll need to add these as "Secrets" in LangSmith (Settings -> Secrets -> Add Secret).
+
+
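A minimal sketch of setting these variables in Python before the crew runs; the project name "drukcare-ai" is a placeholder, not a value from this repo:

```python
# Sketch: enabling LangSmith tracing via its standard environment variables.
import os

os.environ["LANGSMITH_TRACING"] = "true"
os.environ["LANGSMITH_PROJECT"] = "drukcare-ai"        # placeholder project name
os.environ["LANGSMITH_API_KEY"] = "your-langsmith-api-key"
```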
+# Disclaimer
+
+This DrukCare AI chatbot is designed for informational and initial supportive purposes only. It is not a substitute for professional medical advice, diagnosis, or treatment. Always seek the advice of a qualified mental health professional for any questions you may have regarding a medical condition. If you are in a crisis situation, please contact the provided helplines immediately.
+
+# License
+
+Will be updated later.
crew_ai/__init__.py
ADDED
File without changes
crew_ai/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (195 Bytes). View file
crew_ai/__pycache__/chatbot.cpython-311.pyc
ADDED
Binary file (18.8 kB). View file
crew_ai/__pycache__/config.cpython-311.pyc
ADDED
Binary file (1.87 kB). View file
crew_ai/__pycache__/data_retriever_util.cpython-311.pyc
ADDED
Binary file (5.44 kB). View file