import logging import os import re import json import numpy as np from dotenv import load_dotenv import httpx from langdetect import detect from deep_translator import GoogleTranslator from tenacity import retry, stop_after_attempt, wait_exponential # Removed unused imports to fix diagnostics # Try to import optional dependencies try: import pyodbc PYODBC_AVAILABLE = True except ImportError: PYODBC_AVAILABLE = False logging.warning("pyodbc not available - Azure SQL Server connection disabled") try: import faiss FAISS_AVAILABLE = True except ImportError: FAISS_AVAILABLE = False logging.warning("FAISS not available - semantic search disabled") try: from sentence_transformers import SentenceTransformer, util TRANSFORMERS_AVAILABLE = True except ImportError: TRANSFORMERS_AVAILABLE = False logging.warning("SentenceTransformers not available - semantic search disabled") # Configure logging logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s") # Load environment variables load_dotenv() OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY") OPENROUTER_API_URL = "https://openrouter.ai/api/v1/chat/completions" # Azure Database Configuration DB_SERVER = os.getenv("DB_SERVER", "dev-hockeypraktijk.database.windows.net") DB_USER = os.getenv("DB_USER", "dev-HockeyPraktijk-user") DB_PASSWORD = os.getenv("DB_PASSWORD", "~\\5a]LCBD)A[ list: """Search Azure database for Exercise, Serie, and Multimedia content""" if not PYODBC_AVAILABLE: logging.info("pyodbc not available - skipping database search") return [] conn = get_azure_connection() if not conn: return [] try: cursor = conn.cursor() results = [] # Search Exercise table cursor.execute(""" SELECT TOP 3 Title, Description, 'exercise' as ContentType, 'Exercise' as SourceTable FROM Exercise WHERE Title LIKE ? OR Description LIKE ? ORDER BY Title """, (f"%{query}%", f"%{query}%")) for row in cursor.fetchall(): results.append({ "title": row.Title or "No title", "content": row.Description or "No description", "type": row.ContentType, "source_table": row.SourceTable, "similarity": 0.8 # Static similarity for keyword match }) # Search Serie table cursor.execute(""" SELECT TOP 3 Title, Description, 'serie' as ContentType, 'Serie' as SourceTable FROM Serie WHERE Title LIKE ? OR Description LIKE ? ORDER BY Title """, (f"%{query}%", f"%{query}%")) for row in cursor.fetchall(): results.append({ "title": row.Title or "No title", "content": row.Description or "No description", "type": row.ContentType, "source_table": row.SourceTable, "similarity": 0.8 }) # Search Multimedia table cursor.execute(""" SELECT TOP 3 Title, URL, 'multimedia' as ContentType, 'Multimedia' as SourceTable FROM Multimedia WHERE Title LIKE ? ORDER BY Title """, (f"%{query}%",)) for row in cursor.fetchall(): multimedia_result = { "title": row.Title or "No title", "type": row.ContentType, "source_table": row.SourceTable, "similarity": 0.8 } # Ensure URL is properly included for multimedia items if row.URL: multimedia_result["url"] = row.URL else: multimedia_result["url"] = "" results.append(multimedia_result) conn.close() logging.info(f"Found {len(results)} results from Azure database") return results[:5] # Limit to top 5 results except Exception as e: logging.error(f"Error searching Azure database: {str(e)}") if conn: conn.close() return [] def preprocess_prompt(prompt: str, user_lang: str) -> tuple[str, str]: """Preprocess prompt and return both translated and original prompt""" if not prompt or not isinstance(prompt, str): return prompt, prompt prompt_lower = prompt.lower().strip() if user_lang == "nl": # Apply hockey-specific translations for dutch_term, english_term in hockey_translation_dict.items(): prompt_lower = re.sub(rf'\b{re.escape(dutch_term)}\b', english_term, prompt_lower) try: translated = GoogleTranslator(source="nl", target="en").translate(prompt_lower) return translated if translated else prompt_lower, prompt except Exception as e: logging.error(f"Translation error: {str(e)}") return prompt_lower, prompt return prompt_lower, prompt def is_in_domain(prompt: str) -> bool: """Check if prompt is hockey-related - improved semantic understanding""" if not prompt or not isinstance(prompt, str): return False # Clean prompt - remove punctuation for analysis but keep original for logging prompt_clean = re.sub(r'[^\w\s]', ' ', prompt.lower().strip()) prompt_clean = re.sub(r'\s+', ' ', prompt_clean).strip() # Remove extra spaces if not prompt_clean: return False # Enhanced keyword detection has_hockey_keywords = any( re.search(rf'\b{re.escape(word)}\b|\b{re.escape(word[:-1])}\w*\b', prompt_clean) for word in hockey_keywords ) # Enhanced semantic similarity with multiple reference points has_hockey_semantic = False if sentence_model is not None and TRANSFORMERS_AVAILABLE: try: prompt_embedding = sentence_model.encode(prompt_clean, convert_to_tensor=True) # Multiple hockey reference embeddings for better coverage hockey_references = [ "field hockey training drills strategies rules techniques tutorials", "hockey penalty shoot shooting skills practice improve", "hockey coaching player development exercises", "hockey stick handling ball control techniques" ] max_similarity = 0.0 for ref in hockey_references: hockey_embedding = sentence_model.encode(ref, convert_to_tensor=True) similarity = util.cos_sim(prompt_embedding, hockey_embedding).item() max_similarity = max(max_similarity, similarity) has_hockey_semantic = max_similarity > 0.4 # Higher threshold to reduce false positives logging.debug(f"Hockey domain check for '{prompt}': keywords={has_hockey_keywords}, " f"semantic={has_hockey_semantic} (score: {max_similarity:.3f})") except Exception as e: logging.warning(f"Semantic similarity check failed: {e}") return has_hockey_keywords or has_hockey_semantic def is_greeting_or_vague(prompt: str, user_lang: str = "en") -> bool: """Check if prompt is a greeting or too vague - improved semantic detection""" if not prompt or not isinstance(prompt, str): return True # Clean prompt - remove punctuation for analysis prompt_clean = re.sub(r'[^\w\s]', '', prompt.lower().strip()) if not prompt_clean: # If only punctuation, treat as vague return True # Check for pure greetings (only greeting words, no content) words = prompt_clean.split() greeting_words = [word for word in words if word in greetings] total_words = len(words) # If greeting words make up most of the prompt (>70%), it's likely just a greeting greeting_ratio = len(greeting_words) / total_words if total_words > 0 else 0 # Check for hockey keywords has_hockey_keywords = any( re.search(rf'\b{re.escape(word)}\b|\b{re.escape(word[:-1])}\w*\b', prompt_clean) for word in hockey_keywords ) # Check for hockey-related semantic content using sentence transformer has_hockey_semantic = False if sentence_model is not None and TRANSFORMERS_AVAILABLE: try: prompt_embedding = sentence_model.encode(prompt_clean, convert_to_tensor=True) hockey_reference = "hockey training drills techniques penalty shoot skills practice" hockey_embedding = sentence_model.encode(hockey_reference, convert_to_tensor=True) similarity = util.cos_sim(prompt_embedding, hockey_embedding).item() has_hockey_semantic = similarity > 0.4 # Higher threshold for semantic detection logging.debug(f"Semantic hockey similarity for '{prompt}': {similarity:.3f}") except Exception as e: logging.debug(f"Semantic similarity check failed: {e}") # Special case: single greeting words should be treated as greetings if total_words == 1 and greeting_words: logging.debug(f"Single greeting word detected: '{prompt_clean}'") return True # It's a greeting ONLY if: # 1. High ratio of greeting words (>50% for multi-word, or single greeting word) AND # 2. No hockey keywords AND # 3. No semantic hockey content is_pure_greeting = ( (greeting_ratio >= 0.5 or (total_words <= 2 and greeting_words)) and not has_hockey_keywords and not has_hockey_semantic ) logging.debug(f"Greeting analysis for '{prompt}' (lang: {user_lang}): greeting_ratio={greeting_ratio:.2f}, " f"has_hockey_keywords={has_hockey_keywords}, has_hockey_semantic={has_hockey_semantic}, " f"is_pure_greeting={is_pure_greeting}") return is_pure_greeting def search_hockey_content(english_query: str, dutch_query: str = "") -> list: """Search hockey database content using both semantic similarity and Azure database""" if not is_in_domain(english_query): logging.info("Query is out of domain, skipping database search.") return [] results = [] # First try Azure database search with keywords (try both languages if available) azure_results = search_azure_database_content(english_query) results.extend(azure_results) # Also search with Dutch query if provided and different from English if dutch_query and dutch_query.lower() != english_query.lower(): dutch_results = search_azure_database_content(dutch_query) results.extend(dutch_results) # If Azure search didn't return enough results and we have embeddings, use FAISS search as fallback if len(results) < 3 and (sentence_model is not None and faiss_index is not None and embeddings_np is not None and len(metadata) > 0 and FAISS_AVAILABLE): try: # Encode query english_embedding = sentence_model.encode(english_query, convert_to_tensor=False) english_embedding = np.array(english_embedding).astype("float32").reshape(1, -1) faiss.normalize_L2(english_embedding) # Search FAISS index distances, indices = faiss_index.search(english_embedding, 5) # Top 5 results if len(indices) > 0 and len(distances) > 0: for idx, sim in zip(indices[0], distances[0]): if int(idx) < len(metadata) and float(sim) > 0.3: # Similarity threshold item = metadata[int(idx)] result = { "title": item["title"], "type": item.get("type", "unknown"), "source_table": item.get("source_table", "unknown"), "similarity": float(sim) } # Add URL for multimedia items if item.get("type") == "multimedia" and "url" in item: result["url"] = item["url"] else: result["content"] = item.get("content", "") results.append(result) logging.info(f"Added {len(results) - len(azure_results)} FAISS results") except Exception as e: logging.error(f"FAISS search error: {e}") # Remove duplicates and limit results seen_titles = set() unique_results = [] for result in results: if result["title"] not in seen_titles: seen_titles.add(result["title"]) unique_results.append(result) logging.info(f"Found {len(unique_results)} total relevant content items") return unique_results[:5] # Limit to top 5 def get_conversation_history(user_role: str, user_team: str) -> str: """Get conversation history for user session""" session_key = f"{user_role}|{user_team}" history = conversation_histories.get(session_key, []) formatted_history = "\n".join([f"User: {q}\nCoach: {a}" for q, a in history[-3:]]) return formatted_history def update_conversation_history(user_role: str, user_team: str, question: str, answer: str): """Update conversation history for user session""" session_key = f"{user_role}|{user_team}" history = conversation_histories.get(session_key, []) history.append((question, answer)) conversation_histories[session_key] = history[-3:] def translate_text(text: str, source_lang: str, target_lang: str) -> str: """Translate text between languages""" if not text or not isinstance(text, str) or source_lang == target_lang: return text try: translated = GoogleTranslator(source=source_lang, target=target_lang).translate(text) return translated except Exception as e: logging.error(f"Translation error: {str(e)}") return text @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10)) async def agentic_hockey_chat(user_active_role: str, user_team: str, user_prompt: str) -> dict: """Main chat function with hockey database integration""" logging.info(f"Processing question: {user_prompt}, role: {user_active_role}, team: {user_team}") # Sanitize user prompt if not user_prompt or not isinstance(user_prompt, str): logging.error("Invalid or empty user_prompt.") return {"ai_response": "Question cannot be empty.", "recommended_content_details": []} user_prompt = re.sub(r'\s+', ' ', user_prompt.strip()) # Improved language detection - clean text first and handle edge cases try: # Clean text for better language detection clean_text = re.sub(r'[^\w\s]', ' ', user_prompt) clean_text = re.sub(r'\s+', ' ', clean_text).strip() if len(clean_text) >= 2: # Need at least 2 characters for detection user_lang = detect(clean_text) if user_lang not in ["en", "nl"]: user_lang = "en" else: user_lang = "en" # Default for very short inputs except Exception as e: logging.debug(f"Language detection failed for '{user_prompt}': {e}") user_lang = "en" # Get both translated and original prompts processing_prompt, original_prompt = preprocess_prompt(user_prompt, user_lang) logging.info(f"Processing prompt: {processing_prompt}") # Handle greetings (only pure greetings, not hockey questions with greetings) if is_greeting_or_vague(user_prompt, user_lang): logging.info(f"Detected pure greeting (no hockey content): '{user_prompt}' (lang: {user_lang})") if user_lang == "nl": answer = "Hallo! Ik ben hier om je te helpen met hockey-gerelateerde vragen. Vraag me gerust iets over training, oefeningen, technieken of strategieën!" else: answer = "Hello! I'm here to help you with hockey-related questions. Feel free to ask me about training, drills, techniques, or strategies!" update_conversation_history(user_active_role, user_team, user_prompt, answer) return {"ai_response": answer, "recommended_content_details": []} # Check domain if not is_in_domain(processing_prompt): answer = "Sorry, I can only assist with questions about hockey, such as training, drills, strategies, rules, and tutorials. Please ask a hockey-related question!" if user_lang == "en" else "Sorry, ik kan alleen helpen met vragen over hockey, zoals training, oefeningen, strategieën, regels en tutorials. Stel me een hockeygerelateerde vraag!" update_conversation_history(user_active_role, user_team, user_prompt, answer) return {"ai_response": answer, "recommended_content_details": []} history = get_conversation_history(user_active_role, user_team) system_prompt = ( f"You are an AI Assistant Bot specialized in field hockey, including training, drills, strategies, rules, and more. " f"You communicate with a {user_active_role} from the team {user_team}. " f"Provide concise, practical, and specific answers tailored to the user's role and team. " f"Focus on field hockey-related topics such as training, drills, strategies, rules, and tutorials.\n\n" f"Recent conversation:\n{history or 'No previous conversations.'}\n\n" f"Answer the following question in English:\n{processing_prompt}" ) payload = { "model": "openai/gpt-4o", "messages": [ {"role": "system", "content": system_prompt} ], "max_tokens": 150, "temperature": 0.3, "top_p": 0.9 } headers = { "Authorization": f"Bearer {OPENROUTER_API_KEY}", "Content-Type": "application/json" } try: if not OPENROUTER_API_KEY: return {"ai_response": "OpenRouter API key not configured. Please set OPENROUTER_API_KEY environment variable.", "recommended_content_details": []} logging.info("Making OpenRouter API call...") async with httpx.AsyncClient(timeout=30) as client: response = await client.post(OPENROUTER_API_URL, json=payload, headers=headers) response.raise_for_status() data = response.json() answer = data.get("choices", [{}])[0].get("message", {}).get("content", "").strip() if not answer: logging.error("No answer received from OpenRouter API.") return {"ai_response": "No answer received from the API.", "recommended_content_details": []} # Remove URLs from answer and translate answer = re.sub(r'https?://\S+', '', answer).strip() answer = translate_text(answer, "en", user_lang) # Search for recommended content from Azure DB and embeddings logging.info("Searching for relevant content...") recommended_content = search_hockey_content(processing_prompt, original_prompt if user_lang == "nl" else "") # Format recommended content details recommended_content_details = [] for item in recommended_content: content_detail = { "title": item["title"], "type": item["type"], "source": item["source_table"], "similarity": item["similarity"] } # Add URL for multimedia items, content for others if item["type"] == "multimedia": # Ensure URL is included for multimedia items from Azure DB content_detail["url"] = item.get("url", "") logging.debug(f"Multimedia item: {item['title']} - URL: {content_detail['url']}") else: # For exercise and serie items, include content/description content_detail["content"] = item.get("content", "") recommended_content_details.append(content_detail) update_conversation_history(user_active_role, user_team, user_prompt, answer) return {"ai_response": answer, "recommended_content_details": recommended_content_details} except httpx.HTTPStatusError as e: logging.error(f"OpenRouter API error: Status {e.response.status_code}") return {"ai_response": f"API error: {e.response.status_code}", "recommended_content_details": []} except httpx.TimeoutException: logging.error("OpenRouter API timeout") return {"ai_response": "Request timed out. Please try again.", "recommended_content_details": []} except httpx.NetworkError as e: logging.error(f"Network error: {str(e)}") return {"ai_response": "Network error occurred. Please check your connection and try again.", "recommended_content_details": []} except Exception as e: logging.error(f"Internal error: {str(e)}") return {"ai_response": f"Internal error: {str(e)}", "recommended_content_details": []} # Initialize resources on import - graceful fallback for HuggingFace try: load_resources() logging.info("Successfully initialized Hockey Mind AI") except Exception as e: logging.warning(f"Failed to initialize full resources: {e}") logging.info("Running in basic mode - hockey advice available without advanced features")