# PawMatchAI / query_understanding.py
# Author: DawnC
# Update query_understanding.py (commit 089125f, verified)
import re
import json
import numpy as np
import sqlite3
from typing import Dict, List, Tuple, Optional, Any
from dataclasses import dataclass, field
import traceback
from sentence_transformers import SentenceTransformer
from dog_database import get_dog_description
from breed_health_info import breed_health_info
from breed_noise_info import breed_noise_info
@dataclass
class QueryDimensions:
    """Structured query intent data structure.

    Each list collects the category labels detected for one dimension of a
    user's query; an empty list means the dimension was not detected.
    """
    spatial_constraints: List[str] = field(default_factory=list)  # 'apartment' / 'house'
    activity_level: List[str] = field(default_factory=list)  # 'low' / 'moderate' / 'high'
    noise_preferences: List[str] = field(default_factory=list)  # 'low' / 'moderate' / 'high'
    size_preferences: List[str] = field(default_factory=list)  # 'small' / 'medium' / 'large'
    family_context: List[str] = field(default_factory=list)  # 'children' / 'elderly' / 'single'
    maintenance_level: List[str] = field(default_factory=list)  # 'low' / 'moderate' / 'high'
    special_requirements: List[str] = field(default_factory=list)  # e.g. 'guard', 'hypoallergenic'
    breed_mentions: List[str] = field(default_factory=list)  # raw breed ids, e.g. 'Golden_Retriever'
    confidence_scores: Dict[str, float] = field(default_factory=dict)  # dimension name -> 0.0-1.0 score
@dataclass
class DimensionalSynonyms:
    """Dimensional synonyms dictionary structure.

    Each field maps a category label (e.g. 'apartment', 'low') to the list
    of keyword phrases that signal it in a user's query.
    """
    spatial: Dict[str, List[str]] = field(default_factory=dict)  # 'apartment' / 'house'
    activity: Dict[str, List[str]] = field(default_factory=dict)  # 'low' / 'moderate' / 'high'
    noise: Dict[str, List[str]] = field(default_factory=dict)  # 'low' / 'moderate' / 'high'
    size: Dict[str, List[str]] = field(default_factory=dict)  # 'small' / 'medium' / 'large'
    family: Dict[str, List[str]] = field(default_factory=dict)  # 'children' / 'elderly' / 'single'
    maintenance: Dict[str, List[str]] = field(default_factory=dict)  # 'low' / 'moderate' / 'high'
    special: Dict[str, List[str]] = field(default_factory=dict)  # 'guard', 'companion', 'hypoallergenic', 'first_time'
class QueryUnderstandingEngine:
    """
    Multi-dimensional semantic query understanding engine.

    Parses Chinese/English natural-language queries into a structured
    QueryDimensions object for breed recommendation. Keyword matching
    always runs; SBERT-based semantic matching is layered on top when a
    model can be loaded.
    """

    def __init__(self):
        """Initialize the query understanding engine."""
        self.sbert_model = None
        self._sbert_loading_attempted = False
        self.breed_list = self._load_breed_list()
        self.synonyms = self._initialize_synonyms()
        self.semantic_templates = {}
        # Defer SBERT loading until first use so CUDA is only touched inside
        # the GPU context (avoids ZeroGPU CUDA-initialization problems).
        print("QueryUnderstandingEngine initialized (SBERT loading deferred)")

    def _load_breed_list(self) -> List[str]:
        """Load the breed list from the local SQLite catalog.

        Returns:
            List of raw breed identifiers (e.g. 'Labrador_Retriever');
            a hard-coded fallback list if the database is unavailable.
        """
        try:
            conn = sqlite3.connect('animal_detector.db')
            try:
                cursor = conn.cursor()
                cursor.execute("SELECT DISTINCT Breed FROM AnimalCatalog")
                breeds = [row[0] for row in cursor.fetchall()]
                cursor.close()
            finally:
                # BUGFIX: always release the connection; the original leaked
                # it whenever the query raised after a successful connect.
                conn.close()
            return breeds
        except Exception as e:
            print(f"Error loading breed list: {str(e)}")
            # Fallback breed list
            return ['Labrador_Retriever', 'German_Shepherd', 'Golden_Retriever',
                    'Bulldog', 'Poodle', 'Beagle', 'Border_Collie', 'Yorkshire_Terrier']

    def _initialize_sbert_model(self):
        """Lazily initialize the SBERT model (deferred to avoid ZeroGPU CUDA
        initialization issues).

        Tries each candidate model once; on total failure the engine falls
        back to keyword-only analysis.

        Returns:
            The loaded SentenceTransformer model, or None if loading failed.
        """
        # Only ever attempt loading once per engine instance.
        if self.sbert_model is not None or getattr(self, '_sbert_loading_attempted', False):
            return self.sbert_model
        try:
            print("Loading SBERT model for query understanding in GPU context...")
            model_options = ['all-MiniLM-L6-v2', 'all-mpnet-base-v2', 'all-MiniLM-L12-v2']
            for model_name in model_options:
                try:
                    import torch
                    device = 'cuda' if torch.cuda.is_available() else 'cpu'
                    self.sbert_model = SentenceTransformer(model_name, device=device)
                    print(f"SBERT model {model_name} loaded successfully for query understanding on {device}")
                    # BUGFIX: build the template embeddings now that a model is
                    # available. Previously _build_semantic_templates was never
                    # invoked anywhere, so semantic_templates stayed empty and
                    # _extract_semantic_dimensions was effectively dead code.
                    self._build_semantic_templates()
                    return self.sbert_model
                except Exception as e:
                    print(f"Failed to load {model_name}: {str(e)}")
                    continue
            print("All SBERT models failed to load. Using keyword-only analysis.")
            self.sbert_model = None
            return None
        except Exception as e:
            print(f"Failed to initialize SBERT model: {str(e)}")
            self.sbert_model = None
            return None
        finally:
            self._sbert_loading_attempted = True

    def _initialize_synonyms(self) -> DimensionalSynonyms:
        """Build the multi-dimensional synonym dictionary used for keyword matching."""
        return DimensionalSynonyms(
            spatial={
                'apartment': ['apartment', 'flat', 'condo', 'small space', 'city living',
                              'urban', 'no yard', 'indoor'],
                'house': ['house', 'home', 'yard', 'garden', 'backyard', 'large space',
                          'suburban', 'rural', 'farm']
            },
            activity={
                'low': ['low activity', 'sedentary', 'couch potato', 'minimal exercise',
                        'indoor lifestyle', 'lazy', 'calm'],
                'moderate': ['moderate activity', 'daily walks', 'light exercise',
                             'regular walks'],
                'high': ['high activity', 'energetic', 'active', 'exercise', 'hiking',
                         'running', 'jogging', 'outdoor sports']
            },
            noise={
                'low': ['quiet', 'silent', 'no barking', 'peaceful', 'low noise',
                        'rarely barks', 'soft-spoken'],
                'moderate': ['moderate barking', 'occasional barking'],
                'high': ['loud', 'barking', 'vocal', 'noisy', 'frequent barking',
                         'alert dog']
            },
            size={
                'small': ['small', 'tiny', 'little', 'compact', 'miniature', 'toy',
                          'lap dog'],
                'medium': ['medium', 'moderate size', 'average', 'mid-sized'],
                'large': ['large', 'big', 'giant', 'huge', 'massive', 'great']
            },
            family={
                'children': ['children', 'kids', 'family', 'child-friendly', 'toddler',
                             'baby', 'school age'],
                'elderly': ['elderly', 'senior', 'old people', 'retirement', 'aged'],
                'single': ['single', 'alone', 'individual', 'solo', 'myself']
            },
            maintenance={
                'low': ['low maintenance', 'easy care', 'simple', 'minimal grooming',
                        'wash and go'],
                'moderate': ['moderate maintenance', 'regular grooming'],
                'high': ['high maintenance', 'professional grooming', 'daily brushing',
                         'care intensive']
            },
            special={
                'guard': ['guard dog', 'protection', 'security', 'watchdog',
                          'protective', 'defender'],
                'companion': ['companion', 'therapy', 'emotional support', 'comfort',
                              'cuddly', 'lap dog'],
                'hypoallergenic': ['hypoallergenic', 'allergies', 'non-shedding',
                                   'allergy-friendly', 'no shed'],
                'first_time': ['first time', 'beginner', 'new to dogs', 'inexperienced',
                               'never owned']
            }
        )

    def _build_semantic_templates(self):
        """Build template embedding vectors (only when SBERT is available)."""
        # Ensure the SBERT model is loaded before encoding.
        if self.sbert_model is None:
            self._initialize_sbert_model()
        if not self.sbert_model:
            return
        try:
            # One canonical sentence per (dimension, category) pair.
            templates = {
                'spatial_apartment': "I live in an apartment with limited space and no yard",
                'spatial_house': "I live in a house with a large yard and outdoor space",
                'activity_low': "I prefer a calm, low-energy dog that doesn't need much exercise",
                'activity_high': "I want an active, energetic dog for hiking and outdoor activities",
                'noise_low': "I need a quiet dog that rarely barks and won't disturb neighbors",
                'noise_high': "I don't mind a vocal dog that barks and makes noise",
                'size_small': "I prefer small, compact dogs that are easy to handle",
                'size_large': "I want a large, impressive dog with strong presence",
                'family_children': "I have young children and need a child-friendly dog",
                'family_elderly': "I'm looking for a calm companion dog for elderly person",
                'maintenance_low': "I want a low-maintenance dog that's easy to care for",
                'maintenance_high': "I don't mind high-maintenance dogs requiring professional grooming"
            }
            # Encode each template once; embeddings are reused for every query.
            for key, template in templates.items():
                embedding = self.sbert_model.encode(template, convert_to_tensor=False)
                self.semantic_templates[key] = embedding
            print(f"Built {len(self.semantic_templates)} semantic templates")
        except Exception as e:
            print(f"Error building semantic templates: {str(e)}")
            self.semantic_templates = {}

    def analyze_query(self, user_input: str) -> QueryDimensions:
        """
        Analyze a user query and extract multi-dimensional intent.

        Args:
            user_input: The user's natural-language query.

        Returns:
            QueryDimensions: Structured query dimensions (an empty
            QueryDimensions on unexpected failure).
        """
        try:
            # Normalize the input text for keyword matching.
            normalized_input = user_input.lower().strip()
            # Keyword-based dimension analysis (always available).
            dimensions = self._extract_keyword_dimensions(normalized_input)
            # If SBERT is available, enhance the result with semantic analysis.
            if self.sbert_model is None:
                self._initialize_sbert_model()
            if self.sbert_model:
                semantic_dimensions = self._extract_semantic_dimensions(user_input)
                dimensions = self._merge_dimensions(dimensions, semantic_dimensions)
            # Extract breed mentions.
            dimensions.breed_mentions = self._extract_breed_mentions(normalized_input)
            # Compute confidence scores.
            dimensions.confidence_scores = self._calculate_confidence_scores(dimensions, user_input)
            return dimensions
        except Exception as e:
            print(f"Error analyzing query: {str(e)}")
            print(traceback.format_exc())
            # Return an empty dimension structure on failure.
            return QueryDimensions()

    def _extract_keyword_dimensions(self, text: str) -> QueryDimensions:
        """Extract dimensions by keyword matching against the synonym tables.

        A category is recorded when any of its keyword phrases appears as a
        substring of the normalized query text.
        """
        dimensions = QueryDimensions()
        # Every synonym table feeds one list on the result object with the
        # same matching rule, so drive all seven from a single table instead
        # of seven copy-pasted loops.
        table_to_target = [
            (self.synonyms.spatial, dimensions.spatial_constraints),
            (self.synonyms.activity, dimensions.activity_level),
            (self.synonyms.noise, dimensions.noise_preferences),
            (self.synonyms.size, dimensions.size_preferences),
            (self.synonyms.family, dimensions.family_context),
            (self.synonyms.maintenance, dimensions.maintenance_level),
            (self.synonyms.special, dimensions.special_requirements),
        ]
        for synonym_table, target in table_to_target:
            for category, keywords in synonym_table.items():
                if any(keyword in text for keyword in keywords):
                    target.append(category)
        return dimensions

    def _extract_semantic_dimensions(self, text: str) -> QueryDimensions:
        """Extract dimensions via semantic similarity (requires SBERT).

        Computes cosine similarity between the query embedding and each
        pre-built template embedding; templates above the threshold mark
        their dimension/category.
        """
        if not self.sbert_model or not self.semantic_templates:
            return QueryDimensions()
        try:
            # Encode the raw (non-normalized) query.
            query_embedding = self.sbert_model.encode(text, convert_to_tensor=False)
            dimensions = QueryDimensions()
            # Cosine similarity against every template.
            similarities = {}
            for template_key, template_embedding in self.semantic_templates.items():
                similarity = np.dot(query_embedding, template_embedding) / (
                    np.linalg.norm(query_embedding) * np.linalg.norm(template_embedding)
                )
                similarities[template_key] = similarity
            # Similarity threshold for accepting a template match.
            threshold = 0.5
            # Template-key prefix -> result list to append the suffix to.
            prefix_to_target = [
                ('spatial_', dimensions.spatial_constraints),
                ('activity_', dimensions.activity_level),
                ('noise_', dimensions.noise_preferences),
                ('size_', dimensions.size_preferences),
                ('family_', dimensions.family_context),
                ('maintenance_', dimensions.maintenance_level),
            ]
            for template_key, similarity in similarities.items():
                if similarity <= threshold:
                    continue
                for prefix, target in prefix_to_target:
                    if template_key.startswith(prefix):
                        value = template_key[len(prefix):]
                        if value not in target:
                            target.append(value)
                        break
            return dimensions
        except Exception as e:
            print(f"Error in semantic dimension extraction: {str(e)}")
            return QueryDimensions()

    def _extract_breed_mentions(self, text: str) -> List[str]:
        """Return the raw breed identifiers whose names appear in the text.

        Matches either the full display name ('golden retriever') or, to
        tolerate shorthand, the leading word alone when it is longer than
        three characters.
        """
        mentioned_breeds = []
        for breed in self.breed_list:
            # Convert 'Golden_Retriever' to its display form 'golden retriever'.
            breed_display = breed.replace('_', ' ').lower()
            breed_words = breed_display.split()
            main_word = breed_words[0] if breed_words else ""
            if breed_display in text or (len(main_word) > 3 and main_word in text):
                mentioned_breeds.append(breed)
        return mentioned_breeds

    def _merge_dimensions(self, keyword_dims: QueryDimensions,
                          semantic_dims: QueryDimensions) -> QueryDimensions:
        """Merge keyword and semantic dimensions (union with de-duplication).

        NOTE: set() de-duplication does not preserve element order, matching
        the original behavior; downstream consumers must not rely on order.
        """
        merged = QueryDimensions()
        for attr in ('spatial_constraints', 'activity_level', 'noise_preferences',
                     'size_preferences', 'family_context', 'maintenance_level',
                     'special_requirements'):
            combined = getattr(keyword_dims, attr) + getattr(semantic_dims, attr)
            setattr(merged, attr, list(set(combined)))
        return merged

    def _calculate_confidence_scores(self, dimensions: QueryDimensions,
                                     original_text: str) -> Dict[str, float]:
        """Compute a 0.0-1.0 confidence score per dimension plus 'overall'.

        Per-dimension scores scale with the number of matched categories;
        'overall' normalizes the total match count by query length in words.
        """
        confidence_scores = {}
        text_length = len(original_text.split())
        # (score key, matched values, per-match weight); breed mentions use a
        # lower per-match weight than the other dimensions.
        scored_dimensions = [
            ('spatial', dimensions.spatial_constraints, 0.5),
            ('activity', dimensions.activity_level, 0.5),
            ('noise', dimensions.noise_preferences, 0.5),
            ('size', dimensions.size_preferences, 0.5),
            ('family', dimensions.family_context, 0.5),
            ('maintenance', dimensions.maintenance_level, 0.5),
            ('special', dimensions.special_requirements, 0.5),
            ('breeds', dimensions.breed_mentions, 0.3),
        ]
        total_matches = 0
        for key, values, weight in scored_dimensions:
            matches = len(values)
            total_matches += matches
            confidence_scores[key] = min(1.0, matches * weight)
        # Overall confidence: total matches relative to text length.
        confidence_scores['overall'] = min(1.0, total_matches / max(1, text_length * 0.1))
        return confidence_scores

    def get_dimension_summary(self, dimensions: QueryDimensions) -> Dict[str, Any]:
        """Return a plain-dict summary of the detected dimensions.

        Breed identifiers are converted to display form (underscores become
        spaces); 'total_dimensions_detected' counts matched categories across
        the seven preference dimensions (breed mentions excluded).
        """
        return {
            'spatial_constraints': dimensions.spatial_constraints,
            'activity_level': dimensions.activity_level,
            'noise_preferences': dimensions.noise_preferences,
            'size_preferences': dimensions.size_preferences,
            'family_context': dimensions.family_context,
            'maintenance_level': dimensions.maintenance_level,
            'special_requirements': dimensions.special_requirements,
            'breed_mentions': [breed.replace('_', ' ') for breed in dimensions.breed_mentions],
            'confidence_scores': dimensions.confidence_scores,
            'total_dimensions_detected': sum([
                len(dimensions.spatial_constraints),
                len(dimensions.activity_level),
                len(dimensions.noise_preferences),
                len(dimensions.size_preferences),
                len(dimensions.family_context),
                len(dimensions.maintenance_level),
                len(dimensions.special_requirements)
            ])
        }
def analyze_user_query(user_input: str) -> QueryDimensions:
    """
    Convenience function: analyze a user query with a fresh engine.

    Args:
        user_input: The user's natural-language query.

    Returns:
        QueryDimensions: The structured query dimensions.
    """
    return QueryUnderstandingEngine().analyze_query(user_input)
def get_query_summary(user_input: str) -> Dict[str, Any]:
    """
    Convenience function: analyze a user query and return its summary.

    Args:
        user_input: The user's natural-language query.

    Returns:
        Dict: A summary of the detected query dimensions.
    """
    engine = QueryUnderstandingEngine()
    return engine.get_dimension_summary(engine.analyze_query(user_input))