# PawMatchAI / natural_language_processor.py
import re
from typing import Dict, List, Tuple, Optional, Any
class NaturalLanguageProcessor:
"""
Natural language processing utility class
    Handles text preprocessing, keyword and preference extraction, and lightweight sentiment analysis for user input
"""
def __init__(self):
"""Initialize the natural language processor"""
self.stop_words = {
'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for', 'from',
'has', 'he', 'in', 'is', 'it', 'its', 'of', 'on', 'that', 'the',
'to', 'was', 'will', 'with', 'would', 'i', 'me', 'my', 'we', 'us',
'our', 'you', 'your', 'they', 'them', 'their'
}
# Breed name mappings (common aliases to standard names)
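        # (a few aliases presumably map to the nearest label available to the breed
        #  classifier, e.g. 'aussie' -> 'kelpie', 'pitbull' -> 'american_staffordshire_terrier')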
self.breed_aliases = {
'lab': 'labrador_retriever',
'labrador': 'labrador_retriever',
'golden': 'golden_retriever',
'retriever': ['labrador_retriever', 'golden_retriever'],
'german shepherd': 'german_shepherd',
'shepherd': 'german_shepherd',
'border collie': 'border_collie',
'collie': ['border_collie', 'collie'],
'bulldog': ['french_bulldog', 'english_bulldog'],
'french bulldog': 'french_bulldog',
'poodle': ['standard_poodle', 'miniature_poodle', 'toy_poodle'],
'husky': 'siberian_husky',
'siberian husky': 'siberian_husky',
'beagle': 'beagle',
'yorkshire terrier': 'yorkshire_terrier',
'yorkie': 'yorkshire_terrier',
'chihuahua': 'chihuahua',
'dachshund': 'dachshund',
'wiener dog': 'dachshund',
'rottweiler': 'rottweiler',
'rottie': 'rottweiler',
'boxer': 'boxer',
'great dane': 'great_dane',
'dane': 'great_dane',
'mastiff': ['bull_mastiff', 'tibetan_mastiff'],
'pitbull': 'american_staffordshire_terrier',
'pit bull': 'american_staffordshire_terrier',
'shih tzu': 'shih-tzu',
'maltese': 'maltese_dog',
'pug': 'pug',
'basset hound': 'basset',
'bloodhound': 'bloodhound',
'australian shepherd': 'kelpie',
'aussie': 'kelpie'
}
# Lifestyle keyword mappings
self.lifestyle_keywords = {
'living_space': {
'apartment': ['apartment', 'flat', 'condo', 'small space', 'city living', 'urban'],
'house': ['house', 'home', 'yard', 'garden', 'suburban', 'large space'],
'farm': ['farm', 'rural', 'country', 'acreage', 'ranch']
},
'activity_level': {
'very_high': ['very active', 'extremely energetic', 'marathon runner', 'athlete'],
'high': ['active', 'energetic', 'exercise', 'hiking', 'running', 'outdoor activities',
'sports', 'jogging', 'biking', 'adventure'],
'moderate': ['moderate exercise', 'some activity', 'weekend walks', 'occasional exercise'],
'low': ['calm', 'lazy', 'indoor', 'low energy', 'couch potato', 'sedentary', 'quiet lifestyle']
},
'family_situation': {
'children': ['children', 'kids', 'toddlers', 'babies', 'family with children', 'young family'],
'elderly': ['elderly', 'senior', 'old', 'retired', 'senior citizen'],
'single': ['single', 'alone', 'individual', 'bachelor', 'solo'],
'couple': ['couple', 'two people', 'pair', 'duo']
},
'noise_tolerance': {
'low': ['quiet', 'silent', 'noise-sensitive', 'peaceful', 'no barking', 'minimal noise'],
'moderate': ['some noise ok', 'moderate barking', 'normal noise'],
'high': ['loud ok', 'barking fine', 'noise tolerant', 'doesn\'t mind noise']
},
'size_preference': {
'small': ['small', 'tiny', 'little', 'compact', 'lap dog', 'petite', 'miniature'],
'medium': ['medium', 'moderate size', 'average', 'mid-size'],
'large': ['large', 'big', 'huge', 'giant', 'massive', 'substantial'],
'varies': ['any size', 'size doesn\'t matter', 'flexible on size']
},
'experience_level': {
'beginner': ['first time', 'beginner', 'new to dogs', 'inexperienced', 'never had'],
'some': ['some experience', 'had dogs before', 'moderate experience'],
'experienced': ['experienced', 'expert', 'very experienced', 'professional', 'trainer']
},
'grooming_commitment': {
'low': ['low maintenance', 'easy care', 'minimal grooming', 'wash and go'],
'moderate': ['moderate grooming', 'some brushing', 'regular care'],
'high': ['high maintenance', 'lots of grooming', 'professional grooming', 'daily brushing']
},
'special_needs': {
'guard': ['guard dog', 'protection', 'security', 'watchdog', 'guardian'],
'therapy': ['therapy dog', 'emotional support', 'comfort', 'calm companion'],
'hypoallergenic': ['hypoallergenic', 'allergies', 'non-shedding', 'allergy friendly'],
'working': ['working dog', 'job', 'task', 'service dog'],
'companion': ['companion', 'friend', 'buddy', 'lap dog', 'cuddle']
}
}
# Comparative preference keywords
self.preference_indicators = {
'love': 1.0,
'prefer': 0.9,
'like': 0.8,
'want': 0.8,
'interested in': 0.7,
'considering': 0.6,
'ok with': 0.5,
'don\'t mind': 0.4,
'not interested': 0.2,
'dislike': 0.1,
'hate': 0.0
}
# Order keywords
self.order_keywords = {
'first': 1.0, 'most': 1.0, 'primary': 1.0, 'main': 1.0,
'second': 0.8, 'then': 0.8, 'next': 0.8,
'third': 0.6, 'also': 0.6, 'additionally': 0.6,
'last': 0.4, 'least': 0.4, 'finally': 0.4
}
def preprocess_text(self, text: str) -> str:
"""
Text preprocessing
Args:
text: Raw text
Returns:
Preprocessed text
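        Example:
            "Looking for a SMALL, quiet dog" -> "looking for a small quiet dog"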
"""
if not text:
return ""
# Convert to lowercase
text = text.lower().strip()
# Remove punctuation (keep some meaningful ones)
text = re.sub(r'[^\w\s\-\']', ' ', text)
# Handle extra whitespace
text = re.sub(r'\s+', ' ', text)
return text
def extract_breed_mentions(self, text: str) -> List[Tuple[str, float]]:
"""
Extract mentioned breeds and their preference levels from text
Args:
text: Input text
Returns:
List of (breed_name, preference_score) tuples
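        Example:
            "I really want a beagle" -> [('beagle', 0.8)]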
"""
text = self.preprocess_text(text)
breed_mentions = []
try:
            # Check each breed alias (whole-word match so 'lab' does not hit 'available')
            for alias, standard_breed in self.breed_aliases.items():
                if re.search(r'\b' + re.escape(alias) + r'\b', text):
# Find surrounding preference indicators
preference_score = self._find_preference_score(text, alias)
if isinstance(standard_breed, list):
# If alias maps to multiple breeds, add all
for breed in standard_breed:
breed_mentions.append((breed, preference_score))
else:
breed_mentions.append((standard_breed, preference_score))
# Deduplicate and merge scores
breed_scores = {}
for breed, score in breed_mentions:
if breed in breed_scores:
breed_scores[breed] = max(breed_scores[breed], score)
else:
breed_scores[breed] = score
return list(breed_scores.items())
except Exception as e:
print(f"Error extracting breed mentions: {str(e)}")
return []
def _find_preference_score(self, text: str, breed_mention: str) -> float:
"""
Find preference score near breed mention
Args:
text: Text
breed_mention: Breed mention
Returns:
Preference score (0.0-1.0)
"""
try:
# Find breed mention position
mention_pos = text.find(breed_mention)
if mention_pos == -1:
return 0.5 # Default neutral score
# Check context (50 characters before and after)
context_start = max(0, mention_pos - 50)
context_end = min(len(text), mention_pos + len(breed_mention) + 50)
context = text[context_start:context_end]
# Find preference indicators
max_score = 0.5 # Default score
for indicator, score in self.preference_indicators.items():
if indicator in context:
max_score = max(max_score, score)
            # Weight by order keywords ('first' keeps the full score, 'last' scales it down)
            for order_word, multiplier in self.order_keywords.items():
                if order_word in context:
                    max_score *= multiplier
                    break
return max_score
except Exception as e:
print(f"Error finding preference score: {str(e)}")
return 0.5
def extract_lifestyle_preferences(self, text: str) -> Dict[str, Dict[str, float]]:
"""
Extract lifestyle preferences from text
Args:
text: Input text
Returns:
Lifestyle preferences dictionary
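        Example:
            "apartment with kids" -> {'living_space': {'apartment': 1.0},
            'family_situation': {'children': 1.0}, ...} (unmatched categories stay empty)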
"""
text = self.preprocess_text(text)
preferences = {}
try:
            for category, keywords_dict in self.lifestyle_keywords.items():
                preferences[category] = {}
                for preference_type, keywords in keywords_dict.items():
                    # Count whole-word occurrences of this group's keywords
                    # (so 'old' does not match inside 'household')
                    occurrences = sum(
                        len(re.findall(r'\b' + re.escape(keyword) + r'\b', text))
                        for keyword in keywords
                    )
                    if occurrences > 0:
                        # A single mention already signals the preference; the score is capped at 1.0
                        preferences[category][preference_type] = min(float(occurrences), 1.0)
return preferences
except Exception as e:
print(f"Error extracting lifestyle preferences: {str(e)}")
return {}
def generate_search_keywords(self, text: str) -> List[str]:
"""
Generate keyword list for search
Args:
text: Input text
Returns:
List of keywords
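        Example:
            "quiet apartment dog" -> ['quiet', 'apartment', 'dog', 'apartment dog'] (order may vary)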
"""
text = self.preprocess_text(text)
keywords = []
try:
# Tokenize and filter stop words
words = text.split()
for word in words:
if len(word) > 2 and word not in self.stop_words:
keywords.append(word)
# Extract important phrases
phrases = self._extract_phrases(text)
keywords.extend(phrases)
# Remove duplicates
keywords = list(set(keywords))
return keywords
except Exception as e:
print(f"Error generating search keywords: {str(e)}")
return []
def _extract_phrases(self, text: str) -> List[str]:
"""
Extract important phrases
Args:
text: Input text
Returns:
List of phrases
"""
phrases = []
# Define important phrase patterns
phrase_patterns = [
r'good with \w+',
r'apartment \w+',
r'family \w+',
r'exercise \w+',
r'grooming \w+',
r'noise \w+',
r'training \w+',
r'health \w+',
r'\w+ friendly',
r'\w+ tolerant',
r'\w+ maintenance',
r'\w+ energy',
r'\w+ barking',
r'\w+ shedding'
]
for pattern in phrase_patterns:
matches = re.findall(pattern, text)
phrases.extend(matches)
return phrases
def analyze_sentiment(self, text: str) -> Dict[str, float]:
"""
Analyze text sentiment
Args:
text: Input text
Returns:
Sentiment analysis results {'positive': 0.0-1.0, 'negative': 0.0-1.0, 'neutral': 0.0-1.0}
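        Example:
            "I love calm dogs" -> {'positive': 0.25, 'negative': 0.0, 'neutral': 0.75}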
"""
text = self.preprocess_text(text)
positive_words = [
'love', 'like', 'want', 'prefer', 'good', 'great', 'excellent',
'perfect', 'ideal', 'wonderful', 'amazing', 'fantastic'
]
negative_words = [
'hate', 'dislike', 'bad', 'terrible', 'awful', 'horrible',
'not good', 'don\'t want', 'avoid', 'against', 'problem'
]
        # Count whole-word/phrase matches so 'like' inside 'dislike' is not counted as positive
        positive_count = sum(1 for word in positive_words if re.search(r'\b' + re.escape(word) + r'\b', text))
        negative_count = sum(1 for word in negative_words if re.search(r'\b' + re.escape(word) + r'\b', text))
total_words = len(text.split())
if total_words == 0:
return {'positive': 0.0, 'negative': 0.0, 'neutral': 1.0}
positive_ratio = positive_count / total_words
negative_ratio = negative_count / total_words
neutral_ratio = 1.0 - positive_ratio - negative_ratio
return {
'positive': positive_ratio,
'negative': negative_ratio,
'neutral': max(0.0, neutral_ratio)
}
def extract_implicit_preferences(self, text: str) -> Dict[str, Any]:
"""
Extract implicit preferences from text
Args:
text: Input text
Returns:
Dictionary of implicit preferences
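        Example:
            "first time owner in an apartment" -> {'size_preference': 'small_to_medium',
            'noise_tolerance': 'low', 'exercise_needs': 'moderate',
            'care_level': 'low_to_moderate', 'training_difficulty': 'easy'}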
"""
text = self.preprocess_text(text)
implicit_prefs = {}
try:
# Infer preferences from mentioned activities
if any(activity in text for activity in ['hiking', 'running', 'jogging', 'outdoor']):
implicit_prefs['exercise_needs'] = 'high'
implicit_prefs['size_preference'] = 'medium_to_large'
# Infer from living environment
if any(env in text for env in ['apartment', 'small space', 'city']):
implicit_prefs['size_preference'] = 'small_to_medium'
implicit_prefs['noise_tolerance'] = 'low'
implicit_prefs['exercise_needs'] = 'moderate'
# Infer from family situation
if 'children' in text or 'kids' in text:
implicit_prefs['temperament'] = 'gentle_patient'
implicit_prefs['good_with_children'] = True
# Infer from experience level
if any(exp in text for exp in ['first time', 'beginner', 'new to']):
implicit_prefs['care_level'] = 'low_to_moderate'
implicit_prefs['training_difficulty'] = 'easy'
# Infer from time commitment
if any(time in text for time in ['busy', 'no time', 'low maintenance']):
implicit_prefs['grooming_needs'] = 'low'
implicit_prefs['care_level'] = 'low'
implicit_prefs['exercise_needs'] = 'low_to_moderate'
return implicit_prefs
except Exception as e:
print(f"Error extracting implicit preferences: {str(e)}")
return {}
def validate_input(self, text: str) -> Dict[str, Any]:
"""
        Validate the input text
Args:
text: Input text
Returns:
Validation results dictionary
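        Example:
            "nice dog" -> {'is_valid': False, 'error': 'Input too short', ...}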
"""
if not text or not text.strip():
return {
'is_valid': False,
'error': 'Empty input',
'suggestions': ['Please provide a description of your preferences']
}
text = text.strip()
# Check length
if len(text) < 10:
return {
'is_valid': False,
'error': 'Input too short',
'suggestions': ['Please provide more details about your preferences']
}
if len(text) > 1000:
return {
'is_valid': False,
'error': 'Input too long',
'suggestions': ['Please provide a more concise description']
}
# Check for meaningful content
processed_text = self.preprocess_text(text)
meaningful_words = [word for word in processed_text.split()
if len(word) > 2 and word not in self.stop_words]
if len(meaningful_words) < 3:
return {
'is_valid': False,
'error': 'Not enough meaningful content',
'suggestions': ['Please provide more specific details about your lifestyle and preferences']
}
return {
'is_valid': True,
'word_count': len(meaningful_words),
'suggestions': []
}
def get_nlp_processor() -> Optional[NaturalLanguageProcessor]:
    """Get a natural language processor instance, or None if creation fails"""
try:
return NaturalLanguageProcessor()
except Exception as e:
print(f"Error creating NLP processor: {str(e)}")
return None
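

# The block below is an illustrative usage sketch rather than part of the original module:
# it simply exercises the public methods above on a sample description and assumes the
# file is run directly (python natural_language_processor.py).
if __name__ == "__main__":
    processor = get_nlp_processor()
    if processor is not None:
        sample = ("We live in a small apartment with two kids and would love a quiet, "
                  "low maintenance dog. We like a labrador but a poodle would be fine too.")
        validation = processor.validate_input(sample)
        if validation['is_valid']:
            print("Breeds:", processor.extract_breed_mentions(sample))
            print("Lifestyle:", processor.extract_lifestyle_preferences(sample))
            print("Keywords:", processor.generate_search_keywords(sample))
            print("Sentiment:", processor.analyze_sentiment(sample))
            print("Implicit:", processor.extract_implicit_preferences(sample))
        else:
            print("Invalid input:", validation['error'])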