import re
from typing import Any, Dict, List, Optional, Tuple
class NaturalLanguageProcessor:
    """
    Natural language processing utility class.
    Handles text preprocessing and keyword extraction for user input.
    """

    def __init__(self):
        """Initialize the natural language processor"""
        self.stop_words = {
            'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for', 'from',
            'has', 'he', 'in', 'is', 'it', 'its', 'of', 'on', 'that', 'the',
            'to', 'was', 'will', 'with', 'would', 'i', 'me', 'my', 'we', 'us',
            'our', 'you', 'your', 'they', 'them', 'their'
        }
        # Breed name mappings (common aliases to standard names)
        self.breed_aliases = {
            'lab': 'labrador_retriever',
            'labrador': 'labrador_retriever',
            'golden': 'golden_retriever',
            'retriever': ['labrador_retriever', 'golden_retriever'],
            'german shepherd': 'german_shepherd',
            'shepherd': 'german_shepherd',
            'border collie': 'border_collie',
            'collie': ['border_collie', 'collie'],
            'bulldog': ['french_bulldog', 'english_bulldog'],
            'french bulldog': 'french_bulldog',
            'poodle': ['standard_poodle', 'miniature_poodle', 'toy_poodle'],
            'husky': 'siberian_husky',
            'siberian husky': 'siberian_husky',
            'beagle': 'beagle',
            'yorkshire terrier': 'yorkshire_terrier',
            'yorkie': 'yorkshire_terrier',
            'chihuahua': 'chihuahua',
            'dachshund': 'dachshund',
            'wiener dog': 'dachshund',
            'rottweiler': 'rottweiler',
            'rottie': 'rottweiler',
            'boxer': 'boxer',
            'great dane': 'great_dane',
            'dane': 'great_dane',
            'mastiff': ['bull_mastiff', 'tibetan_mastiff'],
            'pitbull': 'american_staffordshire_terrier',
            'pit bull': 'american_staffordshire_terrier',
            'shih tzu': 'shih-tzu',
            'maltese': 'maltese_dog',
            'pug': 'pug',
            'basset hound': 'basset',
            'bloodhound': 'bloodhound',
            'australian shepherd': 'kelpie',
            'aussie': 'kelpie'
        }
        # Lifestyle keyword mappings
        self.lifestyle_keywords = {
            'living_space': {
                'apartment': ['apartment', 'flat', 'condo', 'small space', 'city living', 'urban'],
                'house': ['house', 'home', 'yard', 'garden', 'suburban', 'large space'],
                'farm': ['farm', 'rural', 'country', 'acreage', 'ranch']
            },
            'activity_level': {
                'very_high': ['very active', 'extremely energetic', 'marathon runner', 'athlete'],
                'high': ['active', 'energetic', 'exercise', 'hiking', 'running', 'outdoor activities',
                         'sports', 'jogging', 'biking', 'adventure'],
                'moderate': ['moderate exercise', 'some activity', 'weekend walks', 'occasional exercise'],
                'low': ['calm', 'lazy', 'indoor', 'low energy', 'couch potato', 'sedentary', 'quiet lifestyle']
            },
            'family_situation': {
                'children': ['children', 'kids', 'toddlers', 'babies', 'family with children', 'young family'],
                'elderly': ['elderly', 'senior', 'old', 'retired', 'senior citizen'],
                'single': ['single', 'alone', 'individual', 'bachelor', 'solo'],
                'couple': ['couple', 'two people', 'pair', 'duo']
            },
            'noise_tolerance': {
                'low': ['quiet', 'silent', 'noise-sensitive', 'peaceful', 'no barking', 'minimal noise'],
                'moderate': ['some noise ok', 'moderate barking', 'normal noise'],
                'high': ['loud ok', 'barking fine', 'noise tolerant', 'doesn\'t mind noise']
            },
            'size_preference': {
                'small': ['small', 'tiny', 'little', 'compact', 'lap dog', 'petite', 'miniature'],
                'medium': ['medium', 'moderate size', 'average', 'mid-size'],
                'large': ['large', 'big', 'huge', 'giant', 'massive', 'substantial'],
                'varies': ['any size', 'size doesn\'t matter', 'flexible on size']
            },
            'experience_level': {
                'beginner': ['first time', 'beginner', 'new to dogs', 'inexperienced', 'never had'],
                'some': ['some experience', 'had dogs before', 'moderate experience'],
                'experienced': ['experienced', 'expert', 'very experienced', 'professional', 'trainer']
            },
            'grooming_commitment': {
                'low': ['low maintenance', 'easy care', 'minimal grooming', 'wash and go'],
                'moderate': ['moderate grooming', 'some brushing', 'regular care'],
                'high': ['high maintenance', 'lots of grooming', 'professional grooming', 'daily brushing']
            },
            'special_needs': {
                'guard': ['guard dog', 'protection', 'security', 'watchdog', 'guardian'],
                'therapy': ['therapy dog', 'emotional support', 'comfort', 'calm companion'],
                'hypoallergenic': ['hypoallergenic', 'allergies', 'non-shedding', 'allergy friendly'],
                'working': ['working dog', 'job', 'task', 'service dog'],
                'companion': ['companion', 'friend', 'buddy', 'lap dog', 'cuddle']
            }
        }
        # Comparative preference keywords
        self.preference_indicators = {
            'love': 1.0,
            'prefer': 0.9,
            'like': 0.8,
            'want': 0.8,
            'interested in': 0.7,
            'considering': 0.6,
            'ok with': 0.5,
            'don\'t mind': 0.4,
            'not interested': 0.2,
            'dislike': 0.1,
            'hate': 0.0
        }

        # Order keywords
        self.order_keywords = {
            'first': 1.0, 'most': 1.0, 'primary': 1.0, 'main': 1.0,
            'second': 0.8, 'then': 0.8, 'next': 0.8,
            'third': 0.6, 'also': 0.6, 'additionally': 0.6,
            'last': 0.4, 'least': 0.4, 'finally': 0.4
        }
    def preprocess_text(self, text: str) -> str:
        """
        Preprocess raw text

        Args:
            text: Raw text

        Returns:
            Preprocessed text
        """
        if not text:
            return ""

        # Convert to lowercase
        text = text.lower().strip()
        # Remove punctuation (keep hyphens and apostrophes)
        text = re.sub(r'[^\w\s\-\']', ' ', text)
        # Collapse extra whitespace, then strip any whitespace left
        # behind by the punctuation removal
        text = re.sub(r'\s+', ' ', text)
        return text.strip()
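    # Illustrative example of the preprocessing step above:
    #   preprocess_text("I LOVE Labs!!  They're great.")
    #   -> "i love labs they're great"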
    def extract_breed_mentions(self, text: str) -> List[Tuple[str, float]]:
        """
        Extract mentioned breeds and their preference levels from text

        Args:
            text: Input text

        Returns:
            List of (breed_name, preference_score) tuples
        """
        text = self.preprocess_text(text)
        breed_mentions = []

        try:
            # Check each breed alias
            for alias, standard_breed in self.breed_aliases.items():
                if alias in text:
                    # Find surrounding preference indicators
                    preference_score = self._find_preference_score(text, alias)
                    if isinstance(standard_breed, list):
                        # If the alias maps to multiple breeds, add all of them
                        for breed in standard_breed:
                            breed_mentions.append((breed, preference_score))
                    else:
                        breed_mentions.append((standard_breed, preference_score))

            # Deduplicate, keeping the highest score per breed
            breed_scores = {}
            for breed, score in breed_mentions:
                if breed in breed_scores:
                    breed_scores[breed] = max(breed_scores[breed], score)
                else:
                    breed_scores[breed] = score

            return list(breed_scores.items())
        except Exception as e:
            print(f"Error extracting breed mentions: {str(e)}")
            return []
    def _find_preference_score(self, text: str, breed_mention: str) -> float:
        """
        Find the preference score near a breed mention

        Args:
            text: Preprocessed text
            breed_mention: Breed alias found in the text

        Returns:
            Preference score (0.0-1.0)
        """
        try:
            # Locate the breed mention
            mention_pos = text.find(breed_mention)
            if mention_pos == -1:
                return 0.5  # Default neutral score

            # Examine the surrounding context (50 characters before and after)
            context_start = max(0, mention_pos - 50)
            context_end = min(len(text), mention_pos + len(breed_mention) + 50)
            context = text[context_start:context_end]

            # Take the strongest preference indicator in the context
            max_score = 0.5  # Default score
            for indicator, score in self.preference_indicators.items():
                if indicator in context:
                    max_score = max(max_score, score)

            # Scale by the strongest order keyword in the context, if any
            # (e.g. 'least' dampens the score, 'first' leaves it unchanged)
            order_multiplier = max(
                (multiplier for order_word, multiplier in self.order_keywords.items()
                 if order_word in context),
                default=1.0
            )
            return max_score * order_multiplier
        except Exception as e:
            print(f"Error finding preference score: {str(e)}")
            return 0.5
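    # Illustrative example, using the alias and indicator tables above:
    # for the input "I really love labradors", extract_breed_mentions() matches
    # the aliases 'lab' and 'labrador', finds 'love' in the surrounding context,
    # and returns [('labrador_retriever', 1.0)].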
    def extract_lifestyle_preferences(self, text: str) -> Dict[str, Dict[str, float]]:
        """
        Extract lifestyle preferences from text

        Args:
            text: Input text

        Returns:
            Lifestyle preferences dictionary
        """
        text = self.preprocess_text(text)
        preferences = {}

        try:
            for category, keywords_dict in self.lifestyle_keywords.items():
                preferences[category] = {}
                for preference_type, keywords in keywords_dict.items():
                    score = 0.0
                    count = 0
                    for keyword in keywords:
                        if keyword in text:
                            # Count keyword occurrences as an intensity signal
                            keyword_count = text.count(keyword)
                            score += keyword_count
                            count += keyword_count
                    if count > 0:
                        # Normalize the score to the 0.0-1.0 range
                        preferences[category][preference_type] = min(score / max(count, 1), 1.0)

            return preferences
        except Exception as e:
            print(f"Error extracting lifestyle preferences: {str(e)}")
            return {}
    def generate_search_keywords(self, text: str) -> List[str]:
        """
        Generate a keyword list for search

        Args:
            text: Input text

        Returns:
            List of keywords
        """
        text = self.preprocess_text(text)
        keywords = []

        try:
            # Tokenize and filter out stop words
            words = text.split()
            for word in words:
                if len(word) > 2 and word not in self.stop_words:
                    keywords.append(word)

            # Extract important phrases
            phrases = self._extract_phrases(text)
            keywords.extend(phrases)

            # Remove duplicates
            keywords = list(set(keywords))
            return keywords
        except Exception as e:
            print(f"Error generating search keywords: {str(e)}")
            return []
    def _extract_phrases(self, text: str) -> List[str]:
        """
        Extract important phrases

        Args:
            text: Input text

        Returns:
            List of phrases
        """
        phrases = []

        # Define important phrase patterns
        phrase_patterns = [
            r'good with \w+',
            r'apartment \w+',
            r'family \w+',
            r'exercise \w+',
            r'grooming \w+',
            r'noise \w+',
            r'training \w+',
            r'health \w+',
            r'\w+ friendly',
            r'\w+ tolerant',
            r'\w+ maintenance',
            r'\w+ energy',
            r'\w+ barking',
            r'\w+ shedding'
        ]

        for pattern in phrase_patterns:
            matches = re.findall(pattern, text)
            phrases.extend(matches)

        return phrases
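    # Illustrative example: for the preprocessed text
    # "need a kid friendly low maintenance dog good with children",
    # _extract_phrases() returns
    # ['good with children', 'kid friendly', 'low maintenance'].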
    def analyze_sentiment(self, text: str) -> Dict[str, float]:
        """
        Analyze text sentiment

        Args:
            text: Input text

        Returns:
            Sentiment analysis results
            {'positive': 0.0-1.0, 'negative': 0.0-1.0, 'neutral': 0.0-1.0}
        """
        text = self.preprocess_text(text)

        positive_words = [
            'love', 'like', 'want', 'prefer', 'good', 'great', 'excellent',
            'perfect', 'ideal', 'wonderful', 'amazing', 'fantastic'
        ]
        negative_words = [
            'hate', 'dislike', 'bad', 'terrible', 'awful', 'horrible',
            'not good', 'don\'t want', 'avoid', 'against', 'problem'
        ]

        # Match whole words/phrases so that, for example, 'like' does not
        # also count the 'like' inside 'dislike'
        positive_count = sum(
            1 for word in positive_words
            if re.search(r'\b' + re.escape(word) + r'\b', text)
        )
        negative_count = sum(
            1 for word in negative_words
            if re.search(r'\b' + re.escape(word) + r'\b', text)
        )
        total_words = len(text.split())

        if total_words == 0:
            return {'positive': 0.0, 'negative': 0.0, 'neutral': 1.0}

        positive_ratio = positive_count / total_words
        negative_ratio = negative_count / total_words
        neutral_ratio = 1.0 - positive_ratio - negative_ratio

        return {
            'positive': positive_ratio,
            'negative': negative_ratio,
            'neutral': max(0.0, neutral_ratio)
        }
    def extract_implicit_preferences(self, text: str) -> Dict[str, Any]:
        """
        Extract implicit preferences from text

        Args:
            text: Input text

        Returns:
            Dictionary of implicit preferences
        """
        text = self.preprocess_text(text)
        implicit_prefs = {}

        try:
            # Infer preferences from mentioned activities
            if any(activity in text for activity in ['hiking', 'running', 'jogging', 'outdoor']):
                implicit_prefs['exercise_needs'] = 'high'
                implicit_prefs['size_preference'] = 'medium_to_large'

            # Infer from living environment
            if any(env in text for env in ['apartment', 'small space', 'city']):
                implicit_prefs['size_preference'] = 'small_to_medium'
                implicit_prefs['noise_tolerance'] = 'low'
                implicit_prefs['exercise_needs'] = 'moderate'

            # Infer from family situation
            if 'children' in text or 'kids' in text:
                implicit_prefs['temperament'] = 'gentle_patient'
                implicit_prefs['good_with_children'] = True

            # Infer from experience level
            if any(exp in text for exp in ['first time', 'beginner', 'new to']):
                implicit_prefs['care_level'] = 'low_to_moderate'
                implicit_prefs['training_difficulty'] = 'easy'

            # Infer from time commitment
            if any(time in text for time in ['busy', 'no time', 'low maintenance']):
                implicit_prefs['grooming_needs'] = 'low'
                implicit_prefs['care_level'] = 'low'
                implicit_prefs['exercise_needs'] = 'low_to_moderate'

            return implicit_prefs
        except Exception as e:
            print(f"Error extracting implicit preferences: {str(e)}")
            return {}
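    # Illustrative example: for "First time owner, living in an apartment with kids",
    # extract_implicit_preferences() infers, among other things, size_preference
    # 'small_to_medium', noise_tolerance 'low', good_with_children True and
    # training_difficulty 'easy'.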
    def validate_input(self, text: str) -> Dict[str, Any]:
        """
        Validate the input text

        Args:
            text: Input text

        Returns:
            Validation results dictionary
        """
        if not text or not text.strip():
            return {
                'is_valid': False,
                'error': 'Empty input',
                'suggestions': ['Please provide a description of your preferences']
            }

        text = text.strip()

        # Check length
        if len(text) < 10:
            return {
                'is_valid': False,
                'error': 'Input too short',
                'suggestions': ['Please provide more details about your preferences']
            }

        if len(text) > 1000:
            return {
                'is_valid': False,
                'error': 'Input too long',
                'suggestions': ['Please provide a more concise description']
            }

        # Check for meaningful content
        processed_text = self.preprocess_text(text)
        meaningful_words = [word for word in processed_text.split()
                            if len(word) > 2 and word not in self.stop_words]

        if len(meaningful_words) < 3:
            return {
                'is_valid': False,
                'error': 'Not enough meaningful content',
                'suggestions': ['Please provide more specific details about your lifestyle and preferences']
            }

        return {
            'is_valid': True,
            'word_count': len(meaningful_words),
            'suggestions': []
        }

def get_nlp_processor() -> Optional[NaturalLanguageProcessor]:
    """Get a natural language processor instance, or None on failure"""
    try:
        return NaturalLanguageProcessor()
    except Exception as e:
        print(f"Error creating NLP processor: {str(e)}")
        return None
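
# Minimal usage sketch (illustrative): run this module directly to exercise
# the main entry points on a sample request.
if __name__ == "__main__":
    processor = get_nlp_processor()
    if processor is not None:
        sample = ("We live in an apartment with two kids, are first time owners, "
                  "and would love a labrador or maybe a beagle.")
        check = processor.validate_input(sample)
        if check['is_valid']:
            print("Breeds:", processor.extract_breed_mentions(sample))
            print("Lifestyle:", processor.extract_lifestyle_preferences(sample))
            print("Implicit:", processor.extract_implicit_preferences(sample))
            print("Keywords:", processor.generate_search_keywords(sample))
            print("Sentiment:", processor.analyze_sentiment(sample))
        else:
            print("Invalid input:", check['error'])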