# PawMatchAI / natural_language_processor.py
import re
from typing import Dict, List, Tuple, Optional, Any
class NaturalLanguageProcessor:
"""
Natural language processing utility class
    Handles text preprocessing, keyword and preference extraction, and lightweight sentiment analysis for user input
"""
def __init__(self):
"""Initialize the natural language processor"""
self.stop_words = {
'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for', 'from',
'has', 'he', 'in', 'is', 'it', 'its', 'of', 'on', 'that', 'the',
'to', 'was', 'will', 'with', 'would', 'i', 'me', 'my', 'we', 'us',
'our', 'you', 'your', 'they', 'them', 'their'
}
# Breed name mappings (common aliases to standard names)
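        # (a few aliases presumably map to the nearest label available to the breed
        #  classifier, e.g. 'aussie' -> 'kelpie', 'pitbull' -> 'american_staffordshire_terrier')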
self.breed_aliases = {
'lab': 'labrador_retriever',
'labrador': 'labrador_retriever',
'golden': 'golden_retriever',
'retriever': ['labrador_retriever', 'golden_retriever'],
'german shepherd': 'german_shepherd',
'shepherd': 'german_shepherd',
'border collie': 'border_collie',
'collie': ['border_collie', 'collie'],
'bulldog': ['french_bulldog', 'english_bulldog'],
'french bulldog': 'french_bulldog',
'poodle': ['standard_poodle', 'miniature_poodle', 'toy_poodle'],
'husky': 'siberian_husky',
'siberian husky': 'siberian_husky',
'beagle': 'beagle',
'yorkshire terrier': 'yorkshire_terrier',
'yorkie': 'yorkshire_terrier',
'chihuahua': 'chihuahua',
'dachshund': 'dachshund',
'wiener dog': 'dachshund',
'rottweiler': 'rottweiler',
'rottie': 'rottweiler',
'boxer': 'boxer',
'great dane': 'great_dane',
'dane': 'great_dane',
'mastiff': ['bull_mastiff', 'tibetan_mastiff'],
'pitbull': 'american_staffordshire_terrier',
'pit bull': 'american_staffordshire_terrier',
'shih tzu': 'shih-tzu',
'maltese': 'maltese_dog',
'pug': 'pug',
'basset hound': 'basset',
'bloodhound': 'bloodhound',
'australian shepherd': 'kelpie',
'aussie': 'kelpie'
}
# Lifestyle keyword mappings
self.lifestyle_keywords = {
'living_space': {
'apartment': ['apartment', 'flat', 'condo', 'small space', 'city living', 'urban'],
'house': ['house', 'home', 'yard', 'garden', 'suburban', 'large space'],
'farm': ['farm', 'rural', 'country', 'acreage', 'ranch']
},
'activity_level': {
'very_high': ['very active', 'extremely energetic', 'marathon runner', 'athlete'],
'high': ['active', 'energetic', 'exercise', 'hiking', 'running', 'outdoor activities',
'sports', 'jogging', 'biking', 'adventure'],
'moderate': ['moderate exercise', 'some activity', 'weekend walks', 'occasional exercise'],
'low': ['calm', 'lazy', 'indoor', 'low energy', 'couch potato', 'sedentary', 'quiet lifestyle']
},
'family_situation': {
'children': ['children', 'kids', 'toddlers', 'babies', 'family with children', 'young family'],
'elderly': ['elderly', 'senior', 'old', 'retired', 'senior citizen'],
'single': ['single', 'alone', 'individual', 'bachelor', 'solo'],
'couple': ['couple', 'two people', 'pair', 'duo']
},
'noise_tolerance': {
'low': ['quiet', 'silent', 'noise-sensitive', 'peaceful', 'no barking', 'minimal noise'],
'moderate': ['some noise ok', 'moderate barking', 'normal noise'],
'high': ['loud ok', 'barking fine', 'noise tolerant', 'doesn\'t mind noise']
},
'size_preference': {
'small': ['small', 'tiny', 'little', 'compact', 'lap dog', 'petite', 'miniature'],
'medium': ['medium', 'moderate size', 'average', 'mid-size'],
'large': ['large', 'big', 'huge', 'giant', 'massive', 'substantial'],
'varies': ['any size', 'size doesn\'t matter', 'flexible on size']
},
'experience_level': {
'beginner': ['first time', 'beginner', 'new to dogs', 'inexperienced', 'never had'],
'some': ['some experience', 'had dogs before', 'moderate experience'],
'experienced': ['experienced', 'expert', 'very experienced', 'professional', 'trainer']
},
'grooming_commitment': {
'low': ['low maintenance', 'easy care', 'minimal grooming', 'wash and go'],
'moderate': ['moderate grooming', 'some brushing', 'regular care'],
'high': ['high maintenance', 'lots of grooming', 'professional grooming', 'daily brushing']
},
'special_needs': {
'guard': ['guard dog', 'protection', 'security', 'watchdog', 'guardian'],
'therapy': ['therapy dog', 'emotional support', 'comfort', 'calm companion'],
'hypoallergenic': ['hypoallergenic', 'allergies', 'non-shedding', 'allergy friendly'],
'working': ['working dog', 'job', 'task', 'service dog'],
'companion': ['companion', 'friend', 'buddy', 'lap dog', 'cuddle']
}
}
# Comparative preference keywords
self.preference_indicators = {
'love': 1.0,
'prefer': 0.9,
'like': 0.8,
'want': 0.8,
'interested in': 0.7,
'considering': 0.6,
'ok with': 0.5,
'don\'t mind': 0.4,
'not interested': 0.2,
'dislike': 0.1,
'hate': 0.0
}
# Order keywords
self.order_keywords = {
'first': 1.0, 'most': 1.0, 'primary': 1.0, 'main': 1.0,
'second': 0.8, 'then': 0.8, 'next': 0.8,
'third': 0.6, 'also': 0.6, 'additionally': 0.6,
'last': 0.4, 'least': 0.4, 'finally': 0.4
}
def preprocess_text(self, text: str) -> str:
"""
Text preprocessing
Args:
text: Raw text
Returns:
Preprocessed text
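        Example:
            "Looking for a SMALL, quiet dog" -> "looking for a small quiet dog"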
"""
if not text:
return ""
# Convert to lowercase
text = text.lower().strip()
# Remove punctuation (keep some meaningful ones)
text = re.sub(r'[^\w\s\-\']', ' ', text)
# Handle extra whitespace
text = re.sub(r'\s+', ' ', text)
return text
def extract_breed_mentions(self, text: str) -> List[Tuple[str, float]]:
"""
Extract mentioned breeds and their preference levels from text
Args:
text: Input text
Returns:
List of (breed_name, preference_score) tuples
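        Example:
            "I really want a beagle" -> [('beagle', 0.8)]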
"""
text = self.preprocess_text(text)
breed_mentions = []
try:
            # Check each breed alias (whole-word match so 'lab' does not hit 'available')
            for alias, standard_breed in self.breed_aliases.items():
                if re.search(r'\b' + re.escape(alias) + r'\b', text):
# Find surrounding preference indicators
preference_score = self._find_preference_score(text, alias)
if isinstance(standard_breed, list):
# If alias maps to multiple breeds, add all
for breed in standard_breed:
breed_mentions.append((breed, preference_score))
else:
breed_mentions.append((standard_breed, preference_score))
# Deduplicate and merge scores
breed_scores = {}
for breed, score in breed_mentions:
if breed in breed_scores:
breed_scores[breed] = max(breed_scores[breed], score)
else:
breed_scores[breed] = score
return list(breed_scores.items())
except Exception as e:
print(f"Error extracting breed mentions: {str(e)}")
return []
def _find_preference_score(self, text: str, breed_mention: str) -> float:
"""
Find preference score near breed mention
Args:
text: Text
breed_mention: Breed mention
Returns:
Preference score (0.0-1.0)
"""
try:
# Find breed mention position
mention_pos = text.find(breed_mention)
if mention_pos == -1:
return 0.5 # Default neutral score
# Check context (50 characters before and after)
context_start = max(0, mention_pos - 50)
context_end = min(len(text), mention_pos + len(breed_mention) + 50)
context = text[context_start:context_end]
# Find preference indicators
max_score = 0.5 # Default score
for indicator, score in self.preference_indicators.items():
if indicator in context:
max_score = max(max_score, score)
            # Weight by order keywords ('first' keeps the full score, 'last' scales it down)
            for order_word, multiplier in self.order_keywords.items():
                if order_word in context:
                    max_score *= multiplier
                    break
return max_score
except Exception as e:
print(f"Error finding preference score: {str(e)}")
return 0.5
def extract_lifestyle_preferences(self, text: str) -> Dict[str, Dict[str, float]]:
"""
Extract lifestyle preferences from text
Args:
text: Input text
Returns:
Lifestyle preferences dictionary
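        Example:
            "apartment with kids" -> {'living_space': {'apartment': 1.0},
            'family_situation': {'children': 1.0}, ...} (unmatched categories stay empty)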
"""
text = self.preprocess_text(text)
preferences = {}
try:
            for category, keywords_dict in self.lifestyle_keywords.items():
                preferences[category] = {}
                for preference_type, keywords in keywords_dict.items():
                    # Count whole-word occurrences of this group's keywords
                    # (so 'old' does not match inside 'household')
                    occurrences = sum(
                        len(re.findall(r'\b' + re.escape(keyword) + r'\b', text))
                        for keyword in keywords
                    )
                    if occurrences > 0:
                        # A single mention already signals the preference; the score is capped at 1.0
                        preferences[category][preference_type] = min(float(occurrences), 1.0)
return preferences
except Exception as e:
print(f"Error extracting lifestyle preferences: {str(e)}")
return {}
def generate_search_keywords(self, text: str) -> List[str]:
"""
Generate keyword list for search
Args:
text: Input text
Returns:
List of keywords
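        Example:
            "quiet apartment dog" -> ['quiet', 'apartment', 'dog', 'apartment dog'] (order may vary)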
"""
text = self.preprocess_text(text)
keywords = []
try:
# Tokenize and filter stop words
words = text.split()
for word in words:
if len(word) > 2 and word not in self.stop_words:
keywords.append(word)
# Extract important phrases
phrases = self._extract_phrases(text)
keywords.extend(phrases)
# Remove duplicates
keywords = list(set(keywords))
return keywords
except Exception as e:
print(f"Error generating search keywords: {str(e)}")
return []
def _extract_phrases(self, text: str) -> List[str]:
"""
Extract important phrases
Args:
text: Input text
Returns:
List of phrases
"""
phrases = []
# Define important phrase patterns
phrase_patterns = [
r'good with \w+',
r'apartment \w+',
r'family \w+',
r'exercise \w+',
r'grooming \w+',
r'noise \w+',
r'training \w+',
r'health \w+',
r'\w+ friendly',
r'\w+ tolerant',
r'\w+ maintenance',
r'\w+ energy',
r'\w+ barking',
r'\w+ shedding'
]
for pattern in phrase_patterns:
matches = re.findall(pattern, text)
phrases.extend(matches)
return phrases
def analyze_sentiment(self, text: str) -> Dict[str, float]:
"""
Analyze text sentiment
Args:
text: Input text
Returns:
Sentiment analysis results {'positive': 0.0-1.0, 'negative': 0.0-1.0, 'neutral': 0.0-1.0}
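        Example:
            "I love calm dogs" -> {'positive': 0.25, 'negative': 0.0, 'neutral': 0.75}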
"""
text = self.preprocess_text(text)
positive_words = [
'love', 'like', 'want', 'prefer', 'good', 'great', 'excellent',
'perfect', 'ideal', 'wonderful', 'amazing', 'fantastic'
]
negative_words = [
'hate', 'dislike', 'bad', 'terrible', 'awful', 'horrible',
'not good', 'don\'t want', 'avoid', 'against', 'problem'
]
        # Count whole-word/phrase matches so 'like' inside 'dislike' is not counted as positive
        positive_count = sum(1 for word in positive_words if re.search(r'\b' + re.escape(word) + r'\b', text))
        negative_count = sum(1 for word in negative_words if re.search(r'\b' + re.escape(word) + r'\b', text))
total_words = len(text.split())
if total_words == 0:
return {'positive': 0.0, 'negative': 0.0, 'neutral': 1.0}
positive_ratio = positive_count / total_words
negative_ratio = negative_count / total_words
neutral_ratio = 1.0 - positive_ratio - negative_ratio
return {
'positive': positive_ratio,
'negative': negative_ratio,
'neutral': max(0.0, neutral_ratio)
}
def extract_implicit_preferences(self, text: str) -> Dict[str, Any]:
"""
Extract implicit preferences from text
Args:
text: Input text
Returns:
Dictionary of implicit preferences
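        Example:
            "first time owner in an apartment" -> {'size_preference': 'small_to_medium',
            'noise_tolerance': 'low', 'exercise_needs': 'moderate',
            'care_level': 'low_to_moderate', 'training_difficulty': 'easy'}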
"""
text = self.preprocess_text(text)
implicit_prefs = {}
try:
# Infer preferences from mentioned activities
if any(activity in text for activity in ['hiking', 'running', 'jogging', 'outdoor']):
implicit_prefs['exercise_needs'] = 'high'
implicit_prefs['size_preference'] = 'medium_to_large'
# Infer from living environment
if any(env in text for env in ['apartment', 'small space', 'city']):
implicit_prefs['size_preference'] = 'small_to_medium'
implicit_prefs['noise_tolerance'] = 'low'
implicit_prefs['exercise_needs'] = 'moderate'
# Infer from family situation
if 'children' in text or 'kids' in text:
implicit_prefs['temperament'] = 'gentle_patient'
implicit_prefs['good_with_children'] = True
# Infer from experience level
if any(exp in text for exp in ['first time', 'beginner', 'new to']):
implicit_prefs['care_level'] = 'low_to_moderate'
implicit_prefs['training_difficulty'] = 'easy'
# Infer from time commitment
if any(time in text for time in ['busy', 'no time', 'low maintenance']):
implicit_prefs['grooming_needs'] = 'low'
implicit_prefs['care_level'] = 'low'
implicit_prefs['exercise_needs'] = 'low_to_moderate'
return implicit_prefs
except Exception as e:
print(f"Error extracting implicit preferences: {str(e)}")
return {}
def validate_input(self, text: str) -> Dict[str, Any]:
"""
        Validate the input text
Args:
text: Input text
Returns:
Validation results dictionary
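        Example:
            "nice dog" -> {'is_valid': False, 'error': 'Input too short', ...}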
"""
if not text or not text.strip():
return {
'is_valid': False,
'error': 'Empty input',
'suggestions': ['Please provide a description of your preferences']
}
text = text.strip()
# Check length
if len(text) < 10:
return {
'is_valid': False,
'error': 'Input too short',
'suggestions': ['Please provide more details about your preferences']
}
if len(text) > 1000:
return {
'is_valid': False,
'error': 'Input too long',
'suggestions': ['Please provide a more concise description']
}
# Check for meaningful content
processed_text = self.preprocess_text(text)
meaningful_words = [word for word in processed_text.split()
if len(word) > 2 and word not in self.stop_words]
if len(meaningful_words) < 3:
return {
'is_valid': False,
'error': 'Not enough meaningful content',
'suggestions': ['Please provide more specific details about your lifestyle and preferences']
}
return {
'is_valid': True,
'word_count': len(meaningful_words),
'suggestions': []
}
def get_nlp_processor() -> Optional[NaturalLanguageProcessor]:
    """Get a natural language processor instance, or None if creation fails"""
try:
return NaturalLanguageProcessor()
except Exception as e:
print(f"Error creating NLP processor: {str(e)}")
return None
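

# The block below is an illustrative usage sketch rather than part of the original module:
# it simply exercises the public methods above on a sample description and assumes the
# file is run directly (python natural_language_processor.py).
if __name__ == "__main__":
    processor = get_nlp_processor()
    if processor is not None:
        sample = ("We live in a small apartment with two kids and would love a quiet, "
                  "low maintenance dog. We like a labrador but a poodle would be fine too.")
        validation = processor.validate_input(sample)
        if validation['is_valid']:
            print("Breeds:", processor.extract_breed_mentions(sample))
            print("Lifestyle:", processor.extract_lifestyle_preferences(sample))
            print("Keywords:", processor.generate_search_keywords(sample))
            print("Sentiment:", processor.analyze_sentiment(sample))
            print("Implicit:", processor.extract_implicit_preferences(sample))
        else:
            print("Invalid input:", validation['error'])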