# Source: Hugging Face Spaces - DawnC/PawMatchAI (Running on Zero)
# File size: 17,564 Bytes - revision 1e4c9bc

import re
import string
from typing import Dict, List, Tuple, Optional, Any
import traceback

class NaturalLanguageProcessor:
    """
    Natural language processing utility class
    Handles text preprocessing and keyword extraction for user input

    Keyword lookups are anchored at word starts (so 'lab' does not fire on
    "available" and 'like' does not fire on "dislike"), and when several
    vocabulary terms overlap in the text the longest one wins (so
    "french bulldog" is not also reported as a generic "bulldog").
    """

    # Phrase templates collected by _extract_phrases, compiled once.
    _PHRASE_PATTERNS = tuple(re.compile(pattern) for pattern in (
        r'\bgood with \w+',
        r'\bapartment \w+',
        r'\bfamily \w+',
        r'\bexercise \w+',
        r'\bgrooming \w+',
        r'\bnoise \w+',
        r'\btraining \w+',
        r'\bhealth \w+',
        r'\w+ friendly\b',
        r'\w+ tolerant\b',
        r'\w+ maintenance\b',
        r'\w+ energy\b',
        r'\w+ barking\b',
        r'\w+ shedding\b',
    ))

    # Sentiment lexicons used by analyze_sentiment.
    _POSITIVE_WORDS = (
        'love', 'like', 'want', 'prefer', 'good', 'great', 'excellent',
        'perfect', 'ideal', 'wonderful', 'amazing', 'fantastic',
    )
    _NEGATIVE_WORDS = (
        'hate', 'dislike', 'bad', 'terrible', 'awful', 'horrible',
        'not good', 'don\'t want', 'avoid', 'against', 'problem',
    )

    def __init__(self):
        """Initialize the natural language processor"""
        self.stop_words = {
            'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for', 'from',
            'has', 'he', 'in', 'is', 'it', 'its', 'of', 'on', 'that', 'the',
            'to', 'was', 'will', 'with', 'would', 'i', 'me', 'my', 'we', 'us',
            'our', 'you', 'your', 'they', 'them', 'their'
        }

        # Breed name mappings (common aliases to standard names).
        # A list value means the alias is ambiguous and maps to every listed breed.
        # Multi-word aliases take precedence over the single words they contain.
        self.breed_aliases = {
            'lab': 'labrador_retriever',
            'labrador': 'labrador_retriever',
            'labrador retriever': 'labrador_retriever',
            'golden': 'golden_retriever',
            'golden retriever': 'golden_retriever',
            'retriever': ['labrador_retriever', 'golden_retriever'],
            'german shepherd': 'german_shepherd',
            'shepherd': 'german_shepherd',
            'border collie': 'border_collie',
            'collie': ['border_collie', 'collie'],
            'bulldog': ['french_bulldog', 'english_bulldog'],
            'french bulldog': 'french_bulldog',
            'english bulldog': 'english_bulldog',
            'poodle': ['standard_poodle', 'miniature_poodle', 'toy_poodle'],
            'standard poodle': 'standard_poodle',
            'miniature poodle': 'miniature_poodle',
            'toy poodle': 'toy_poodle',
            'husky': 'siberian_husky',
            'siberian husky': 'siberian_husky',
            'beagle': 'beagle',
            'yorkshire terrier': 'yorkshire_terrier',
            'yorkie': 'yorkshire_terrier',
            'chihuahua': 'chihuahua',
            'dachshund': 'dachshund',
            'wiener dog': 'dachshund',
            'rottweiler': 'rottweiler',
            'rottie': 'rottweiler',
            'boxer': 'boxer',
            'great dane': 'great_dane',
            'dane': 'great_dane',
            'mastiff': ['bull_mastiff', 'tibetan_mastiff'],
            'bull mastiff': 'bull_mastiff',
            'tibetan mastiff': 'tibetan_mastiff',
            'pitbull': 'american_staffordshire_terrier',
            'pit bull': 'american_staffordshire_terrier',
            'shih tzu': 'shih-tzu',
            'maltese': 'maltese_dog',
            'pug': 'pug',
            'basset hound': 'basset',
            'bloodhound': 'bloodhound',
            # NOTE(review): maps to 'kelpie', presumably because the downstream
            # breed label set has no Australian Shepherd entry - confirm.
            'australian shepherd': 'kelpie',
            'aussie': 'kelpie'
        }

        # Lifestyle keyword mappings: category -> preference type -> trigger keywords
        self.lifestyle_keywords = {
            'living_space': {
                'apartment': ['apartment', 'flat', 'condo', 'small space', 'city living', 'urban'],
                'house': ['house', 'home', 'yard', 'garden', 'suburban', 'large space'],
                'farm': ['farm', 'rural', 'country', 'acreage', 'ranch']
            },
            'activity_level': {
                'very_high': ['very active', 'extremely energetic', 'marathon runner', 'athlete'],
                'high': ['active', 'energetic', 'exercise', 'hiking', 'running', 'outdoor activities',
                        'sports', 'jogging', 'biking', 'adventure'],
                'moderate': ['moderate exercise', 'some activity', 'weekend walks', 'occasional exercise'],
                'low': ['calm', 'lazy', 'indoor', 'low energy', 'couch potato', 'sedentary', 'quiet lifestyle']
            },
            'family_situation': {
                'children': ['children', 'kids', 'toddlers', 'babies', 'family with children', 'young family'],
                'elderly': ['elderly', 'senior', 'old', 'retired', 'senior citizen'],
                'single': ['single', 'alone', 'individual', 'bachelor', 'solo'],
                'couple': ['couple', 'two people', 'pair', 'duo']
            },
            'noise_tolerance': {
                'low': ['quiet', 'silent', 'noise-sensitive', 'peaceful', 'no barking', 'minimal noise'],
                'moderate': ['some noise ok', 'moderate barking', 'normal noise'],
                'high': ['loud ok', 'barking fine', 'noise tolerant', 'doesn\'t mind noise']
            },
            'size_preference': {
                'small': ['small', 'tiny', 'little', 'compact', 'lap dog', 'petite', 'miniature'],
                'medium': ['medium', 'moderate size', 'average', 'mid-size'],
                'large': ['large', 'big', 'huge', 'giant', 'massive', 'substantial'],
                'varies': ['any size', 'size doesn\'t matter', 'flexible on size']
            },
            'experience_level': {
                'beginner': ['first time', 'beginner', 'new to dogs', 'inexperienced', 'never had'],
                'some': ['some experience', 'had dogs before', 'moderate experience'],
                'experienced': ['experienced', 'expert', 'very experienced', 'professional', 'trainer']
            },
            'grooming_commitment': {
                'low': ['low maintenance', 'easy care', 'minimal grooming', 'wash and go'],
                'moderate': ['moderate grooming', 'some brushing', 'regular care'],
                'high': ['high maintenance', 'lots of grooming', 'professional grooming', 'daily brushing']
            },
            'special_needs': {
                'guard': ['guard dog', 'protection', 'security', 'watchdog', 'guardian'],
                'therapy': ['therapy dog', 'emotional support', 'comfort', 'calm companion'],
                'hypoallergenic': ['hypoallergenic', 'allergies', 'non-shedding', 'allergy friendly'],
                'working': ['working dog', 'job', 'task', 'service dog'],
                'companion': ['companion', 'friend', 'buddy', 'lap dog', 'cuddle']
            }
        }

        # Comparative preference keywords (indicator phrase -> preference score)
        self.preference_indicators = {
            'love': 1.0,
            'prefer': 0.9,
            'like': 0.8,
            'want': 0.8,
            'interested in': 0.7,
            'considering': 0.6,
            'ok with': 0.5,
            'don\'t mind': 0.4,
            'not interested': 0.2,
            'dislike': 0.1,
            'hate': 0.0
        }

        # Order keywords (ordinal word -> weight)
        self.order_keywords = {
            'first': 1.0, 'most': 1.0, 'primary': 1.0, 'main': 1.0,
            'second': 0.8, 'then': 0.8, 'next': 0.8,
            'third': 0.6, 'also': 0.6, 'additionally': 0.6,
            'last': 0.4, 'least': 0.4, 'finally': 0.4
        }

    @staticmethod
    def _find_terms(text: str, terms, whole_word: bool = False) -> List[Tuple[int, int, str]]:
        """
        Locate non-overlapping occurrences of vocabulary terms in text

        Terms are tried longest first; a match that overlaps an already
        accepted (longer) match is discarded.

        Args:
            text: Preprocessed text to search
            terms: Iterable of term strings (a dict iterates its keys)
            whole_word: If True the term must be a complete word, optionally
                followed by a plural 's'/'es'. If False the term only has to
                begin at a word start, so inflections such as "smaller" or
                "loved" still match 'small' / 'love'.

        Returns:
            List of (start, end, term) tuples sorted by position
        """
        found: List[Tuple[int, int, str]] = []

        for term in sorted(terms, key=len, reverse=True):
            if not term:
                continue  # an empty term would match at every position

            # Look-arounds instead of \b so terms containing ' or - work too
            pattern = r'(?<!\w)' + re.escape(term)
            if whole_word:
                pattern += r'(?:e?s)?(?!\w)'

            for match in re.finditer(pattern, text):
                start, end = match.span()
                if all(end <= s or start >= e for s, e, _ in found):
                    found.append((start, end, term))

        found.sort()
        return found

    def preprocess_text(self, text: str) -> str:
        """
        Text preprocessing

        Lowercases the text, converts typographic apostrophes to ASCII,
        replaces punctuation other than hyphens and apostrophes with spaces,
        and collapses whitespace.

        Args:
            text: Raw text

        Returns:
            Preprocessed text without leading or trailing whitespace
        """
        if not text:
            return ""

        text = text.lower()

        # Curly apostrophes would otherwise be turned into spaces below,
        # splitting contractions such as "don't"
        text = text.replace('\u2019', "'").replace('\u2018', "'")

        # Remove punctuation (keep hyphens and apostrophes)
        text = re.sub(r'[^\w\s\-\']', ' ', text)

        # Collapse whitespace; strip last because replaced trailing
        # punctuation leaves a space behind
        return re.sub(r'\s+', ' ', text).strip()

    def extract_breed_mentions(self, text: str) -> List[Tuple[str, float]]:
        """
        Extract mentioned breeds and their preference levels from text

        Aliases must appear as whole words (plural allowed). When a
        multi-word alias matches, the shorter aliases inside it are ignored.
        If a breed is reached through several mentions, its highest score
        is kept.

        Args:
            text: Input text

        Returns:
            List of (breed_name, preference_score) tuples, ordered by first
            mention in the text; empty list on error
        """
        text = self.preprocess_text(text)

        try:
            breed_scores: Dict[str, float] = {}

            for start, end, alias in self._find_terms(text, self.breed_aliases, whole_word=True):
                score = self._score_near(text, start, end)

                targets = self.breed_aliases[alias]
                if not isinstance(targets, list):
                    targets = [targets]

                # Ambiguous aliases contribute the same score to every breed
                for breed in targets:
                    breed_scores[breed] = max(breed_scores.get(breed, score), score)

            return list(breed_scores.items())

        except Exception as e:
            print(f"Error extracting breed mentions: {str(e)}")
            return []

    def _score_near(self, text: str, start: int, end: int, window: int = 50) -> float:
        """
        Score the preference expressed around a mention at text[start:end]

        Args:
            text: Preprocessed text
            start: Start index of the mention
            end: End index of the mention
            window: Number of characters inspected on each side

        Returns:
            Score of the preference indicator closest to the mention, or
            0.5 (neutral) when none lies inside the window
        """
        lo = max(0, start - window)
        hi = min(len(text), end + window)

        # Search the full text and filter by position so that a window edge
        # falling inside a word cannot create a fake word start
        nearest_gap = None
        score = 0.5
        for s, e, indicator in self._find_terms(text, self.preference_indicators):
            if s < lo or e > hi:
                continue
            if e <= start:
                gap = start - e
            elif s >= end:
                gap = s - end
            else:
                gap = 0
            if nearest_gap is None or gap < nearest_gap:
                nearest_gap = gap
                score = self.preference_indicators[indicator]

        # NOTE(review): every weight in order_keywords is <= 1.0, so this
        # max() never changes the score. The original behaved the same way;
        # kept as-is because the intended effect is unclear - confirm.
        for s, e, order_word in self._find_terms(text, self.order_keywords):
            if lo <= s and e <= hi:
                score = max(score, score * self.order_keywords[order_word])

        return score

    def _find_preference_score(self, text: str, breed_mention: str) -> float:
        """
        Find preference score near breed mention

        Uses the first whole-word occurrence of the mention, falling back
        to the first plain substring occurrence.

        Args:
            text: Text
            breed_mention: Breed mention

        Returns:
            Preference score (0.0-1.0); 0.5 when the mention is absent or
            an error occurs
        """
        try:
            hits = self._find_terms(text, [breed_mention], whole_word=True)
            if hits:
                start, end, _ = hits[0]
            else:
                start = text.find(breed_mention)
                if start == -1:
                    return 0.5  # Default neutral score
                end = start + len(breed_mention)

            return self._score_near(text, start, end)

        except Exception as e:
            print(f"Error finding preference score: {str(e)}")
            return 0.5

    def extract_lifestyle_preferences(self, text: str) -> Dict[str, Dict[str, float]]:
        """
        Extract lifestyle preferences from text

        Within a category the longest matching keyword wins, so
        "very active" reports 'very_high' without also reporting 'high'.

        Args:
            text: Input text

        Returns:
            Dictionary with one entry per lifestyle category, each mapping
            the preference types whose keywords were found to 1.0;
            empty dictionary on error
        """
        text = self.preprocess_text(text)
        preferences = {}

        try:
            for category, keywords_dict in self.lifestyle_keywords.items():
                # keyword -> preference types that list it
                owners: Dict[str, List[str]] = {}
                for preference_type, keywords in keywords_dict.items():
                    for keyword in keywords:
                        owners.setdefault(keyword, []).append(preference_type)

                matched = set()
                for _, _, keyword in self._find_terms(text, owners):
                    matched.update(owners[keyword])

                # The former score/count ratio was always exactly 1.0 for a
                # matched type, so the value is written directly
                preferences[category] = {
                    preference_type: 1.0
                    for preference_type in keywords_dict
                    if preference_type in matched
                }

            return preferences

        except Exception as e:
            print(f"Error extracting lifestyle preferences: {str(e)}")
            return {}

    def generate_search_keywords(self, text: str) -> List[str]:
        """
        Generate keyword list for search

        Args:
            text: Input text

        Returns:
            Unique keywords in order of first appearance: single words
            (longer than two characters, not stop words) followed by the
            phrases found by _extract_phrases; empty list on error
        """
        text = self.preprocess_text(text)

        try:
            keywords = [
                word for word in text.split()
                if len(word) > 2 and word not in self.stop_words
            ]
            keywords.extend(self._extract_phrases(text))

            # dict.fromkeys de-duplicates while keeping a stable order
            # (a set would reorder the result from run to run)
            return list(dict.fromkeys(keywords))

        except Exception as e:
            print(f"Error generating search keywords: {str(e)}")
            return []

    def _extract_phrases(self, text: str) -> List[str]:
        """
        Extract important phrases

        Args:
            text: Input text

        Returns:
            List of two-word (or "good with X") phrases matching the
            class-level phrase patterns, grouped by pattern; may contain
            duplicates
        """
        phrases = []
        for pattern in self._PHRASE_PATTERNS:
            phrases.extend(pattern.findall(text))
        return phrases

    def analyze_sentiment(self, text: str) -> Dict[str, float]:
        """
        Analyze text sentiment

        Each lexicon entry counts at most once. Longer entries shadow the
        shorter ones they contain, so "not good" is negative only.

        Args:
            text: Input text

        Returns:
            Sentiment analysis results {'positive': 0.0-1.0, 'negative': 0.0-1.0, 'neutral': 0.0-1.0}
        """
        text = self.preprocess_text(text)
        total_words = len(text.split())

        if total_words == 0:
            return {'positive': 0.0, 'negative': 0.0, 'neutral': 1.0}

        lexicon = self._POSITIVE_WORDS + self._NEGATIVE_WORDS
        present = {term for _, _, term in self._find_terms(text, lexicon)}

        positive_count = sum(1 for word in self._POSITIVE_WORDS if word in present)
        negative_count = sum(1 for word in self._NEGATIVE_WORDS if word in present)

        positive_ratio = min(positive_count / total_words, 1.0)
        negative_ratio = min(negative_count / total_words, 1.0)
        neutral_ratio = 1.0 - positive_ratio - negative_ratio

        return {
            'positive': positive_ratio,
            'negative': negative_ratio,
            'neutral': max(0.0, neutral_ratio)
        }

    def extract_implicit_preferences(self, text: str) -> Dict[str, Any]:
        """
        Extract implicit preferences from text

        Rules are applied in a fixed order and later rules overwrite keys
        set by earlier ones (for example an apartment mention overrides the
        size preference inferred from outdoor activities).

        Args:
            text: Input text

        Returns:
            Dictionary of implicit preferences; empty dictionary on error
        """
        text = self.preprocess_text(text)
        implicit_prefs = {}

        def mentions(terms: List[str]) -> bool:
            return bool(self._find_terms(text, terms))

        try:
            # Infer preferences from mentioned activities
            if mentions(['hiking', 'running', 'jogging', 'outdoor']):
                implicit_prefs['exercise_needs'] = 'high'
                implicit_prefs['size_preference'] = 'medium_to_large'

            # Infer from living environment
            if mentions(['apartment', 'small space', 'city']):
                implicit_prefs['size_preference'] = 'small_to_medium'
                implicit_prefs['noise_tolerance'] = 'low'
                implicit_prefs['exercise_needs'] = 'moderate'

            # Infer from family situation
            if mentions(['children', 'kids']):
                implicit_prefs['temperament'] = 'gentle_patient'
                implicit_prefs['good_with_children'] = True

            # Infer from experience level
            if mentions(['first time', 'beginner', 'new to']):
                implicit_prefs['care_level'] = 'low_to_moderate'
                implicit_prefs['training_difficulty'] = 'easy'

            # Infer from time commitment
            if mentions(['busy', 'no time', 'low maintenance']):
                implicit_prefs['grooming_needs'] = 'low'
                implicit_prefs['care_level'] = 'low'
                implicit_prefs['exercise_needs'] = 'low_to_moderate'

            return implicit_prefs

        except Exception as e:
            print(f"Error extracting implicit preferences: {str(e)}")
            return {}

    def validate_input(self, text: str) -> Dict[str, Any]:
        """
        Validate input text validity

        Args:
            text: Input text

        Returns:
            Validation results dictionary. Invalid input yields
            'is_valid': False with 'error' and 'suggestions'; valid input
            yields 'is_valid': True, 'word_count' (number of meaningful
            words) and an empty 'suggestions' list.
        """
        if not text or not text.strip():
            return {
                'is_valid': False,
                'error': 'Empty input',
                'suggestions': ['Please provide a description of your preferences']
            }

        text = text.strip()

        # Check length (in characters)
        if len(text) < 10:
            return {
                'is_valid': False,
                'error': 'Input too short',
                'suggestions': ['Please provide more details about your preferences']
            }

        if len(text) > 1000:
            return {
                'is_valid': False,
                'error': 'Input too long',
                'suggestions': ['Please provide a more concise description']
            }

        # Check for meaningful content: words longer than two characters
        # that are not stop words
        meaningful_words = [
            word for word in self.preprocess_text(text).split()
            if len(word) > 2 and word not in self.stop_words
        ]

        if len(meaningful_words) < 3:
            return {
                'is_valid': False,
                'error': 'Not enough meaningful content',
                'suggestions': ['Please provide more specific details about your lifestyle and preferences']
            }

        return {
            'is_valid': True,
            'word_count': len(meaningful_words),
            'suggestions': []
        }

def get_nlp_processor():
    """
    Get natural language processor instance

    Returns:
        A new NaturalLanguageProcessor, or None if construction raised
        (the error is printed rather than propagated)
    """
    try:
        processor = NaturalLanguageProcessor()
    except Exception as e:
        print(f"Error creating NLP processor: {str(e)}")
        processor = None
    return processor