| """ | |
| Ultra Supreme Analyzer - Complete Multi-Model Analysis | |
| Integrates multiple specialized models for comprehensive image analysis | |
| """ | |
| import re | |
| import logging | |
| import spaces | |
| import torch | |
| import cv2 | |
| import numpy as np | |
| from typing import Dict, List, Any, Tuple, Optional | |
| from PIL import Image | |
| # Deep learning models for specialized analysis | |
try:
    from deepface import DeepFace
    DEEPFACE_AVAILABLE = True
except Exception:
    DEEPFACE_AVAILABLE = False

try:
    import mediapipe as mp
    MEDIAPIPE_AVAILABLE = True
except Exception:
    MEDIAPIPE_AVAILABLE = False

try:
    from transformers import pipeline
    TRANSFORMERS_AVAILABLE = True
except Exception:
    TRANSFORMERS_AVAILABLE = False

from constants import (
    FORBIDDEN_ELEMENTS, MICRO_AGE_INDICATORS, ULTRA_FACIAL_ANALYSIS,
    EMOTION_MICRO_EXPRESSIONS, CULTURAL_RELIGIOUS_ULTRA, CLOTHING_ACCESSORIES_ULTRA,
    ENVIRONMENTAL_ULTRA_ANALYSIS, POSE_BODY_LANGUAGE_ULTRA, COMPOSITION_PHOTOGRAPHY_ULTRA,
    TECHNICAL_PHOTOGRAPHY_ULTRA, QUALITY_DESCRIPTORS_ULTRA, GENDER_INDICATORS
)

logger = logging.getLogger(__name__)


class UltraSupremeAnalyzer:
    """Complete analyzer with multiple specialized models"""

    def __init__(self):
        self.face_cascade = None
        self.pose_detector = None
        self.emotion_classifier = None
        self.scene_classifier = None
        self.models_initialized = False

    def _initialize_models(self):
        """Lazy initialization of models"""
        if self.models_initialized:
            return

        try:
            # OpenCV face detector (lightweight)
            self.face_cascade = cv2.CascadeClassifier(
                cv2.data.haarcascades + 'haarcascade_frontalface_default.xml'
            )

            # MediaPipe pose detector
            if MEDIAPIPE_AVAILABLE:
                self.mp_pose = mp.solutions.pose
                self.pose_detector = self.mp_pose.Pose(
                    static_image_mode=True,
                    min_detection_confidence=0.5
                )

            # Emotion classifier from transformers
            if TRANSFORMERS_AVAILABLE:
                self.emotion_classifier = pipeline(
                    "image-classification",
                    model="dima806/facial_emotions_image_detection"
                )

            self.models_initialized = True
            logger.info("Additional analysis models initialized")
        except Exception as e:
            logger.error(f"Error initializing models: {e}")
            self.models_initialized = False

    def ultra_supreme_analysis(self, image: Any, clip_fast: str, clip_classic: str, clip_best: str) -> Dict[str, Any]:
        """Complete analysis using all available models"""
        # Initialize models if needed
        self._initialize_models()

        # Start with CLIP analysis
        clip_analysis = self._parse_clip_results(clip_fast, clip_classic, clip_best)

        # Convert the image to both a BGR array (for OpenCV) and a PIL image
        if isinstance(image, Image.Image):
            img_array = np.array(image)
            img_bgr = cv2.cvtColor(img_array, cv2.COLOR_RGB2BGR)
        else:
            img_bgr = image
            image = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))

        # Initialize complete analysis structure
        analysis = {
            "clip_fast": clip_fast,
            "clip_classic": clip_classic,
            "clip_best": clip_best,
            "full_description": f"{clip_fast} {clip_classic} {clip_best}",
            "demographic": {
                "age_category": None,
                "age_confidence": 0,
                "gender": None,
                "gender_confidence": 0,
                "cultural_religious": []
            },
            "facial_ultra": {
                "eyes": [],
                "eyebrows": [],
                "nose": [],
                "mouth": [],
                "facial_hair": [],
                "skin": [],
                "structure": [],
                "face_count": 0,
                "face_locations": []
            },
            "emotional_state": {
                "primary_emotion": None,
                "emotion_confidence": 0,
                "emotion_distribution": {},
                "micro_expressions": [],
                "overall_demeanor": []
            },
            "clothing_accessories": {
                "headwear": [],
                "eyewear": [],
                "clothing": [],
                "accessories": [],
                "style": []
            },
            "environmental": {
                "setting_type": None,
                "specific_location": None,
                "lighting_analysis": [],
                "atmosphere": [],
                "objects": []
            },
            "pose_composition": {
                "body_language": [],
                "head_position": [],
                "eye_contact": [],
                "posture": [],
                "gesture": [],
                "pose_confidence": 0
            },
            "technical_analysis": {
                "shot_type": None,
                "angle": None,
                "lighting_setup": None,
                "composition": [],
                "suggested_equipment": {}
            },
            "intelligence_metrics": {
                "total_features_detected": 0,
                "analysis_depth_score": 0,
                "cultural_awareness_score": 0,
                "technical_optimization_score": 0,
                "model_confidence_average": 0
            }
        }

        # Merge CLIP analysis
        analysis = self._merge_analysis(analysis, clip_analysis)

        # Face detection and analysis
        face_analysis = self._analyze_faces(img_bgr, image)
        analysis = self._merge_analysis(analysis, face_analysis)

        # Pose analysis
        if MEDIAPIPE_AVAILABLE:
            pose_analysis = self._analyze_pose(image)
            analysis = self._merge_analysis(analysis, pose_analysis)

        # Emotion analysis
        if TRANSFORMERS_AVAILABLE and analysis["facial_ultra"]["face_count"] > 0:
            emotion_analysis = self._analyze_emotions(image)
            analysis = self._merge_analysis(analysis, emotion_analysis)

        # Scene and environment analysis (pass the running analysis, which
        # carries the combined CLIP text in "full_description")
        scene_analysis = self._analyze_scene(analysis)
        analysis = self._merge_analysis(analysis, scene_analysis)

        # Calculate intelligence metrics
        analysis = self._calculate_intelligence_metrics(analysis)

        return analysis

    def _parse_clip_results(self, clip_fast: str, clip_classic: str, clip_best: str) -> Dict[str, Any]:
        """Parse CLIP results for structured information"""
        combined_text = f"{clip_fast} {clip_classic} {clip_best}".lower()

        analysis = {
            "demographic": {},
            "facial_ultra": {},
            "emotional_state": {},
            "clothing_accessories": {},
            "environmental": {},
            "pose_composition": {},
            "technical_analysis": {}
        }

        # Gender detection
        for gender, indicators in GENDER_INDICATORS.items():
            if any(indicator in combined_text for indicator in indicators):
                analysis["demographic"]["gender"] = gender
                analysis["demographic"]["gender_confidence"] = 0.8
                break

        # Age detection
        for age_category, indicators in MICRO_AGE_INDICATORS.items():
            if any(indicator in combined_text for indicator in indicators):
                analysis["demographic"]["age_category"] = age_category
                analysis["demographic"]["age_confidence"] = 0.7
                break
| for feature_type, features in ULTRA_FACIAL_ANALYSIS.items(): | |
| if isinstance(features, dict): | |
| for sub_type, sub_features in features.items(): | |
| found = [f for f in sub_features if f in combined_text] | |
| if found and feature_type in analysis["facial_ultra"]: | |
| analysis["facial_ultra"][feature_type] = found | |
| else: | |
| found = [f for f in features if f in combined_text] | |
| if found: | |
| analysis["facial_ultra"][feature_type] = found | |

        # Emotions
        all_emotions = EMOTION_MICRO_EXPRESSIONS["primary_emotions"] + EMOTION_MICRO_EXPRESSIONS["complex_emotions"]
        found_emotions = [e for e in all_emotions if e in combined_text]
        if found_emotions:
            analysis["emotional_state"]["primary_emotion"] = found_emotions[0]
            analysis["emotional_state"]["micro_expressions"] = found_emotions

        # Environment
        for setting_type, settings in ENVIRONMENTAL_ULTRA_ANALYSIS["indoor_settings"].items():
            if any(s in combined_text for s in settings):
                analysis["environmental"]["setting_type"] = f"indoor_{setting_type}"
                break

        for setting_type, settings in ENVIRONMENTAL_ULTRA_ANALYSIS["outdoor_settings"].items():
            if any(s in combined_text for s in settings):
                analysis["environmental"]["setting_type"] = f"outdoor_{setting_type}"
                break

        # Technical analysis
        for shot_type in COMPOSITION_PHOTOGRAPHY_ULTRA["shot_types"]:
            if shot_type in combined_text:
                analysis["technical_analysis"]["shot_type"] = shot_type
                break

        return analysis

    def _analyze_faces(self, img_bgr: np.ndarray, img_pil: Image.Image) -> Dict[str, Any]:
        """Analyze faces using OpenCV and DeepFace"""
        analysis = {"facial_ultra": {}, "demographic": {}, "emotional_state": {}}

        # Skip if the Haar cascade failed to load
        if self.face_cascade is None or self.face_cascade.empty():
            return analysis

        # OpenCV face detection
        gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
        faces = self.face_cascade.detectMultiScale(gray, 1.1, 4)

        analysis["facial_ultra"]["face_count"] = len(faces)
        analysis["facial_ultra"]["face_locations"] = faces.tolist() if len(faces) > 0 else []

        # DeepFace analysis for the first detected face
        if DEEPFACE_AVAILABLE and len(faces) > 0:
            try:
                # Analyze with DeepFace
                results = DeepFace.analyze(
                    img_path=np.array(img_pil),
                    actions=['age', 'gender', 'emotion', 'race'],
                    enforce_detection=False,
                    silent=True
                )

                if isinstance(results, list):
                    results = results[0]

                # Extract demographics
                analysis["demographic"]["age_category"] = self._age_to_category(results.get('age', 0))
                analysis["demographic"]["age_confidence"] = 0.85
                analysis["demographic"]["gender"] = results.get('dominant_gender', '').lower()
                analysis["demographic"]["gender_confidence"] = results.get('gender', {}).get(
                    results.get('dominant_gender', ''), 0
                ) / 100.0

                # Extract emotions
                emotions = results.get('emotion', {})
                if emotions:
                    sorted_emotions = sorted(emotions.items(), key=lambda x: x[1], reverse=True)
                    analysis["emotional_state"]["primary_emotion"] = sorted_emotions[0][0]
                    analysis["emotional_state"]["emotion_confidence"] = sorted_emotions[0][1] / 100.0
                    analysis["emotional_state"]["emotion_distribution"] = {
                        k: v / 100.0 for k, v in emotions.items()
                    }
            except Exception as e:
                logger.warning(f"DeepFace analysis failed: {e}")

        return analysis

    def _analyze_pose(self, image: Image.Image) -> Dict[str, Any]:
        """Analyze body pose using MediaPipe"""
        analysis = {"pose_composition": {}}

        if not MEDIAPIPE_AVAILABLE or not self.pose_detector:
            return analysis

        try:
            # Convert PIL to RGB array
            image_rgb = np.array(image)

            # Process the image
            results = self.pose_detector.process(image_rgb)

            if results.pose_landmarks:
                landmarks = results.pose_landmarks.landmark

                # Analyze head position
                nose = landmarks[self.mp_pose.PoseLandmark.NOSE]
                left_eye = landmarks[self.mp_pose.PoseLandmark.LEFT_EYE]
                right_eye = landmarks[self.mp_pose.PoseLandmark.RIGHT_EYE]

                # Calculate head tilt
                eye_diff_y = abs(left_eye.y - right_eye.y)
                if eye_diff_y > 0.02:
                    analysis["pose_composition"]["head_position"] = ["head tilted"]
                else:
                    analysis["pose_composition"]["head_position"] = ["head straight"]

                # Analyze posture
                left_shoulder = landmarks[self.mp_pose.PoseLandmark.LEFT_SHOULDER]
                right_shoulder = landmarks[self.mp_pose.PoseLandmark.RIGHT_SHOULDER]

                shoulder_diff_y = abs(left_shoulder.y - right_shoulder.y)
                if shoulder_diff_y < 0.02:
                    analysis["pose_composition"]["posture"] = ["upright posture", "balanced stance"]
                else:
                    analysis["pose_composition"]["posture"] = ["asymmetric posture"]

                # Confidence based on landmark visibility
                visibility_scores = [l.visibility for l in landmarks]
                analysis["pose_composition"]["pose_confidence"] = float(np.mean(visibility_scores))

                # Body language interpretation
                if nose.y < 0.3:
                    analysis["pose_composition"]["body_language"] = ["confident stance"]
        except Exception as e:
            logger.warning(f"Pose analysis failed: {e}")

        return analysis

    def _analyze_emotions(self, image: Image.Image) -> Dict[str, Any]:
        """Analyze emotions using transformer model"""
        analysis = {"emotional_state": {}}

        if not TRANSFORMERS_AVAILABLE or not self.emotion_classifier:
            return analysis

        try:
            # Run emotion classification
            predictions = self.emotion_classifier(image)

            if predictions:
                # Sort by confidence
                predictions.sort(key=lambda x: x['score'], reverse=True)

                # Primary emotion
                analysis["emotional_state"]["primary_emotion"] = predictions[0]['label'].lower()
                analysis["emotional_state"]["emotion_confidence"] = predictions[0]['score']

                # Emotion distribution
                analysis["emotional_state"]["emotion_distribution"] = {
                    pred['label'].lower(): pred['score'] for pred in predictions[:5]
                }

                # Map to micro-expressions
                primary = predictions[0]['label'].lower()
                if primary in ['happy', 'joy']:
                    analysis["emotional_state"]["micro_expressions"] = ["smile", "positive expression"]
                elif primary in ['sad', 'sorrow']:
                    analysis["emotional_state"]["micro_expressions"] = ["downturned mouth", "melancholic"]
                elif primary in ['angry', 'disgust']:
                    analysis["emotional_state"]["micro_expressions"] = ["furrowed brow", "tense jaw"]
                elif primary in ['surprise', 'fear']:
                    analysis["emotional_state"]["micro_expressions"] = ["raised eyebrows", "wide eyes"]
        except Exception as e:
            logger.warning(f"Emotion analysis failed: {e}")

        return analysis

    def _analyze_scene(self, analysis_so_far: Dict[str, Any]) -> Dict[str, Any]:
        """Analyze scene lighting and atmosphere from the combined CLIP description"""
        scene_analysis = {"environmental": {"lighting_analysis": [], "atmosphere": []}}

        # Lighting analysis based on the full CLIP description
        combined_text = analysis_so_far.get("full_description", "").lower()

        lighting_keywords = {
            "natural light": ["sunlight", "daylight", "outdoor", "sunny"],
            "artificial light": ["indoor", "lamp", "fluorescent", "led"],
            "dramatic lighting": ["dramatic", "moody", "contrast", "shadow"],
            "soft lighting": ["soft", "diffused", "gentle", "even"]
        }

        for light_type, keywords in lighting_keywords.items():
            if any(keyword in combined_text for keyword in keywords):
                scene_analysis["environmental"]["lighting_analysis"].append(light_type)

        # Atmosphere
        if any(word in combined_text for word in ["professional", "formal", "business"]):
            scene_analysis["environmental"]["atmosphere"].append("professional")
        if any(word in combined_text for word in ["casual", "relaxed", "informal"]):
            scene_analysis["environmental"]["atmosphere"].append("casual")
        if any(word in combined_text for word in ["artistic", "creative", "abstract"]):
            scene_analysis["environmental"]["atmosphere"].append("artistic")

        return scene_analysis

    def _age_to_category(self, age: int) -> str:
        """Convert numeric age to category"""
        if age < 2:
            return "infant"
        elif age < 12:
            return "child"
        elif age < 20:
            return "teen"
        elif age < 35:
            return "young_adult"
        elif age < 50:
            return "middle_aged"
        elif age < 65:
            return "senior"
        else:
            return "elderly"

    def _merge_analysis(self, base: Dict[str, Any], new: Dict[str, Any]) -> Dict[str, Any]:
        """Merge analysis results"""
        for key, value in new.items():
            if key in base:
                if isinstance(value, dict) and isinstance(base[key], dict):
                    base[key].update(value)
                elif isinstance(value, list) and isinstance(base[key], list):
                    base[key].extend(value)
                elif value is not None and (not isinstance(base[key], (int, float)) or base[key] == 0):
                    base[key] = value
        return base

    def _calculate_intelligence_metrics(self, analysis: Dict[str, Any]) -> Dict[str, Any]:
        """Calculate intelligence metrics based on analysis completeness"""
        metrics = analysis["intelligence_metrics"]

        # Count detected features
        total_features = 0
        confidence_scores = []

        # Demographic features
        if analysis["demographic"]["age_category"]:
            total_features += 1
            confidence_scores.append(analysis["demographic"]["age_confidence"])
        if analysis["demographic"]["gender"]:
            total_features += 1
            confidence_scores.append(analysis["demographic"]["gender_confidence"])

        # Facial features
        for feature in ["eyes", "eyebrows", "nose", "mouth", "facial_hair", "skin", "structure"]:
            if analysis["facial_ultra"].get(feature):
                total_features += len(analysis["facial_ultra"][feature])

        # Emotional features
        if analysis["emotional_state"]["primary_emotion"]:
            total_features += 1
            confidence_scores.append(analysis["emotional_state"]["emotion_confidence"])

        # Pose features
        if analysis["pose_composition"].get("pose_confidence", 0) > 0:
            total_features += 1
            confidence_scores.append(analysis["pose_composition"]["pose_confidence"])

        # Environmental features
        if analysis["environmental"]["setting_type"]:
            total_features += 1
        total_features += len(analysis["environmental"].get("lighting_analysis", []))

        # Technical features
        if analysis["technical_analysis"]["shot_type"]:
            total_features += 1

        # Calculate scores
        metrics["total_features_detected"] = total_features
        metrics["analysis_depth_score"] = min(100, total_features * 5)

        # Cultural awareness (if religious/cultural indicators found)
        if analysis["demographic"].get("cultural_religious"):
            metrics["cultural_awareness_score"] = 80
        else:
            metrics["cultural_awareness_score"] = 40

        # Technical optimization score
        tech_features = sum([
            1 if analysis["technical_analysis"]["shot_type"] else 0,
            len(analysis["environmental"].get("lighting_analysis", [])),
            len(analysis["pose_composition"].get("posture", []))
        ])
        metrics["technical_optimization_score"] = min(100, tech_features * 25)

        # Average confidence
        if confidence_scores:
            metrics["model_confidence_average"] = sum(confidence_scores) / len(confidence_scores)
        else:
            metrics["model_confidence_average"] = 0.5

        return analysis

    def build_ultra_supreme_prompt(self, ultra_analysis: Dict[str, Any], clip_results: List[str]) -> str:
        """Build enhanced prompt based on comprehensive analysis"""
        prompt_parts = []

        # Start with the best CLIP result
        if clip_results:
            prompt_parts.append(clip_results[0])

        # Add demographic details if confident
        if ultra_analysis["demographic"]["age_category"] and ultra_analysis["demographic"]["age_confidence"] > 0.7:
            age_descriptors = QUALITY_DESCRIPTORS_ULTRA["based_on_age"].get(
                ultra_analysis["demographic"]["age_category"], []
            )
            if age_descriptors:
                prompt_parts.append(age_descriptors[0])

        # Add emotional context
        if ultra_analysis["emotional_state"]["primary_emotion"]:
            emotion = ultra_analysis["emotional_state"]["primary_emotion"]
            emotion_descriptors = QUALITY_DESCRIPTORS_ULTRA["based_on_emotion"].get(emotion, [])
            if emotion_descriptors:
                prompt_parts.append(f"{emotion_descriptors[0]} expression")

        # Add technical details
        if ultra_analysis["technical_analysis"]["shot_type"]:
            prompt_parts.append(ultra_analysis["technical_analysis"]["shot_type"])

        # Add lighting
        lighting = ultra_analysis["environmental"].get("lighting_analysis", [])
        if lighting:
            prompt_parts.append(f"with {lighting[0]}")

        # Combine parts
        enhanced_prompt = ", ".join(prompt_parts)

        # Clean up whitespace and duplicate commas
        enhanced_prompt = re.sub(r'\s+', ' ', enhanced_prompt)
        enhanced_prompt = re.sub(r',\s*,+', ',', enhanced_prompt)

        return enhanced_prompt

    def calculate_ultra_supreme_score(self, prompt: str, ultra_analysis: Dict[str, Any]) -> Tuple[int, Dict[str, int]]:
        """Calculate comprehensive score based on multi-model analysis"""
        breakdown = {}

        # Base score from prompt quality (length as a rough proxy)
        breakdown["prompt_quality"] = min(25, len(prompt) // 10)

        # Analysis depth score
        breakdown["analysis_depth"] = min(25, ultra_analysis["intelligence_metrics"]["analysis_depth_score"] // 4)

        # Model confidence score
        avg_confidence = ultra_analysis["intelligence_metrics"]["model_confidence_average"]
        breakdown["model_confidence"] = int(avg_confidence * 25)

        # Feature richness score
        total_features = ultra_analysis["intelligence_metrics"]["total_features_detected"]
        breakdown["feature_richness"] = min(25, total_features * 2)

        total_score = sum(breakdown.values())

        return total_score, breakdown
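

# --- Usage sketch (illustrative, not part of the original Space code) ---
# A minimal example of how this analyzer might be driven, assuming the three
# CLIP captions are produced upstream (e.g. by a separate CLIP interrogation
# step). The image path and caption strings below are placeholders, not real
# outputs from the Space.
if __name__ == "__main__":
    analyzer = UltraSupremeAnalyzer()
    img = Image.open("example.jpg")  # hypothetical input image

    # Hypothetical CLIP captions standing in for the fast/classic/best modes
    clip_fast = "a portrait of a man in an office"
    clip_classic = "a middle aged man wearing glasses, soft indoor lighting"
    clip_best = "professional close-up headshot of a man, calm expression"

    result = analyzer.ultra_supreme_analysis(img, clip_fast, clip_classic, clip_best)
    prompt = analyzer.build_ultra_supreme_prompt(result, [clip_best, clip_classic, clip_fast])
    score, breakdown = analyzer.calculate_ultra_supreme_score(prompt, result)

    print("Enhanced prompt:", prompt)
    print("Score:", score, breakdown)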