| """ | |
| Ultra Supreme Optimizer - Main optimization engine for image analysis | |
| VERSIÓN FLORENCE-2 - Usa Florence-2 en lugar de CLIP Interrogator | |
| """ | |

# IMPORTANT: spaces must be imported BEFORE torch or any CUDA-using library
import spaces

import gc
import logging
import re
from datetime import datetime
from typing import Tuple, Dict, Any, Optional

import torch
import numpy as np
from PIL import Image
from transformers import AutoProcessor, AutoModelForCausalLM

from analyzer import UltraSupremeAnalyzer

logger = logging.getLogger(__name__)

class UltraSupremeOptimizer:
    """Main optimizer class for ultra supreme image analysis"""

    def __init__(self):
        self.processor = None
        self.model = None
        self.analyzer = UltraSupremeAnalyzer()
        self.usage_count = 0
        self.device = self._get_device()
        self.is_initialized = False
    @staticmethod
    def _get_device() -> str:
        """Determine the best available device for computation"""
        if torch.cuda.is_available():
            return "cuda"
        elif torch.backends.mps.is_available():
            return "mps"
        else:
            return "cpu"
    def initialize_model(self) -> bool:
        """Initialize the Florence-2 model"""
        if self.is_initialized:
            return True

        try:
            logger.info("Loading Florence-2 model...")

            # Load Florence-2 base model (you can also use 'microsoft/Florence-2-large' for better quality)
            model_id = "microsoft/Florence-2-base"
            self.processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
            self.model = AutoModelForCausalLM.from_pretrained(
                model_id,
                trust_remote_code=True,
                torch_dtype=torch.float16 if self.device == "cuda" else torch.float32
            )

            # Keep model on CPU initially
            self.model = self.model.to("cpu")
            self.model.eval()

            self.is_initialized = True

            # Clean up memory after initialization
            gc.collect()

            logger.info("Florence-2 model initialized successfully")
            return True

        except Exception as e:
            logger.error(f"Model initialization error: {e}")
            return False
    def optimize_image(self, image: Any) -> Optional[Image.Image]:
        """Optimize image for processing"""
        if image is None:
            return None

        try:
            # Convert to PIL Image if necessary
            if isinstance(image, np.ndarray):
                image = Image.fromarray(image)
            elif not isinstance(image, Image.Image):
                image = Image.open(image)

            # Convert to RGB if necessary
            if image.mode != 'RGB':
                image = image.convert('RGB')

            # Florence-2 handles various sizes well, but keep dimensions reasonable
            max_size = 1024
            if image.size[0] > max_size or image.size[1] > max_size:
                image.thumbnail((max_size, max_size), Image.Resampling.LANCZOS)

            return image

        except Exception as e:
            logger.error(f"Image optimization error: {e}")
            return None
    def apply_flux_rules(self, base_prompt: str) -> str:
        """Apply Flux rules to a base prompt"""
        # Strip unwanted elements from the prompt
        # (the more specific "Nk resolution" pattern must run before the bare "Nk" one)
        cleanup_patterns = [
            r',\s*trending on artstation',
            r',\s*trending on [^,]+',
            r',\s*\d+k resolution',
            r',\s*\d+k\s*',
            r',\s*artstation',
            r',\s*concept art',
            r',\s*digital art',
            r',\s*by greg rutkowski',
        ]

        cleaned_prompt = base_prompt
        for pattern in cleanup_patterns:
            cleaned_prompt = re.sub(pattern, '', cleaned_prompt, flags=re.IGNORECASE)

        # Detect the image type to add an appropriate camera configuration
        if any(word in base_prompt.lower() for word in ['portrait', 'person', 'man', 'woman', 'face']):
            camera_config = ", Shot on Hasselblad X2D 100C, 90mm f/2.5 lens at f/2.8, professional portrait photography"
        elif any(word in base_prompt.lower() for word in ['landscape', 'mountain', 'nature', 'outdoor']):
            camera_config = ", Shot on Phase One XT, 40mm f/4 lens at f/8, epic landscape photography"
        elif any(word in base_prompt.lower() for word in ['street', 'urban', 'city']):
            camera_config = ", Shot on Leica M11, 35mm f/1.4 lens at f/2.8, documentary street photography"
        else:
            camera_config = ", Shot on Phase One XF IQ4, 80mm f/2.8 lens at f/4, professional photography"

        # Add lighting enhancements if not already present
        if 'lighting' not in cleaned_prompt.lower():
            if 'dramatic' in cleaned_prompt.lower():
                cleaned_prompt += ", dramatic cinematic lighting"
            elif 'portrait' in cleaned_prompt.lower():
                cleaned_prompt += ", professional studio lighting with subtle rim light"
            else:
                cleaned_prompt += ", masterful natural lighting"

        # Build the final prompt
        final_prompt = cleaned_prompt + camera_config

        # Ensure it starts with an uppercase letter
        final_prompt = final_prompt[0].upper() + final_prompt[1:] if final_prompt else final_prompt

        # Collapse duplicate whitespace and commas
        final_prompt = re.sub(r'\s+', ' ', final_prompt)
        final_prompt = re.sub(r',\s*,+', ',', final_prompt)

        return final_prompt
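
    # Worked example (hypothetical input, traced by hand against the rules above):
    #   apply_flux_rules("portrait of a man, trending on artstation, 8k")
    # returns:
    #   "Portrait of a man, professional studio lighting with subtle rim light,
    #    Shot on Hasselblad X2D 100C, 90mm f/2.5 lens at f/2.8, professional
    #    portrait photography"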
    # On ZeroGPU Spaces, CUDA is only available inside functions decorated with
    # @spaces.GPU (the reason the `spaces` import above must precede torch).
    @spaces.GPU
    def run_florence_inference(self, image: Image.Image) -> Tuple[str, str, str]:
        """Run Florence-2 inference on GPU"""
        try:
            # Move model to GPU for the duration of the call
            self.model = self.model.to("cuda")
            logger.info("Florence-2 model moved to GPU")

            # Task prompts for different types of analysis
            tasks = {
                "detailed_caption": "<DETAILED_CAPTION>",
                "more_detailed_caption": "<MORE_DETAILED_CAPTION>",
                "caption": "<CAPTION>",
                "dense_region_caption": "<DENSE_REGION_CAPTION>"
            }

            results = {}

            # Run the different captioning tasks
            for task_name, task_prompt in tasks.items():
                try:
                    inputs = self.processor(text=task_prompt, images=image, return_tensors="pt")
                    inputs = {k: v.to("cuda") for k, v in inputs.items()}

                    with torch.cuda.amp.autocast(dtype=torch.float16):
                        generated_ids = self.model.generate(
                            input_ids=inputs["input_ids"],
                            pixel_values=inputs["pixel_values"],
                            max_new_tokens=1024,
                            num_beams=3,
                            do_sample=False
                        )

                    generated_text = self.processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
                    parsed = self.processor.post_process_generation(
                        generated_text, task=task_prompt, image_size=(image.width, image.height)
                    )

                    # Extract the caption from the parsed result
                    if task_prompt in parsed:
                        results[task_name] = parsed[task_prompt]
                    else:
                        # Sometimes the result is directly in the parsed output
                        results[task_name] = str(parsed) if parsed else ""

                except Exception as e:
                    logger.warning(f"Error in {task_name}: {e}")
                    results[task_name] = ""

            # Extract results
            detailed_caption = results.get("detailed_caption", "")
            more_detailed = results.get("more_detailed_caption", "")
            caption = results.get("caption", "")

            # Prefer the most detailed caption available as the full prompt
            if more_detailed:
                full_prompt = more_detailed
            elif detailed_caption:
                full_prompt = detailed_caption
            else:
                full_prompt = caption

            # Use the different detail levels as our three outputs
            caption_fast = caption if caption else "A photograph"
            caption_detailed = detailed_caption if detailed_caption else full_prompt

            logger.info("Florence-2 captions generated successfully")

            return full_prompt, caption_fast, caption_detailed

        except Exception as e:
            logger.error(f"Florence-2 inference error: {e}")
            raise

        finally:
            # Always move the model back to CPU and free GPU memory
            self.model = self.model.to("cpu")
            torch.cuda.empty_cache()
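
    # Note: Florence-2's post_process_generation typically returns a dict keyed
    # by the task token, e.g. (illustrative value, not a real model output):
    #   {"<CAPTION>": "A man standing on a beach at sunset."}
    # which is why the task prompt string doubles as the lookup key above.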
    def generate_ultra_supreme_prompt(self, image: Any) -> Tuple[str, str, int, Dict[str, int]]:
        """
        Generate ultra supreme prompt from image using Florence-2

        Returns:
            Tuple of (prompt, analysis_info, score, breakdown)
        """
        try:
            # Initialize the model if it has not been initialized yet
            if not self.is_initialized:
                if not self.initialize_model():
                    return "❌ Model initialization failed.", "Please refresh and try again.", 0, {}

            # Validate input
            if image is None:
                return "❌ Please upload an image.", "No image provided.", 0, {}

            self.usage_count += 1

            # Optimize image
            image = self.optimize_image(image)
            if image is None:
                return "❌ Image processing failed.", "Invalid image format.", 0, {}

            start_time = datetime.now()

            logger.info("ULTRA SUPREME ANALYSIS - Starting with Florence-2")

            # Run Florence-2 inference
            try:
                full_prompt, caption_fast, caption_detailed = self.run_florence_inference(image)
            except Exception as e:
                logger.error(f"Florence-2 failed: {e}")
                # Basic fallback
                full_prompt = "A photograph"
                caption_fast = "image"
                caption_detailed = "detailed image"

            logger.info(f"Florence-2 caption: {full_prompt[:100]}...")

            # Run the ultra supreme multi-model analysis
            logger.info("Running multi-model ultra supreme analysis...")
            ultra_analysis = self.analyzer.ultra_supreme_analysis(
                image, caption_fast, caption_detailed, full_prompt
            )

            # Build an enhanced prompt based on the full analysis
            enhanced_prompt_parts = []

            # Base prompt from Florence-2
            enhanced_prompt_parts.append(full_prompt)

            # Add demographic information when available
            if ultra_analysis["demographic"]["gender"] and ultra_analysis["demographic"]["gender_confidence"] > 0.7:
                gender = ultra_analysis["demographic"]["gender"]
                age_cat = ultra_analysis["demographic"]["age_category"]
                if age_cat:
                    enhanced_prompt_parts.append(f"{age_cat} {gender}")

            # Add the primary emotional state
            if ultra_analysis["emotional_state"]["primary_emotion"] and ultra_analysis["emotional_state"]["emotion_confidence"] > 0.6:
                emotion = ultra_analysis["emotional_state"]["primary_emotion"]
                enhanced_prompt_parts.append(f"{emotion} expression")

            # Add pose information when available
            if ultra_analysis["pose_composition"]["posture"]:
                enhanced_prompt_parts.append(ultra_analysis["pose_composition"]["posture"][0])

            # Combine and apply the Flux rules
            combined_prompt = ", ".join(enhanced_prompt_parts)
            optimized_prompt = self.apply_flux_rules(combined_prompt)

            # If the analyzer enriched the prompt, use its version
            analyzer_prompt = self.analyzer.build_ultra_supreme_prompt(ultra_analysis, [full_prompt])
            if len(analyzer_prompt) > len(optimized_prompt):
                optimized_prompt = self.apply_flux_rules(analyzer_prompt)

            # Compute the score using the analyzer
            score, breakdown = self.analyzer.calculate_ultra_supreme_score(optimized_prompt, ultra_analysis)

            end_time = datetime.now()
            duration = (end_time - start_time).total_seconds()

            # Memory cleanup
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

            # Generate the enhanced analysis report with multi-model data
            analysis_info = self._generate_ultra_analysis_report(
                ultra_analysis, score, breakdown, duration, "Florence-2"
            )

            return optimized_prompt, analysis_info, score, breakdown

        except Exception as e:
            logger.error(f"Ultra supreme generation error: {e}", exc_info=True)
            return f"❌ Error: {str(e)}", "Please try with a different image.", 0, {}
    def _generate_ultra_analysis_report(self, analysis: Dict[str, Any],
                                        score: int, breakdown: Dict[str, int],
                                        duration: float, caption_model: str = "Florence-2") -> str:
        """Generate ultra detailed analysis report with multi-model results"""

        device_used = "cuda" if torch.cuda.is_available() else "cpu"
        gpu_status = "⚡ ZeroGPU" if device_used == "cuda" else "💻 CPU"

        # Demographic info
        demo_info = ""
        if analysis["demographic"]["age_category"]:
            age = analysis["demographic"]["age_category"].replace("_", " ").title()
            gender = analysis["demographic"]["gender"] or "person"
            confidence = analysis["demographic"]["age_confidence"]
            demo_info = f"**Detected:** {age} {gender} (confidence: {confidence:.0%})"

        # Emotion info
        emotion_info = ""
        if analysis["emotional_state"]["primary_emotion"]:
            emotion = analysis["emotional_state"]["primary_emotion"]
            confidence = analysis["emotional_state"]["emotion_confidence"]
            emotion_info = f"**Primary Emotion:** {emotion} ({confidence:.0%})"

            # Add emotion distribution if available
            if analysis["emotional_state"]["emotion_distribution"]:
                top_emotions = sorted(
                    analysis["emotional_state"]["emotion_distribution"].items(),
                    key=lambda x: x[1], reverse=True
                )[:3]
                emotion_details = ", ".join([f"{e[0]}: {e[1]:.0%}" for e in top_emotions])
                emotion_info += f"\n**Emotion Distribution:** {emotion_details}"

        # Face analysis info
        face_info = f"**Faces Detected:** {analysis['facial_ultra']['face_count']}"
        if analysis['facial_ultra']['face_count'] > 0:
            features = []
            for feature_type in ['eyes', 'mouth', 'facial_hair', 'skin']:
                if analysis['facial_ultra'].get(feature_type):
                    features.extend(analysis['facial_ultra'][feature_type])
            if features:
                face_info += f"\n**Facial Features:** {', '.join(features[:5])}"

        # Pose info
        pose_info = ""
        if analysis["pose_composition"].get("pose_confidence", 0) > 0:
            confidence = analysis["pose_composition"]["pose_confidence"]
            pose_info = f"**Pose Analysis:** Body detected ({confidence:.0%} confidence)"
            if analysis["pose_composition"]["posture"]:
                pose_info += f"\n**Posture:** {', '.join(analysis['pose_composition']['posture'])}"

        # Environment info
        env_info = ""
        if analysis["environmental"]["setting_type"]:
            env_info = f"**Setting:** {analysis['environmental']['setting_type'].replace('_', ' ').title()}"
            if analysis["environmental"]["lighting_analysis"]:
                env_info += f"\n**Lighting:** {', '.join(analysis['environmental']['lighting_analysis'])}"

        # Intelligence metrics
        metrics = analysis["intelligence_metrics"]

        # Caption info (truncate long captions for display)
        clip_best = analysis.get("clip_best", "")
        caption_info = clip_best[:150] + "..." if len(clip_best) > 150 else clip_best

        analysis_info = f"""**🚀 ULTRA SUPREME MULTI-MODEL ANALYSIS COMPLETE**

**Processing:** {gpu_status} • {duration:.1f}s • {caption_model} + Multi-Model Pipeline
**Ultra Score:** {score}/100 • Models: {caption_model} + DeepFace + MediaPipe + Transformers

**📊 BREAKDOWN:**
• Prompt Quality: {breakdown.get('prompt_quality', 0)}/25
• Analysis Depth: {breakdown.get('analysis_depth', 0)}/25
• Model Confidence: {breakdown.get('model_confidence', 0)}/25
• Feature Richness: {breakdown.get('feature_richness', 0)}/25

**📝 VISION-LANGUAGE ANALYSIS:**
**{caption_model} Caption:** {caption_info}

**🧠 DEEP ANALYSIS RESULTS:**

**👤 DEMOGRAPHICS & IDENTITY:**
{demo_info or "No face detected for demographic analysis"}

**😊 EMOTIONAL ANALYSIS:**
{emotion_info or "No emotional data available"}

**👁️ FACIAL ANALYSIS:**
{face_info}

**🚶 POSE & BODY LANGUAGE:**
{pose_info or "No pose data available"}

**🏞️ ENVIRONMENT & SCENE:**
{env_info or "No environmental data detected"}

**📊 INTELLIGENCE METRICS:**
• **Total Features Detected:** {metrics['total_features_detected']}
• **Analysis Depth Score:** {metrics['analysis_depth_score']}/100
• **Model Confidence Average:** {metrics['model_confidence_average']:.0%}
• **Technical Optimization:** {metrics['technical_optimization_score']}/100

**✨ MULTI-MODEL ADVANTAGES:**
✅ {caption_model}: State-of-the-art vision-language understanding
✅ DeepFace: Accurate age, gender, emotion detection
✅ MediaPipe: Body pose and gesture analysis
✅ Transformers: Advanced emotion classification
✅ OpenCV: Robust face detection

**🔬 Powered by Pariente AI Research • Ultra Supreme Intelligence Engine**"""

        return analysis_info
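

# Minimal usage sketch, assuming a local "sample.jpg" exists (the path is a
# placeholder) and the analyzer module's models are available in this environment:
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    optimizer = UltraSupremeOptimizer()
    # optimize_image also accepts numpy arrays and file paths, so any of those
    # input types could be passed here instead of a PIL Image.
    prompt, report, score, breakdown = optimizer.generate_ultra_supreme_prompt(
        Image.open("sample.jpg")
    )
    print(f"Ultra Score: {score}/100")
    print(prompt)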