Spaces:
				
			
			
	
			
			
		Running
		
			on 
			
			Zero
	
	
	
			
			
	
	
	
	
		
		
		Running
		
			on 
			
			Zero
	| import os | |
| import re | |
| import json | |
| import logging | |
| import random | |
| import numpy as np | |
| from typing import Dict, List, Tuple, Any, Optional | |
| from scene_type import SCENE_TYPES | |
| from scene_detail_templates import SCENE_DETAIL_TEMPLATES | |
| from object_template_fillers import OBJECT_TEMPLATE_FILLERS | |
| from lighting_conditions import LIGHTING_CONDITIONS | |
| from viewpoint_templates import VIEWPOINT_TEMPLATES | |
| from cultural_templates import CULTURAL_TEMPLATES | |
| from confifence_templates import CONFIDENCE_TEMPLATES | |
| from landmark_data import ALL_LANDMARKS | |
| class EnhancedSceneDescriber: | |
| """ | |
| Enhanced scene description generator with improved template handling, | |
| viewpoint awareness, and cultural context recognition. | |
| Provides detailed natural language descriptions of scenes based on | |
| detection results and scene classification. | |
| """ | |
| def __init__(self, templates_db: Optional[Dict] = None, scene_types: Optional[Dict] = None, spatial_analyzer_instance: Optional[Any] = None): | |
| """ | |
| Initialize the enhanced scene describer. | |
| Args: | |
| templates_db: Optional custom templates database | |
| scene_types: Dictionary of scene type definitions | |
| """ | |
| self.logger = logging.getLogger(self.__class__.__name__) # Use class name for logger | |
| self.logger.setLevel(logging.INFO) # Or your desired logging level | |
| # Optional: Add a handler if not configured globally | |
| if not self.logger.hasHandlers(): | |
| handler = logging.StreamHandler() | |
| formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') | |
| handler.setFormatter(formatter) | |
| self.logger.addHandler(handler) | |
| # Load or use provided scene types | |
| self.scene_types = scene_types or self._load_default_scene_types() | |
| # Load templates database | |
| self.templates = templates_db or self._load_templates() | |
| # Initialize viewpoint detection parameters | |
| self._initialize_viewpoint_parameters() | |
def _load_default_scene_types(self) -> Dict:
    """
    Provide the built-in scene type definitions.

    Returns:
        Dict: The module-level ``SCENE_TYPES`` mapping (the same object,
        not a copy).
    """
    default_scene_types = SCENE_TYPES
    return default_scene_types
def _load_templates(self) -> Dict:
    """
    Assemble the description templates from the imported template modules.

    Returns:
        Dict: Template collections keyed by component name
        (scene details, object fillers, viewpoint, cultural, lighting,
        confidence).
    """
    # Lighting templates are the "general" sentence of every entry under
    # LIGHTING_CONDITIONS["time_descriptions"].
    time_descriptions = LIGHTING_CONDITIONS.get("time_descriptions", {})
    lighting_templates = {}
    for condition, entry in time_descriptions.items():
        lighting_templates[condition] = entry["general"]

    loaded = {
        # Templates prepared in dedicated modules.
        "scene_detail_templates": SCENE_DETAIL_TEMPLATES,
        "object_template_fillers": OBJECT_TEMPLATE_FILLERS,
        "viewpoint_templates": VIEWPOINT_TEMPLATES,
        "cultural_templates": CULTURAL_TEMPLATES,
        "lighting_templates": lighting_templates,
        # Default confidence templates controlling the tone of the output.
        "confidence_templates": {
            "high": "{description} {details}",
            "medium": "This appears to be {description} {details}",
            "low": "This might be {description}, but the confidence is low. {details}"
        },
    }

    # Safety net: fill in any template family that is still missing.
    self._initialize_default_templates(loaded)
    return loaded
| def _initialize_default_templates(self, templates: Dict): | |
| """ | |
| 檢查模板字典並填充任何缺失的默認模板。 | |
| 在將模板移至專門的模組後,此方法主要作為安全機制, | |
| 確保即使導入失敗或某些模板未在外部定義,系統仍能正常運行。 | |
| Args: | |
| templates: 要檢查和更新的模板字典 | |
| """ | |
| # 檢查關鍵模板類型是否存在,如果不存在則添加默認值 | |
| # 置信度模板 - 用於控制描述的語氣 | |
| if "confidence_templates" not in templates: | |
| templates["confidence_templates"] = { | |
| "high": "{description} {details}", | |
| "medium": "This appears to be {description} {details}", | |
| "low": "This might be {description}, but the confidence is low. {details}" | |
| } | |
| # 場景細節模板 | |
| if "scene_detail_templates" not in templates: | |
| templates["scene_detail_templates"] = { | |
| "default": ["A space with various objects."] | |
| } | |
| # 物體填充模板,用於生成物體描述 | |
| if "object_template_fillers" not in templates: | |
| templates["object_template_fillers"] = { | |
| "default": ["various items"] | |
| } | |
| # 視角模板,雖然現在從專門模組導入,但可作為備份 | |
| if "viewpoint_templates" not in templates: | |
| # 使用簡化版的默認視角模板 | |
| templates["viewpoint_templates"] = { | |
| "eye_level": { | |
| "prefix": "From eye level, ", | |
| "observation": "the scene is viewed straight on." | |
| }, | |
| "aerial": { | |
| "prefix": "From above, ", | |
| "observation": "the scene is viewed from a bird's-eye perspective." | |
| } | |
| } | |
| # 文化模板 | |
| if "cultural_templates" not in templates: | |
| templates["cultural_templates"] = { | |
| "asian": { | |
| "elements": ["cultural elements"], | |
| "description": "The scene has Asian characteristics." | |
| }, | |
| "european": { | |
| "elements": ["architectural features"], | |
| "description": "The scene has European characteristics." | |
| } | |
| } | |
| # 照明模板 - 用於描述光照條件 | |
| if "lighting_templates" not in templates: | |
| templates["lighting_templates"] = { | |
| "day_clear": "The scene is captured during daylight.", | |
| "night": "The scene is captured at night.", | |
| "unknown": "The lighting conditions are not easily determined." | |
| } | |
| def _initialize_viewpoint_parameters(self): | |
| """ | |
| Initialize parameters used for viewpoint detection. | |
| """ | |
| self.viewpoint_params = { | |
| # Parameters for detecting aerial views | |
| "aerial_threshold": 0.7, # High object density viewed from top | |
| "aerial_size_variance_threshold": 0.15, # Low size variance in aerial views | |
| # Parameters for detecting low angle views | |
| "low_angle_threshold": 0.3, # Bottom-heavy object distribution | |
| "vertical_size_ratio_threshold": 1.8, # Vertical objects appear taller | |
| # Parameters for detecting elevated views | |
| "elevated_threshold": 0.6, # Objects mostly in middle/bottom | |
| "elevated_top_threshold": 0.3 # Few objects at top of frame | |
| } | |
def _generate_landmark_description(self,
                                   scene_type: str,
                                   detected_objects: List[Dict],
                                   confidence: float,
                                   lighting_info: Optional[Dict] = None,
                                   functional_zones: Optional[Dict] = None,
                                   landmark_objects: Optional[List[Dict]] = None) -> str:
    """
    Generate a scene description that includes landmark information.

    Args:
        scene_type: Identified scene type.
        detected_objects: List of detected object dictionaries.
        confidence: Scene classification confidence (not used in this method).
        lighting_info: Optional lighting information; only the
            ``"time_of_day"`` key is read here.
        functional_zones: Optional functional-zone information, forwarded
            to ``_describe_functional_zones``.
        landmark_objects: Optional list of objects already identified as
            landmarks. When ``None`` they are selected from
            ``detected_objects`` via the ``"is_landmark"`` flag.

    Returns:
        str: Natural-language scene description including landmark details.
    """
    landmark_scene_types = ("tourist_landmark", "natural_landmark", "historical_monument")

    # No explicit landmark list: select landmarks from the detections.
    if landmark_objects is None:
        landmark_objects = [obj for obj in detected_objects if obj.get("is_landmark", False)]

    if landmark_objects:
        # The primary landmark is the one with the highest confidence.
        primary_landmark = max(landmark_objects, key=lambda x: x.get("confidence", 0))
        landmark_name = primary_landmark.get("class_name", "landmark")
        landmark_location = primary_landmark.get("location", "")
        # Only mention the location when one is known; otherwise the
        # sentence would end in a dangling "in ."
        location_clause = f" in {landmark_location}" if landmark_location else ""
        primary_type = primary_landmark.get("landmark_type")

        # Pick the sentence pattern matching the landmark category.
        if scene_type == "natural_landmark" or primary_type == "natural":
            base_description = f"A natural landmark scene featuring {landmark_name}{location_clause}."
        elif scene_type == "historical_monument" or primary_type == "monument":
            base_description = f"A historical monument scene showcasing {landmark_name}, a significant landmark{location_clause}."
        else:
            base_description = f"A tourist landmark scene centered around {landmark_name}, an iconic structure{location_clause}."
    elif scene_type in landmark_scene_types:
        # Landmark scene type, but no concrete landmark object was found.
        base_description = "A scenic area that appears to be a tourist destination, though specific landmarks are not clearly identifiable."
    else:
        # Not a landmark scene at all: fall back to the standard details.
        return self._format_final_description(self._generate_scene_details(
            scene_type,
            detected_objects,
            lighting_info,
            self._detect_viewpoint(detected_objects)
        ))

    # Collect extra facts (year, style, significance) for each landmark.
    landmark_details = []
    for landmark in landmark_objects:
        details = []
        if "year_built" in landmark:
            details.append(f"built in {landmark['year_built']}")
        if "architectural_style" in landmark:
            details.append(f"featuring {landmark['architectural_style']} architectural style")
        if "significance" in landmark:
            details.append(landmark["significance"])
        if details:
            # .get keeps this consistent with the primary-landmark lookup
            # and avoids a KeyError on entries without a class name.
            name = landmark.get("class_name", "landmark")
            landmark_details.append(f"{name} ({', '.join(details)})")

    if landmark_details:
        description = base_description + " " + "The scene features " + ", ".join(landmark_details) + "."
    else:
        description = base_description

    viewpoint = self._detect_viewpoint(detected_objects)

    # People in the scene; class ID 0 is treated as "person".
    people_count = sum(1 for obj in detected_objects if obj.get("class_id") == 0)
    if people_count > 0:
        if people_count == 1:
            people_description = "There is one person in the scene, likely a tourist or visitor."
        elif people_count < 5:
            people_description = f"There are {people_count} people in the scene, possibly tourists visiting the landmark."
        else:
            people_description = f"The scene includes a group of {people_count} people, indicating this is a popular tourist destination."
        description = self._smart_append(description, people_description)

    # Lighting sentence, when a template exists for the time of day.
    if lighting_info and "time_of_day" in lighting_info:
        lighting_type = lighting_info["time_of_day"]
        lighting_templates = self.templates.get("lighting_templates", {})
        if lighting_type in lighting_templates:
            description = self._smart_append(description, lighting_templates[lighting_type])

    # Viewpoint wording for anything other than eye level.
    viewpoint_templates = self.templates.get("viewpoint_templates", {})
    if viewpoint != "eye_level" and viewpoint in viewpoint_templates:
        viewpoint_template = viewpoint_templates[viewpoint]

        prefix = viewpoint_template.get('prefix', '')
        if prefix and not description.startswith(prefix):
            # Lower-case the old sentence start so the prefix reads fluently.
            if description and description[0].isupper():
                description = prefix + description[0].lower() + description[1:]
            else:
                description = prefix + description

        viewpoint_desc = viewpoint_template.get("observation", "").format(
            scene_elements="the landmark and surrounding area"
        )
        if viewpoint_desc and viewpoint_desc not in description:
            description = self._smart_append(description, viewpoint_desc)

    # Functional zones.
    if functional_zones:
        zones_desc = self._describe_functional_zones(functional_zones)
        if zones_desc:
            description = self._smart_append(description, zones_desc)

    # Generic activities chosen by landmark category.
    if scene_type == "natural_landmark" or any(obj.get("landmark_type") == "natural" for obj in landmark_objects):
        landmark_activities = [
            "nature photography",
            "scenic viewing",
            "hiking or walking",
            "guided nature tours",
            "outdoor appreciation"
        ]
    elif scene_type == "historical_monument" or any(obj.get("landmark_type") == "monument" for obj in landmark_objects):
        landmark_activities = [
            "historical sightseeing",
            "educational tours",
            "cultural appreciation",
            "photography of historical architecture",
            "learning about historical significance"
        ]
    else:
        landmark_activities = [
            "sightseeing",
            "taking photographs",
            "guided tours",
            "cultural tourism",
            "souvenir shopping"
        ]

    # Only the first three activities are mentioned.
    activities_text = "Common activities at this location include " + ", ".join(landmark_activities[:3]) + "."
    description = self._smart_append(description, activities_text)

    return self._format_final_description(description)
def filter_landmark_references(self, text, enable_landmark=True, landmarks=None):
    """
    Remove landmark names and locations from ``text`` when landmarks are disabled.

    Args:
        text: Text to filter.
        enable_landmark: When True (or when ``text`` is empty) the text is
            returned unchanged.
        landmarks: Optional landmark table to filter against, a mapping of
            landmark id to a dict with ``"name"``, optional ``"aliases"``
            and optional ``"location"``. Defaults to ``ALL_LANDMARKS``.

    Returns:
        str: The filtered text.
    """
    if enable_landmark or not text:
        return text

    if landmarks is None:
        landmarks = ALL_LANDMARKS

    # Gather every landmark name/alias and every location string. A
    # location is also split on commas so the first two parts (e.g. city
    # and country) are matched on their own.
    landmark_names = set()
    locations = set()
    for info in landmarks.values():
        name = info.get("name")
        if name:
            landmark_names.add(name)
        landmark_names.update(alias for alias in info.get("aliases", []) if alias)

        location = info.get("location")
        if location:
            locations.add(location)
            for part in location.split(",")[:2]:
                locations.add(part.strip())

    def replace_terms(source, terms, replacement):
        # Longest terms first, so a short alias cannot break up a longer
        # name that contains it. Terms of 1-2 characters are skipped as
        # too ambiguous. Lookarounds are used instead of \b so terms that
        # end in punctuation still match, while "Paris" never matches
        # inside "Parisian".
        for term in sorted(terms, key=lambda t: (-len(t), t)):
            if len(term) > 2:
                pattern = r'(?<!\w)' + re.escape(term) + r'(?!\w)'
                source = re.sub(pattern, replacement, source, flags=re.IGNORECASE)
        return source

    text = replace_terms(text, landmark_names, "tall structure")
    # "in <location>" / "of <location>" keep their preposition because
    # only the location itself is substituted.
    text = replace_terms(text, locations, "the urban area")

    # Generic landmark phrasing.
    # NOTE(review): these run with IGNORECASE, so the [A-Z] classes also
    # match lower-case letters - confirm that breadth is intended.
    landmark_patterns = [
        (r'a (tourist|popular|famous) landmark', r'an urban structure'),
        (r'an iconic structure in ([A-Z][a-zA-Z\s,]+)', r'an urban structure in the area'),
        (r'a famous (monument|tower|landmark) in ([A-Z][a-zA-Z\s,]+)', r'an urban structure in the area'),
        (r'(centered|built|located|positioned) around the ([A-Z][a-zA-Z\s]+? (Tower|Monument|Landmark))', r'located in this area'),
        (r'(sightseeing|guided tours|cultural tourism) (at|around|near) (this landmark|the [A-Z][a-zA-Z\s]+)', r'\1 in this area'),
        (r'this (famous|iconic|historic|well-known) (landmark|monument|tower|structure)', r'this urban structure'),
        (r'([A-Z][a-zA-Z\s]+) Tower', r'tall structure'),
        (r'a (tower|structure) in ([A-Z][a-zA-Z\s,]+)', r'a \1 in the area'),
        (r'landmark scene', r'urban scene'),
        (r'tourist destination', r'urban area'),
        (r'tourist attraction', r'urban area')
    ]
    for pattern, replacement in landmark_patterns:
        text = re.sub(pattern, replacement, text, flags=re.IGNORECASE)

    return text
def generate_description(self, scene_type: str, detected_objects: List[Dict], confidence: float,
                         lighting_info: Dict, functional_zones: List[str], enable_landmark: bool = True,
                         scene_scores: Optional[Dict] = None, spatial_analysis: Optional[Dict] = None,
                         image_dimensions: Optional[Dict] = None, places365_info: Optional[Dict] = None,
                         object_statistics: Optional[Dict] = None) -> str:
    """
    Generate an enhanced scene description from detection results, scene
    type and additional contextual information.

    Args:
        scene_type: Identified scene type; ``"unknown"`` triggers the
            generic description.
        detected_objects: List of detected object dictionaries.
        confidence: Scene classification confidence; values below 0.4
            trigger the generic description.
        lighting_info: Lighting information; only ``"time_of_day"`` is
            read here.
        functional_zones: Functional-zone information, forwarded to
            ``_describe_functional_zones``.
        enable_landmark: When False, landmark objects are dropped from the
            detections and landmark references are filtered from the text.
        scene_scores: Accepted by the signature; not used in this method.
        spatial_analysis: Forwarded to ``_generate_scene_details``.
        image_dimensions: Forwarded to ``_generate_scene_details``.
        places365_info: Optional Places365 result; logged and forwarded
            to ``_generate_scene_details``.
        object_statistics: Forwarded to ``_generate_scene_details``.

    Returns:
        str: The formatted natural-language description.
    """
    # Unknown scene type or very low confidence: early exit with a generic text.
    if scene_type == "unknown" or confidence < 0.4:
        generic_desc = self._generate_generic_description(detected_objects, lighting_info)
        return self._format_final_description(generic_desc)

    # Drop landmark objects when landmark detection is disabled for this run.
    current_detected_objects = detected_objects
    if not enable_landmark:
        current_detected_objects = [obj for obj in detected_objects if not obj.get("is_landmark", False)]

    # Log the Places365 context when it is confident enough. This is
    # diagnostics only, so it goes through the class logger, not stdout.
    if places365_info and places365_info.get('confidence', 0) > 0.3:
        places365_context = ""
        scene_label = places365_info.get('scene_label', '')
        attributes = places365_info.get('attributes', [])
        is_indoor = places365_info.get('is_indoor', None)

        if scene_label:
            places365_context = f"Scene context: {scene_label}"
            if attributes:
                places365_context += f" with characteristics: {', '.join(attributes[:3])}"
            if is_indoor is not None:
                indoor_outdoor = "indoor" if is_indoor else "outdoor"
                places365_context += f" ({indoor_outdoor} environment)"

        if places365_context:
            self.logger.info("Enhanced description incorporating Places365 context: %s", places365_context)

    landmark_objects_in_scene = [obj for obj in current_detected_objects if obj.get("is_landmark", False)]
    has_landmark_in_scene = len(landmark_objects_in_scene) > 0

    # Landmark path: enabled, and either a landmark scene type or a detected landmark.
    if enable_landmark and (scene_type in ["tourist_landmark", "natural_landmark", "historical_monument"] or has_landmark_in_scene):
        landmark_desc = self._generate_landmark_description(
            scene_type,
            current_detected_objects,
            confidence,
            lighting_info,
            functional_zones,
            landmark_objects_in_scene
        )
        return self._format_final_description(landmark_desc)

    # ---- Main path: non-landmark scenes, or landmark handling disabled ----

    viewpoint = self._detect_viewpoint(current_detected_objects)
    current_scene_type = scene_type

    # An aerial viewpoint remaps the scene type to an aerial variant.
    if viewpoint == "aerial":
        scene_type_lower = current_scene_type.lower()
        if "intersection" in scene_type_lower or self._is_intersection(current_detected_objects):
            current_scene_type = "aerial_view_intersection"
        elif any(keyword in scene_type_lower for keyword in ["commercial", "shopping", "retail"]):
            current_scene_type = "aerial_view_commercial_area"
        elif any(keyword in scene_type_lower for keyword in ["plaza", "square"]):
            current_scene_type = "aerial_view_plaza"
        else:
            current_scene_type = "aerial_view_general"

    # Cultural context is only detected for non-aerial viewpoints.
    cultural_context = None
    if viewpoint != "aerial":
        cultural_context = self._detect_cultural_context(current_scene_type, current_detected_objects)

    # Base description for the (possibly remapped) scene type.
    base_description = "A scene"
    if viewpoint == "aerial":
        if current_scene_type in self.scene_types:
            base_description = self.scene_types[current_scene_type].get("description", "An aerial view showing the layout and movement patterns from above")
        else:
            base_description = "An aerial view showing the layout and movement patterns from above"
    elif current_scene_type in self.scene_types:
        base_description = self.scene_types[current_scene_type].get("description", "A scene")

    # Detailed, dynamic or template-filled part of the description.
    core_scene_details = self._generate_scene_details(
        current_scene_type,
        current_detected_objects,
        lighting_info,
        viewpoint,
        spatial_analysis=spatial_analysis,
        image_dimensions=image_dimensions,
        places365_info=places365_info,
        object_statistics=object_statistics
    )

    description = base_description
    if core_scene_details and core_scene_details.strip() != "":
        # A bare "A scene" base is replaced by the richer details;
        # otherwise the details are appended to the base.
        if base_description.lower() == "a scene" and len(core_scene_details) > len(base_description):
            description = core_scene_details
        else:
            description = self._smart_append(description, core_scene_details)
    elif not core_scene_details and not description:
        description = self._generate_generic_description(current_detected_objects, lighting_info)

    # Secondary description from the scene type definition, if any.
    if current_scene_type in self.scene_types and "secondary_description" in self.scene_types[current_scene_type]:
        secondary_desc = self.scene_types[current_scene_type]["secondary_description"]
        if secondary_desc:
            description = self._smart_append(description, secondary_desc)

    # People count (class ID 0), unless the text already mentions people.
    people_objs = [obj for obj in current_detected_objects if obj.get("class_id") == 0]
    if people_objs:
        people_count = len(people_objs)
        if people_count == 1:
            people_phrase = "a single person"
        elif people_count <= 3:
            people_phrase = f"{people_count} people"
        elif people_count <= 7:
            people_phrase = "several people"
        else:
            people_phrase = "multiple people"

        description_lower = description.lower()
        if "person" not in description_lower and "people" not in description_lower and "pedestrian" not in description_lower:
            description = self._smart_append(description, f"The scene includes {people_phrase}.")

    # Cultural context.
    if cultural_context and viewpoint != "aerial":
        cultural_elements = self._generate_cultural_elements(cultural_context)
        if cultural_elements:
            description = self._smart_append(description, cultural_elements)

    # Lighting sentence, skipped when already contained in the text.
    lighting_description_text = ""
    if lighting_info and "time_of_day" in lighting_info:
        lighting_type = lighting_info["time_of_day"]
        lighting_desc_template = self.templates.get("lighting_templates", {}).get(lighting_type)
        if lighting_desc_template:
            lighting_description_text = lighting_desc_template
    if lighting_description_text and lighting_description_text.lower() not in description.lower():
        description = self._smart_append(description, lighting_description_text)

    # Viewpoint sentence for anything other than eye level.
    if viewpoint != "eye_level" and viewpoint in self.templates.get("viewpoint_templates", {}):
        viewpoint_template = self.templates["viewpoint_templates"][viewpoint]
        prefix = viewpoint_template.get('prefix', '')
        observation_template = viewpoint_template.get("observation", "")

        scene_elements_for_vp = "the overall layout and objects"
        if viewpoint == "aerial":
            scene_elements_for_vp = "crossing patterns and general layout"
        viewpoint_observation_text = observation_template.format(scene_elements=scene_elements_for_vp)

        # Combine prefix and observation. A prefix with no observation is
        # dropped, so no dangling "From above," fragment is appended.
        full_viewpoint_text = ""
        if prefix and viewpoint_observation_text:
            if viewpoint_observation_text[0].islower():
                full_viewpoint_text = prefix.strip() + " " + viewpoint_observation_text
            elif description:
                full_viewpoint_text = prefix + viewpoint_observation_text[0].lower() + viewpoint_observation_text[1:]
            else:
                full_viewpoint_text = prefix + viewpoint_observation_text
        elif viewpoint_observation_text:
            full_viewpoint_text = viewpoint_observation_text[0].upper() + viewpoint_observation_text[1:]

        if full_viewpoint_text and full_viewpoint_text.lower() not in description.lower():
            description = self._smart_append(description, full_viewpoint_text)

    # Functional zones.
    if functional_zones:
        zones_desc_text = self._describe_functional_zones(functional_zones)
        if zones_desc_text:
            description = self._smart_append(description, zones_desc_text)

    final_formatted_description = self._format_final_description(description)

    if not enable_landmark:
        final_formatted_description = self.filter_landmark_references(final_formatted_description, enable_landmark=False)

    # If nothing meaningful is left, fall back to the generic description.
    if not final_formatted_description.strip() or final_formatted_description.strip() == ".":
        self.logger.warning(f"Description for scene_type '{current_scene_type}' became empty after processing. Falling back.")
        final_formatted_description = self._format_final_description(
            self._generate_generic_description(current_detected_objects, lighting_info)
        )

    return final_formatted_description
| def _smart_append(self, current_text: str, new_fragment: str) -> str: | |
| """ | |
| Intelligently append a new text fragment to the current text, | |
| handling punctuation and capitalization correctly. | |
| Args: | |
| current_text: The existing text to append to | |
| new_fragment: The new text fragment to append | |
| Returns: | |
| str: The combined text with proper formatting | |
| """ | |
| # Handle empty cases | |
| if not new_fragment: | |
| return current_text | |
| if not current_text: | |
| # Ensure first character is uppercase for the first fragment | |
| return new_fragment[0].upper() + new_fragment[1:] if new_fragment else "" | |
| # Clean up existing text | |
| current_text = current_text.rstrip() | |
| # Check for ending punctuation | |
| ends_with_sentence = current_text.endswith(('.', '!', '?')) | |
| ends_with_comma = current_text.endswith(',') | |
| # Specifically handle the "A xxx A yyy" pattern that's causing issues | |
| if (current_text.startswith("A ") or current_text.startswith("An ")) and \ | |
| (new_fragment.startswith("A ") or new_fragment.startswith("An ")): | |
| return current_text + ". " + new_fragment | |
| # 檢查新片段是否包含地標名稱(通常為專有名詞) | |
| has_landmark_name = any(word[0].isupper() for word in new_fragment.split() | |
| if len(word) > 2 and not word.startswith(("A ", "An ", "The "))) | |
| # Decide how to join the texts | |
| if ends_with_sentence: | |
| # After a sentence, start with uppercase and add proper spacing | |
| joined_text = current_text + " " + (new_fragment[0].upper() + new_fragment[1:]) | |
| elif ends_with_comma: | |
| # After a comma, maintain flow with lowercase unless it's a proper noun or special case | |
| if new_fragment.startswith(('I ', 'I\'', 'A ', 'An ', 'The ')) or new_fragment[0].isupper() or has_landmark_name: | |
| joined_text = current_text + " " + new_fragment | |
| else: | |
| joined_text = current_text + " " + new_fragment[0].lower() + new_fragment[1:] | |
| elif "scene is" in new_fragment.lower() or "scene includes" in new_fragment.lower(): | |
| # When adding a new sentence about the scene, use a period | |
| joined_text = current_text + ". " + new_fragment | |
| else: | |
| # For other cases, decide based on the content | |
| if self._is_related_phrases(current_text, new_fragment): | |
| if new_fragment.startswith(('I ', 'I\'', 'A ', 'An ', 'The ')) or new_fragment[0].isupper() or has_landmark_name: | |
| joined_text = current_text + ", " + new_fragment | |
| else: | |
| joined_text = current_text + ", " + new_fragment[0].lower() + new_fragment[1:] | |
| else: | |
| # Use period for unrelated phrases | |
| joined_text = current_text + ". " + (new_fragment[0].upper() + new_fragment[1:]) | |
| return joined_text | |
| def _is_related_phrases(self, text1: str, text2: str) -> bool: | |
| """ | |
| Determine if two phrases are related and should be connected with a comma | |
| rather than separated with a period. | |
| Args: | |
| text1: The first text fragment | |
| text2: The second text fragment to be appended | |
| Returns: | |
| bool: Whether the phrases appear to be related | |
| """ | |
| # Check if either phrase starts with "A" or "An" - these are likely separate descriptions | |
| if (text1.startswith("A ") or text1.startswith("An ")) and \ | |
| (text2.startswith("A ") or text2.startswith("An ")): | |
| return False # These are separate descriptions, not related phrases | |
| # Check if the second phrase starts with a connecting word | |
| connecting_words = ["which", "where", "who", "whom", "whose", "with", "without", | |
| "this", "these", "that", "those", "and", "or", "but"] | |
| first_word = text2.split()[0].lower() if text2 else "" | |
| if first_word in connecting_words: | |
| return True | |
| # Check if the first phrase ends with something that suggests continuity | |
| ending_patterns = ["such as", "including", "like", "especially", "particularly", | |
| "for example", "for instance", "namely", "specifically"] | |
| for pattern in ending_patterns: | |
| if text1.lower().endswith(pattern): | |
| return True | |
| # Check if both phrases are about the scene | |
| if "scene" in text1.lower() and "scene" in text2.lower(): | |
| return False # Separate statements about the scene should be separate sentences | |
| return False | |
| def _format_final_description(self, text: str) -> str: | |
| """ | |
| Format the final description text to ensure correct punctuation, | |
| capitalization, and spacing. | |
| """ | |
| if not text or not text.strip(): # Also check if text is just whitespace | |
| return "" | |
| # Trim leading/trailing whitespace first | |
| text = text.strip() | |
| # 1. Handle consecutive "A/An" segments (potentially split them into sentences) | |
| text = re.sub(r'(A\s+[^.!?]+?[\w\.])\s+(A\s+)', r'\1. \2', text, flags=re.IGNORECASE) | |
| text = re.sub(r'(An\s+[^.!?]+?[\w\.])\s+(An?\s+)', r'\1. \2', text, flags=re.IGNORECASE) | |
| # 2. Ensure first character of the entire text is uppercase | |
| if text: | |
| text = text[0].upper() + text[1:] | |
| # 3. Normalize whitespace: multiple spaces to one | |
| text = re.sub(r'\s{2,}', ' ', text) | |
| # 4. Capitalize after sentence-ending punctuation (. ! ?) | |
| def capitalize_after_punctuation(match): | |
| return match.group(1) + match.group(2).upper() | |
| text = re.sub(r'([.!?]\s+)([a-z])', capitalize_after_punctuation, text) | |
| # 5. Handle capitalization after commas (your existing robust logic is good) | |
| def fix_capitalization_after_comma(match): | |
| leading_comma_space = match.group(1) # (,\s+) | |
| word_after_comma = match.group(2) # ([A-Z][a-zA-Z]*) | |
| proper_nouns_exceptions = ["I", "I'm", "I've", "I'd", "I'll", | |
| "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday", | |
| "January", "February", "March", "April", "May", "June", "July", | |
| "August", "September", "October", "November", "December"] | |
| if word_after_comma in proper_nouns_exceptions: | |
| return match.group(0) | |
| # If the word looks like a proper noun (e.g., multi-word capitalized, or a known location/brand) | |
| # This heuristic can be tricky. For simplicity, if it's already capitalized and not a common word, keep it. | |
| if len(word_after_comma) > 2 and word_after_comma[0].isupper() and word_after_comma.lower() not in ["this", "that", "these", "those", "they", "their", "then", "thus"]: | |
| return match.group(0) # Keep it if it looks like a proper noun already | |
| return leading_comma_space + word_after_comma[0].lower() + word_after_comma[1:] | |
| text = re.sub(r'(,\s+)([A-Z][a-zA-Z\'\-]+)', fix_capitalization_after_comma, text) # Added hyphen and apostrophe to word | |
| # 6. Correct spacing around punctuation | |
| text = re.sub(r'\s*([.,;:!?])\s*', r'\1 ', text) # Ensures one space AFTER punctuation, none before | |
| text = text.replace(' .', '.').replace(' ,', ',') # Clean up potential space before period/comma from previous rule | |
| # 7. Consolidate multiple sentence-ending punctuations (e.g., "!!", "?.", ".?") | |
| text = re.sub(r'[.!?]{2,}', '.', text) # Convert multiple to a single period | |
| text = re.sub(r',+', ',', text) # Multiple commas to one | |
| # 8. Ensure text ends with a single sentence-ending punctuation mark | |
| text = text.strip() # Remove trailing whitespace before checking last char | |
| if text and not text[-1] in '.!?': | |
| text += '.' | |
| # 9. Remove any leading punctuation or extra spaces that might have been introduced | |
| text = re.sub(r'^[.,;:!?\s]+', '', text) | |
| # 10. Final check for first letter capitalization | |
| if text: | |
| text = text[0].upper() + text[1:] | |
| # 11. Remove space before final punctuation mark if accidentally added by rule 7 | |
| text = re.sub(r'\s+([.!?])$', r'\1', text) | |
| return text.strip() # Final strip | |
| def _is_intersection(self, detected_objects: List[Dict]) -> bool: | |
| """ | |
| 通過分析物體分佈來判斷場景是否為十字路口 | |
| """ | |
| # 檢查行人分佈模式 | |
| pedestrians = [obj for obj in detected_objects if obj["class_id"] == 0] | |
| if len(pedestrians) >= 8: # 需要足夠的行人來形成十字路口 | |
| # 抓取行人位置 | |
| positions = [obj.get("normalized_center", (0, 0)) for obj in pedestrians] | |
| # 分析 x 和 y 坐標分佈 | |
| x_coords = [pos[0] for pos in positions] | |
| y_coords = [pos[1] for pos in positions] | |
| # 計算 x 和 y 坐標的變異數 | |
| x_variance = np.var(x_coords) if len(x_coords) > 1 else 0 | |
| y_variance = np.var(y_coords) if len(y_coords) > 1 else 0 | |
| # 計算範圍 | |
| x_range = max(x_coords) - min(x_coords) | |
| y_range = max(y_coords) - min(y_coords) | |
| # 如果 x 和 y 方向都有較大範圍且範圍相似,那就有可能是十字路口 | |
| if x_range > 0.5 and y_range > 0.5 and 0.7 < (x_range / y_range) < 1.3: | |
| return True | |
| return False | |
| def _generate_generic_description(self, detected_objects: List[Dict], lighting_info: Optional[Dict] = None) -> str: | |
| """ | |
| Generate a generic description when scene type is unknown or confidence is very low. | |
| Args: | |
| detected_objects: List of detected objects | |
| lighting_info: Optional lighting condition information | |
| Returns: | |
| str: Generic description based on detected objects | |
| """ | |
| # Count object occurrences | |
| obj_counts = {} | |
| for obj in detected_objects: | |
| class_name = obj["class_name"] | |
| if class_name not in obj_counts: | |
| obj_counts[class_name] = 0 | |
| obj_counts[class_name] += 1 | |
| # Get top objects by count | |
| top_objects = sorted(obj_counts.items(), key=lambda x: x[1], reverse=True)[:5] | |
| if not top_objects: | |
| base_desc = "No clearly identifiable objects are visible in this scene." | |
| else: | |
| # Format object list | |
| objects_text = [] | |
| for name, count in top_objects: | |
| if count > 1: | |
| objects_text.append(f"{count} {name}s") | |
| else: | |
| objects_text.append(name) | |
| if len(objects_text) == 1: | |
| objects_list = objects_text[0] | |
| elif len(objects_text) == 2: | |
| objects_list = f"{objects_text[0]} and {objects_text[1]}" | |
| else: | |
| objects_list = ", ".join(objects_text[:-1]) + f", and {objects_text[-1]}" | |
| base_desc = f"This scene contains {objects_list}." | |
| # Add lighting information if available | |
| if lighting_info and "time_of_day" in lighting_info: | |
| lighting_type = lighting_info["time_of_day"] | |
| if lighting_type in self.templates.get("lighting_templates", {}): | |
| lighting_desc = self.templates["lighting_templates"][lighting_type] | |
| base_desc += f" {lighting_desc}" | |
| return base_desc | |
| def _get_prominent_objects(self, detected_objects: List[Dict], min_prominence_score: float = 0.1, max_categories_to_return: int = 5, max_total_objects: int = 7) -> List[Dict]: | |
| """ | |
| Helper function to get the most prominent objects. | |
| Prioritizes high-confidence, large objects, and ensures a diversity of object types. | |
| Args: | |
| detected_objects: List of detected objects. | |
| min_prominence_score: Minimum score for an object to be considered initially. | |
| max_categories_to_return: Max number of different object categories to prioritize. | |
| max_total_objects: Overall cap on the number of prominent objects returned. | |
| Returns: | |
| List of prominent detected objects. | |
| """ | |
| if not detected_objects: | |
| return [] | |
| scored_objects = [] | |
| for obj in detected_objects: | |
| area = obj.get("normalized_area", 0.0) + 1e-6 | |
| confidence = obj.get("confidence", 0.0) | |
| # Base score: area and confidence are key | |
| score = (area * 0.65) + (confidence * 0.35) # Slightly more weight to area | |
| # Bonus for generally important object classes (in a generic way) | |
| # This is a simple heuristic. More advanced would be context-dependent. | |
| # For example, 'person' is often more salient. | |
| # Avoid hardcoding specific class_ids here if possible, or use broad categories if available. | |
| # For simplicity, we'll keep the landmark bonus for now. | |
| if obj.get("class_name") == "person": # Example: person is generally prominent | |
| score += 0.1 | |
| if obj.get("is_landmark"): # Landmarks are always prominent | |
| score += 0.5 | |
| if score >= min_prominence_score: | |
| scored_objects.append((obj, score)) | |
| if not scored_objects: | |
| return [] | |
| # Sort by score in descending order | |
| scored_objects.sort(key=lambda x: x[1], reverse=True) | |
| # Prioritize diversity of object categories first | |
| prominent_by_category = {} | |
| final_prominent_objects = [] | |
| for obj, score in scored_objects: | |
| category = obj.get("class_name", "unknown") | |
| if category not in prominent_by_category: | |
| if len(prominent_by_category) < max_categories_to_return: | |
| prominent_by_category[category] = obj | |
| final_prominent_objects.append(obj) | |
| elif len(final_prominent_objects) < max_total_objects and obj not in final_prominent_objects: | |
| if score > 0.3: | |
| final_prominent_objects.append(obj) | |
| # If still under max_total_objects, fill with highest scored remaining objects regardless of category | |
| if len(final_prominent_objects) < max_total_objects: | |
| for obj, score in scored_objects: | |
| if len(final_prominent_objects) >= max_total_objects: | |
| break | |
| if obj not in final_prominent_objects: | |
| final_prominent_objects.append(obj) | |
| # Re-sort the final list by original prominence score to maintain order | |
| final_prominent_objects_with_scores = [] | |
| for obj in final_prominent_objects: | |
| for original_obj, original_score in scored_objects: | |
| if obj is original_obj: # Check for object identity | |
| final_prominent_objects_with_scores.append((obj, original_score)) | |
| break | |
| final_prominent_objects_with_scores.sort(key=lambda x: x[1], reverse=True) | |
| return [obj for obj, score in final_prominent_objects_with_scores[:max_total_objects]] | |
| def _format_object_list_for_description(self, | |
| objects: List[Dict], | |
| use_indefinite_article_for_one: bool = False, | |
| count_threshold_for_generalization: int = -1, # Default to -1 for precise counts | |
| max_types_to_list: int = 5 | |
| ) -> str: | |
| """ | |
| Formats a list of detected objects into a human-readable string with counts. | |
| Args: | |
| objects: List of object dictionaries, each expected to have 'class_name'. | |
| use_indefinite_article_for_one: If True, uses "a/an" for single items. If False, uses "one". | |
| count_threshold_for_generalization: If count exceeds this, use general terms. -1 means precise counts. | |
| max_types_to_list: Maximum number of different object types to include in the list. | |
| """ | |
| if not objects: | |
| return "no specific objects clearly identified" | |
| counts: Dict[str, int] = {} | |
| for obj in objects: | |
| name = obj.get("class_name", "unknown object") | |
| if name == "unknown object" or not name: # Skip unknown or empty names | |
| continue | |
| counts[name] = counts.get(name, 0) + 1 | |
| if not counts: | |
| return "no specific objects clearly identified" | |
| descriptions = [] | |
| # Sort by count (desc) then name (asc) for consistent output order | |
| # Limit the number of distinct object types being listed | |
| sorted_counts = sorted(counts.items(), key=lambda item: (-item[1], item[0]))[:max_types_to_list] | |
| for name, count in sorted_counts: | |
| if count == 1: | |
| if use_indefinite_article_for_one: | |
| if name[0].lower() in 'aeiou': | |
| descriptions.append(f"an {name}") | |
| else: | |
| descriptions.append(f"a {name}") | |
| else: | |
| descriptions.append(f"one {name}") # Output "one car" instead of "a car" | |
| else: # count > 1 | |
| plural_name = name | |
| if name.endswith("y") and not name.lower().endswith(("ay", "ey", "iy", "oy", "uy")): | |
| plural_name = name[:-1] + "ies" | |
| elif name.endswith(("s", "sh", "ch", "x", "z")): | |
| plural_name = name + "es" | |
| elif not name.endswith("s"): # Avoid double 's' like "buss" | |
| plural_name = name + "s" | |
| if count_threshold_for_generalization != -1 and count > count_threshold_for_generalization: | |
| if count <= count_threshold_for_generalization + 3: | |
| descriptions.append(f"several {plural_name}") | |
| else: | |
| descriptions.append(f"many {plural_name}") | |
| else: # Use exact count (e.g., "6 cars") | |
| descriptions.append(f"{count} {plural_name}") | |
| if not descriptions: | |
| return "no specific objects clearly identified" | |
| if len(descriptions) == 1: | |
| return descriptions[0] | |
| elif len(descriptions) == 2: | |
| return f"{descriptions[0]} and {descriptions[1]}" | |
| else: | |
| # Oxford comma for lists of 3 or more. | |
| return ", ".join(descriptions[:-1]) + f", and {descriptions[-1]}" | |
| def _get_spatial_description(self, obj: Dict, image_width: Optional[int] = None, image_height: Optional[int] = None) -> str: | |
| """ | |
| Generates a brief spatial description for an object. | |
| (This is a new helper function) | |
| """ | |
| region = obj.get("region") | |
| if region: | |
| # Convert region name to more descriptive terms | |
| region_map = { | |
| "top_left": "in the top-left", "top_center": "at the top-center", "top_right": "in the top-right", | |
| "middle_left": "on the middle-left side", "middle_center": "in the center", "middle_right": "on the middle-right side", | |
| "bottom_left": "in the bottom-left", "bottom_center": "at the bottom-center", "bottom_right": "in the bottom-right" | |
| } | |
| # More general terms if exact region is not critical | |
| if "top" in region: general_v_pos = "towards the top" | |
| elif "bottom" in region: general_v_pos = "towards the bottom" | |
| else: general_v_pos = "in the middle vertically" | |
| if "left" in region: general_h_pos = "towards the left" | |
| elif "right" in region: general_h_pos = "towards the right" | |
| else: general_h_pos = "in the center horizontally" | |
| # Prioritize specific region if available, else use general | |
| specific_desc = region_map.get(region, "") | |
| if specific_desc: | |
| return f"{specific_desc} of the frame" | |
| else: | |
| return f"{general_v_pos} and {general_h_pos} of the frame" | |
| # Fallback if region info is not detailed enough or missing | |
| # We can use normalized_center if available | |
| norm_center = obj.get("normalized_center") | |
| if norm_center and image_width and image_height: # Check if image_width/height are provided | |
| x_norm, y_norm = norm_center | |
| h_pos = "left" if x_norm < 0.4 else "right" if x_norm > 0.6 else "center" | |
| v_pos = "top" if y_norm < 0.4 else "bottom" if y_norm > 0.6 else "middle" | |
| if h_pos == "center" and v_pos == "middle": | |
| return "near the center of the image" | |
| return f"in the {v_pos}-{h_pos} area of the image" | |
| return "in the scene" # Generic fallback | |
def _generate_dynamic_everyday_description(self,
                                           detected_objects: List[Dict],
                                           lighting_info: Optional[Dict] = None,
                                           viewpoint: str = "eye_level",
                                           spatial_analysis: Optional[Dict] = None,
                                           image_dimensions: Optional[Tuple[int, int]] = None,
                                           places365_info: Optional[Dict] = None,
                                           object_statistics: Optional[Dict] = None
                                           ) -> str:
    """
    Dynamically generates a description for everyday scenes based on ALL relevant detected_objects,
    their counts, and context.

    The text is assembled in this order:
      1. An ambiance sentence built from ``lighting_info`` and, for viewpoints
         other than "eye_level", the matching viewpoint template.
      2. One clause per object class ("Three chairs are primarily in the ... area"),
         ordered by a priority heuristic, after confidence filtering and
         position-based de-duplication.
      3. The joined text is passed through ``self._format_final_description``.

    Args:
        detected_objects: List of detected objects.
        lighting_info: Optional lighting information (``time_of_day``, ``is_indoor``).
        viewpoint: Detected viewpoint key.
        spatial_analysis: Accepted for interface compatibility; not read here.
        image_dimensions: Optional (image_width, image_height), forwarded to
            ``self._get_spatial_description``.
        places365_info: Accepted for interface compatibility; not read here.
        object_statistics: Optional mapping ``class_name -> {"count", "avg_confidence"}``.
            When given, it decides which classes are described and the counts
            that are quoted.

    Returns:
        str: The formatted description, or a generic fallback sentence when
        nothing meaningful could be built.
    """
    description_segments = []
    # Initialised up front so the final fallback can inspect it on every code
    # path without probing locals().
    confident_objects: List[Dict] = []
    image_width, image_height = image_dimensions if image_dimensions else (None, None)

    if hasattr(self, 'logger'):
        self.logger.info(f"DynamicDesc: Start. Total Raw Objects: {len(detected_objects)}, View: {viewpoint}, Light: {lighting_info is not None}")

    # 1. Overall Ambiance (Lighting and Viewpoint)
    ambiance_parts = []
    if lighting_info:
        time_of_day = lighting_info.get("time_of_day", "unknown lighting")
        is_indoor = lighting_info.get("is_indoor")
        ambiance_statement = "This is"
        if is_indoor is True:
            ambiance_statement += " an indoor scene"
        elif is_indoor is False:
            ambiance_statement += " an outdoor scene"
        else:
            ambiance_statement += " a scene"
        lighting_map = self.templates.get("lighting_templates", {})
        readable_lighting_base = lighting_map.get(time_of_day, f"with {time_of_day.replace('_', ' ')} lighting conditions")
        # Strip the template's own sentence opener so it reads naturally after "likely".
        readable_lighting = readable_lighting_base.lower().replace("the scene is captured", "").replace("the scene has", "").strip()
        ambiance_statement += f", likely {readable_lighting}."
        ambiance_parts.append(ambiance_statement)

    if viewpoint and viewpoint != "eye_level":
        vp_templates = self.templates.get("viewpoint_templates", {})
        if viewpoint in vp_templates:
            vp_prefix = vp_templates[viewpoint].get("prefix", "").strip()
            if vp_prefix:
                if not ambiance_parts:
                    ambiance_parts.append(f"{vp_prefix.capitalize()} the general layout of the scene is observed.")
                else:
                    ambiance_parts[-1] = ambiance_parts[-1].rstrip('.') + f", viewed {vp_templates[viewpoint].get('short_desc', viewpoint)}."

    if ambiance_parts:
        description_segments.append(" ".join(ambiance_parts))

    # 2. Describe ALL detected objects, grouped by class, with accurate counts and locations
    if not detected_objects:
        if not description_segments:
            description_segments.append("A general scene is visible, but no specific objects were clearly identified.")
        else:
            description_segments.append("Within this setting, no specific objects were clearly identified.")
    else:
        objects_by_class: Dict[str, List[Dict]] = {}
        # 0.25 is a placeholder default unless the instance overrides it.
        confidence_filter_threshold = getattr(self, 'confidence_threshold_for_description', 0.25)
        confident_objects = [obj for obj in detected_objects if obj.get("confidence", 0) >= confidence_filter_threshold]

        if not confident_objects:
            # Objects existed, but none met the confidence threshold.
            description_segments.append("While some elements might be present, no objects were identified with sufficient confidence for a detailed description.")
        else:
            if object_statistics:
                # Use the precomputed statistics with a per-class dynamic confidence threshold.
                for class_name, stats in object_statistics.items():
                    count = stats.get("count", 0)
                    avg_confidence = stats.get("avg_confidence", 0)

                    # Decorative items get a lower threshold; so do classes that occur often.
                    dynamic_threshold = confidence_filter_threshold
                    if class_name in ["potted plant", "vase", "clock", "book"]:
                        dynamic_threshold = max(0.15, confidence_filter_threshold * 0.6)
                    elif count >= 3:
                        dynamic_threshold = max(0.2, confidence_filter_threshold * 0.8)

                    if count > 0 and avg_confidence >= dynamic_threshold:
                        matching_objects = [obj for obj in confident_objects if obj.get("class_name") == class_name]
                        if not matching_objects:
                            # Nothing among the confident objects: look in the raw
                            # list using the relaxed threshold.
                            matching_objects = [obj for obj in detected_objects
                                                if obj.get("class_name") == class_name and obj.get("confidence", 0) >= dynamic_threshold]
                        if matching_objects:
                            actual_count = min(stats["count"], len(matching_objects))
                            objects_by_class[class_name] = matching_objects[:actual_count]
            else:
                # No statistics: group the confident objects by class name.
                for obj in confident_objects:
                    name = obj.get("class_name", "unknown object")
                    if name == "unknown object" or not name:
                        continue
                    objects_by_class.setdefault(name, []).append(obj)

            if not objects_by_class:  # Should be rare if confident_objects had valid names
                description_segments.append("No common objects were confidently identified for detailed description.")
            else:
                def sort_key_object_groups(item_tuple: Tuple[str, List[Dict]]):
                    class_name_key, obj_group_list = item_tuple
                    priority = 3  # default priority
                    count = len(obj_group_list)

                    # Dynamic priority based on scene relevance and quantity.
                    if class_name_key == "person":
                        priority = 0
                    elif class_name_key in ["dining table", "chair", "sofa", "bed"]:
                        priority = 1  # main indoor furniture
                    elif class_name_key in ["car", "bus", "truck", "traffic light"]:
                        priority = 2  # traffic-related objects
                    elif count >= 3:  # numerous objects are promoted
                        priority = max(1, priority - 1)
                    elif class_name_key in ["potted plant", "vase", "clock", "book"] and count >= 2:
                        priority = 2  # decorative items are promoted once there are several

                    avg_area = sum(o.get("normalized_area", 0.0) for o in obj_group_list) / len(obj_group_list) if obj_group_list else 0
                    # Quantity bonus (capped at 1.0): more instances of a class matter more.
                    quantity_bonus = min(count / 5.0, 1.0)
                    return (priority, -len(obj_group_list), -avg_area, -quantity_bonus)

                # De-duplicate: drop an object whose center lies within 0.15
                # (Manhattan distance) of any object already kept, across all classes.
                deduplicated_objects_by_class = {}
                processed_positions = []
                for class_name, group_of_objects in objects_by_class.items():
                    unique_objects = []
                    for obj in group_of_objects:
                        obj_position = obj.get("normalized_center", [0.5, 0.5])
                        is_duplicate = any(
                            abs(obj_position[0] - processed_pos[0]) + abs(obj_position[1] - processed_pos[1]) < 0.15
                            for processed_pos in processed_positions
                        )
                        if not is_duplicate:
                            unique_objects.append(obj)
                            processed_positions.append(obj_position)
                    if unique_objects:
                        deduplicated_objects_by_class[class_name] = unique_objects
                objects_by_class = deduplicated_objects_by_class

                sorted_object_groups = sorted(objects_by_class.items(), key=sort_key_object_groups)

                object_clauses = []  # Stores individual object group descriptions
                for class_name, group_of_objects in sorted_object_groups:
                    count = len(group_of_objects)
                    if count == 0:
                        continue

                    # Number actually quoted in the text; drives "is"/"are" below.
                    described_count = count
                    if object_statistics and class_name in object_statistics:
                        # Quote the count from the statistics rather than the
                        # de-duplicated group size.
                        actual_count = object_statistics[class_name]["count"]
                        described_count = actual_count
                        if actual_count == 1:
                            formatted_name_with_exact_count = f"one {class_name}"
                        else:
                            plural_form = f"{class_name}s" if not class_name.endswith('s') else class_name
                            formatted_name_with_exact_count = f"{actual_count} {plural_form}"
                    else:
                        # Fall back to the shared formatting helper.
                        formatted_name_with_exact_count = self._format_object_list_for_description(
                            [group_of_objects[0]] * count,
                            use_indefinite_article_for_one=False,
                            count_threshold_for_generalization=-1
                        )

                    if formatted_name_with_exact_count == "no specific objects clearly identified" or not formatted_name_with_exact_count:
                        continue

                    # Determine collective location for the group. The verb must
                    # agree with the quoted count, not with the de-duplicated
                    # group size, otherwise "3 chairs is ..." can be produced.
                    if described_count == 1:
                        location_description_suffix = f"is {self._get_spatial_description(group_of_objects[0], image_width, image_height)}"
                    else:
                        # "or" maps a missing/None region to the placeholder so the
                        # set only holds strings and sorted() cannot hit None < str.
                        distinct_regions = sorted({obj.get("region") or "unknown_region" for obj in group_of_objects})
                        known_regions = [r for r in distinct_regions if r != "unknown_region"]
                        if len(known_regions) == 1:
                            location_description_suffix = f"are primarily in the {known_regions[0].replace('_', ' ')} area"
                        elif len(known_regions) == 2:
                            location_description_suffix = f"are mainly across the {known_regions[0].replace('_',' ')} and {known_regions[1].replace('_',' ')} areas"
                        elif len(known_regions) > 2:
                            location_description_suffix = "are distributed in various parts of the scene"
                        else:
                            location_description_suffix = "are visible in the scene"

                    # Capitalize the object description (e.g., "Six cars")
                    formatted_name_capitalized = formatted_name_with_exact_count[0].upper() + formatted_name_with_exact_count[1:]
                    object_clauses.append(f"{formatted_name_capitalized} {location_description_suffix}")

                if object_clauses:
                    if not description_segments:
                        # No ambiance sentence: lead with the first object clause.
                        description_segments.append(object_clauses.pop(0) + ".")
                    else:
                        # Ambiance exists: introduce the object list.
                        description_segments.append("The scene features:")

                    if object_clauses:
                        joined_object_clauses = ". ".join(object_clauses)
                        if not joined_object_clauses.endswith("."):
                            joined_object_clauses += "."
                        description_segments.append(joined_object_clauses)
                elif not description_segments:
                    # No ambiance and no describable objects after filtering
                    return "The image depicts a scene, but specific objects could not be described with confidence or detail."

    # --- Final assembly and formatting ---
    # Segments are meant to be full sentences, so join them and let the
    # formatter polish the result.
    raw_description = ""
    for segment in filter(None, description_segments):
        segment = segment.strip()
        if not segment:
            continue
        if not raw_description:
            raw_description = segment
        else:
            # A trailing colon ("The scene features:") already introduces the
            # next segment; adding a period there would yield "features:.".
            if not raw_description.endswith(('.', '!', '?', ':')):
                raw_description += "."
            raw_description += " " + (segment[0].upper() + segment[1:] if len(segment) > 1 else segment.upper())

    if raw_description and not raw_description.endswith(('.', '!', '?')):
        raw_description += "."

    final_description = self._format_final_description(raw_description)

    if not final_description or len(final_description.strip()) < 20:
        # Description is empty or too short after processing: use a fallback
        # that reflects whether any confident objects existed.
        if confident_objects:
            return "The scene contains several detected objects, but a detailed textual description could not be fully constructed."
        return "A general scene is depicted with no objects identified with high confidence."

    return final_description
def _generate_scene_details(self,
                            scene_type: str,
                            detected_objects: List[Dict],
                            lighting_info: Optional[Dict] = None,
                            viewpoint: str = "eye_level",
                            spatial_analysis: Optional[Dict] = None,
                            image_dimensions: Optional[Tuple[int, int]] = None,
                            places365_info: Optional[Dict] = None,
                            object_statistics: Optional[Dict] = None
                            ) -> str:
    """
    Generate detailed description based on scene type and detected objects.

    A template from ``self.templates["scene_detail_templates"]`` is used when
    the scene type has one and is not treated as an "everyday" scene;
    otherwise the description is generated dynamically from the detections.
    Every scene type outside a short indoor whitelist is treated as everyday
    when ``self.enable_landmark`` exists and is falsy, and in that case the
    result is also passed through ``self.filter_landmark_references``.

    Args:
        scene_type: Identified scene type.
        detected_objects: List of detected objects.
        lighting_info: Optional lighting condition information.
        viewpoint: Detected viewpoint (aerial, eye_level, etc.).
        spatial_analysis: Optional results from SpatialAnalyzer.
        image_dimensions: Optional tuple of (image_width, image_height).
        places365_info: Optional Places365 scene classification results;
            forwarded unchanged to the description generators.
        object_statistics: Optional detailed object statistics with counts and confidence.

    Returns:
        str: Detailed scene description, or "A scene with some visual elements."
        if nothing could be generated.
    """
    scene_templates = self.templates.get("scene_detail_templates", {})

    # Scene types considered "everyday" or generic.
    everyday_scene_types = [
        "general_indoor_space", "generic_street_view",
        "desk_area_workspace", "outdoor_gathering_spot",
        "kitchen_counter_or_utility_area", "unknown"
    ]

    # NOTE(review): an earlier revision derived a Places365-based contextual
    # sentence here, but the value was never read, so it did not affect the
    # output (and could raise KeyError on detections without 'class_name').
    # places365_info still reaches the generators below.

    landmark_disabled = hasattr(self, 'enable_landmark') and not self.enable_landmark

    treat_as_everyday = scene_type in everyday_scene_types
    if landmark_disabled and scene_type not in [
        "kitchen", "bedroom", "living_room", "office_workspace",
        "dining_area", "professional_kitchen"
    ]:
        treat_as_everyday = True

    # A template is only eligible for non-everyday scene types that have an entry.
    templates_list = []
    if not treat_as_everyday and scene_type in scene_templates:
        self.logger.info(f"Using template for scene_type: {scene_type}")
        # Prefer a viewpoint-specific template list, then the plain scene type.
        viewpoint_key = f"{scene_type}_{viewpoint}"
        templates_list = scene_templates.get(viewpoint_key, scene_templates.get(scene_type, []))
    else:
        self.logger.info(f"Generating dynamic description for scene_type: {scene_type}")

    if templates_list:
        scene_details = self._fill_detail_template(
            random.choice(templates_list),
            detected_objects,
            scene_type,
            places365_info,
            object_statistics
        )
    else:
        # Everyday scene, no template entry, or an empty template list.
        scene_details = self._generate_dynamic_everyday_description(
            detected_objects,
            lighting_info,
            viewpoint,
            spatial_analysis,
            image_dimensions,
            places365_info,
            object_statistics
        )

    # Filter out landmark references if landmark detection is disabled.
    if landmark_disabled:
        scene_details = self.filter_landmark_references(scene_details, enable_landmark=False)

    return scene_details if scene_details else "A scene with some visual elements."
| def _fill_detail_template(self, template: str, detected_objects: List[Dict], scene_type: str, places365_info: Optional[Dict] = None, object_statistics: Optional[Dict] = None) -> str: | |
| """ | |
| Fill a template with specific details based on detected objects. | |
| Args: | |
| template: Template string with placeholders | |
| detected_objects: List of detected objects | |
| scene_type: Identified scene type | |
| Returns: | |
| str: Filled template | |
| """ | |
| # Find placeholders in the template using simple {placeholder} syntax | |
| import re | |
| placeholders = re.findall(r'\{([^}]+)\}', template) | |
| filled_template = template | |
| # Get object template fillers | |
| fillers = self.templates.get("object_template_fillers", {}) | |
| # 基於物品的統計資訊形成更準確的模板填充內容 | |
| statistics_based_replacements = {} | |
| if object_statistics: | |
| # 根據統計信息生成具體的物體描述 | |
| for class_name, stats in object_statistics.items(): | |
| count = stats.get("count", 0) | |
| if count > 0: | |
| # 為常見物體類別生成基於統計的描述 | |
| if class_name == "potted plant": | |
| if count == 1: | |
| statistics_based_replacements["plant_elements"] = "a potted plant" | |
| elif count <= 3: | |
| statistics_based_replacements["plant_elements"] = f"{count} potted plants" | |
| else: | |
| statistics_based_replacements["plant_elements"] = f"multiple potted plants ({count} total)" | |
| elif class_name == "chair": | |
| if count == 1: | |
| statistics_based_replacements["seating"] = "a chair" | |
| elif count <= 4: | |
| statistics_based_replacements["seating"] = f"{count} chairs" | |
| else: | |
| statistics_based_replacements["seating"] = f"numerous chairs ({count} total)" | |
| elif class_name == "person": | |
| if count == 1: | |
| statistics_based_replacements["people_and_vehicles"] = "a person" | |
| statistics_based_replacements["pedestrian_flow"] = "an individual walking" | |
| elif count <= 5: | |
| statistics_based_replacements["people_and_vehicles"] = f"{count} people" | |
| statistics_based_replacements["pedestrian_flow"] = f"{count} people walking" | |
| else: | |
| statistics_based_replacements["people_and_vehicles"] = f"many people ({count} individuals)" | |
| statistics_based_replacements["pedestrian_flow"] = f"a crowd of {count} people" | |
| # 為所有可能的變數設置默認值 | |
| default_replacements = { | |
| # 室內相關 | |
| "furniture": "various furniture pieces", | |
| "seating": "comfortable seating", | |
| "electronics": "entertainment devices", | |
| "bed_type": "a bed", | |
| "bed_location": "room", | |
| "bed_description": "sleeping arrangements", | |
| "extras": "personal items", | |
| "table_setup": "a dining table and chairs", | |
| "table_description": "a dining surface", | |
| "dining_items": "dining furniture and tableware", | |
| "appliances": "kitchen appliances", | |
| "kitchen_items": "cooking utensils and dishware", | |
| "cooking_equipment": "cooking equipment", | |
| "office_equipment": "work-related furniture and devices", | |
| "desk_setup": "a desk and chair", | |
| "computer_equipment": "electronic devices", | |
| # 室外/城市相關 | |
| "traffic_description": "vehicles and pedestrians", | |
| "people_and_vehicles": "people and various vehicles", | |
| "street_elements": "urban infrastructure", | |
| "park_features": "benches and greenery", | |
| "outdoor_elements": "natural features", | |
| "park_description": "outdoor amenities", | |
| "store_elements": "merchandise displays", | |
| "shopping_activity": "customers browse and shop", | |
| "store_items": "products for sale", | |
| # 高級餐廳相關 | |
| "design_elements": "elegant decor", | |
| "lighting": "stylish lighting fixtures", | |
| # 亞洲商業街相關 | |
| "storefront_features": "compact shops", | |
| "pedestrian_flow": "people walking", | |
| "asian_elements": "distinctive cultural elements", | |
| "cultural_elements": "traditional design features", | |
| "signage": "colorful signs", | |
| "street_activities": "busy urban activity", | |
| # 金融區相關 | |
| "buildings": "tall buildings", | |
| "traffic_elements": "vehicles", | |
| "skyscrapers": "high-rise buildings", | |
| "road_features": "wide streets", | |
| "architectural_elements": "modern architecture", | |
| "city_landmarks": "prominent structures", | |
| # 十字路口相關 | |
| "crossing_pattern": "marked pedestrian crossings", | |
| "pedestrian_behavior": "careful walking", | |
| "pedestrian_density": "groups of pedestrians", | |
| "traffic_pattern": "regulated traffic flow", | |
| # 交通樞紐相關 | |
| "transit_vehicles": "public transportation vehicles", | |
| "passenger_activity": "commuter movement", | |
| "transportation_modes": "various transit options", | |
| "passenger_needs": "waiting areas", | |
| "transit_infrastructure": "transit facilities", | |
| "passenger_movement": "commuter flow", | |
| # 購物區相關 | |
| "retail_elements": "shops and displays", | |
| "store_types": "various retail establishments", | |
| "walkway_features": "pedestrian pathways", | |
| "commercial_signage": "store signs", | |
| "consumer_behavior": "shopping activities", | |
| # 空中視角相關 | |
| "commercial_layout": "organized retail areas", | |
| "pedestrian_pattern": "people movement patterns", | |
| "gathering_features": "public gathering spaces", | |
| "movement_pattern": "crowd flow patterns", | |
| "urban_elements": "city infrastructure", | |
| "public_activity": "social interaction", | |
| # 文化特定元素 | |
| "stall_elements": "vendor booths", | |
| "lighting_features": "decorative lights", | |
| "food_elements": "food offerings", | |
| "vendor_stalls": "market stalls", | |
| "nighttime_activity": "evening commerce", | |
| "cultural_lighting": "traditional lighting", | |
| "night_market_sounds": "lively market sounds", | |
| "evening_crowd_behavior": "nighttime social activity", | |
| "architectural_elements": "cultural buildings", | |
| "religious_structures": "sacred buildings", | |
| "decorative_features": "ornamental designs", | |
| "cultural_practices": "traditional activities", | |
| "temple_architecture": "religious structures", | |
| "sensory_elements": "atmospheric elements", | |
| "visitor_activities": "cultural experiences", | |
| "ritual_activities": "ceremonial practices", | |
| "cultural_symbols": "meaningful symbols", | |
| "architectural_style": "historical buildings", | |
| "historic_elements": "traditional architecture", | |
| "urban_design": "city planning elements", | |
| "social_behaviors": "public interactions", | |
| "european_features": "European architectural details", | |
| "tourist_activities": "visitor activities", | |
| "local_customs": "regional practices", | |
| # 時間特定元素 | |
| "lighting_effects": "artificial lighting", | |
| "shadow_patterns": "light and shadow", | |
| "urban_features": "city elements", | |
| "illuminated_elements": "lit structures", | |
| "evening_activities": "nighttime activities", | |
| "light_sources": "lighting points", | |
| "lit_areas": "illuminated spaces", | |
| "shadowed_zones": "darker areas", | |
| "illuminated_signage": "bright signs", | |
| "colorful_lighting": "multicolored lights", | |
| "neon_elements": "neon signs", | |
| "night_crowd_behavior": "evening social patterns", | |
| "light_displays": "lighting installations", | |
| "building_features": "architectural elements", | |
| "nightlife_activities": "evening entertainment", | |
| "lighting_modifier": "bright", | |
| # 混合環境元素 | |
| "transitional_elements": "connecting features", | |
| "indoor_features": "interior elements", | |
| "outdoor_setting": "exterior spaces", | |
| "interior_amenities": "inside comforts", | |
| "exterior_features": "outside elements", | |
| "inside_elements": "interior design", | |
| "outside_spaces": "outdoor areas", | |
| "dual_environment_benefits": "combined settings", | |
| "passenger_activities": "waiting behaviors", | |
| "transportation_types": "transit vehicles", | |
| "sheltered_elements": "covered areas", | |
| "exposed_areas": "open sections", | |
| "waiting_behaviors": "passenger activities", | |
| "indoor_facilities": "inside services", | |
| "platform_features": "transit platform elements", | |
| "transit_routines": "transportation procedures", | |
| # 專門場所元素 | |
| "seating_arrangement": "spectator seating", | |
| "playing_surface": "athletic field", | |
| "sporting_activities": "sports events", | |
| "spectator_facilities": "viewer accommodations", | |
| "competition_space": "sports arena", | |
| "sports_events": "athletic competitions", | |
| "viewing_areas": "audience sections", | |
| "field_elements": "field markings and equipment", | |
| "game_activities": "competitive play", | |
| "construction_equipment": "building machinery", | |
| "building_materials": "construction supplies", | |
| "construction_activities": "building work", | |
| "work_elements": "construction tools", | |
| "structural_components": "building structures", | |
| "site_equipment": "construction gear", | |
| "raw_materials": "building supplies", | |
| "construction_process": "building phases", | |
| "medical_elements": "healthcare equipment", | |
| "clinical_activities": "medical procedures", | |
| "facility_design": "healthcare layout", | |
| "healthcare_features": "medical facilities", | |
| "patient_interactions": "care activities", | |
| "equipment_types": "medical devices", | |
| "care_procedures": "health services", | |
| "treatment_spaces": "clinical areas", | |
| "educational_furniture": "learning furniture", | |
| "learning_activities": "educational practices", | |
| "instructional_design": "teaching layout", | |
| "classroom_elements": "school equipment", | |
| "teaching_methods": "educational approaches", | |
| "student_engagement": "learning participation", | |
| "learning_spaces": "educational areas", | |
| "educational_tools": "teaching resources", | |
| "knowledge_transfer": "learning exchanges" | |
| } | |
| # 將統計的資訊形成的替換內容合併到默認替換中 | |
| default_replacements.update(statistics_based_replacements) | |
| # Add Places365-specific template variables | |
| places365_scene_context = "" | |
| places365_atmosphere = "" | |
| if places365_info and places365_info.get('confidence', 0) > 0.35: | |
| scene_label = places365_info.get('scene_label', '').replace('_', ' ') | |
| attributes = places365_info.get('attributes', []) | |
| if scene_label and scene_label != scene_type: | |
| places365_scene_context = f"characteristic of a {scene_label}" | |
| if 'natural_lighting' in attributes: | |
| places365_atmosphere = "with natural illumination" | |
| elif 'artificial_lighting' in attributes: | |
| places365_atmosphere = "under artificial lighting" | |
| # Update default_replacements with Places365 context | |
| if places365_scene_context: | |
| default_replacements["places365_context"] = places365_scene_context | |
| else: | |
| default_replacements["places365_context"] = "" | |
| if places365_atmosphere: | |
| default_replacements["places365_atmosphere"] = places365_atmosphere | |
| else: | |
| default_replacements["places365_atmosphere"] = "" | |
| # For each placeholder, try to fill with appropriate content | |
| for placeholder in placeholders: | |
| if placeholder in fillers: | |
| # Get random filler for this placeholder | |
| options = fillers[placeholder] | |
| if options: | |
| # Select 1-3 items from the options list | |
| num_items = min(len(options), random.randint(1, 3)) | |
| selected_items = random.sample(options, num_items) | |
| # Create a formatted list | |
| if len(selected_items) == 1: | |
| replacement = selected_items[0] | |
| elif len(selected_items) == 2: | |
| replacement = f"{selected_items[0]} and {selected_items[1]}" | |
| else: | |
| replacement = ", ".join(selected_items[:-1]) + f", and {selected_items[-1]}" | |
| # Replace the placeholder | |
| filled_template = filled_template.replace(f"{{{placeholder}}}", replacement) | |
| else: | |
| # Try to fill with scene-specific logic | |
| replacement = self._generate_placeholder_content(placeholder, detected_objects, scene_type) | |
| if replacement: | |
| filled_template = filled_template.replace(f"{{{placeholder}}}", replacement) | |
| elif placeholder in default_replacements: | |
| # Use default replacement if available | |
| filled_template = filled_template.replace(f"{{{placeholder}}}", default_replacements[placeholder]) | |
| else: | |
| # Last resort default | |
| filled_template = filled_template.replace(f"{{{placeholder}}}", "various items") | |
| return filled_template | |
| def _generate_placeholder_content(self, placeholder: str, detected_objects: List[Dict], scene_type: str) -> str: | |
| """ | |
| Generate content for a template placeholder based on scene-specific logic. | |
| Args: | |
| placeholder: Template placeholder | |
| detected_objects: List of detected objects | |
| scene_type: Identified scene type | |
| Returns: | |
| str: Content for the placeholder | |
| """ | |
| # Handle different types of placeholders with custom logic | |
| if placeholder == "furniture": | |
| # Extract furniture items | |
| furniture_ids = [56, 57, 58, 59, 60, 61] # Example furniture IDs | |
| furniture_objects = [obj for obj in detected_objects if obj["class_id"] in furniture_ids] | |
| if furniture_objects: | |
| furniture_names = [obj["class_name"] for obj in furniture_objects[:3]] | |
| return ", ".join(set(furniture_names)) | |
| return "various furniture items" | |
| elif placeholder == "electronics": | |
| # Extract electronic items | |
| electronics_ids = [62, 63, 64, 65, 66, 67, 68, 69, 70] # Example electronics IDs | |
| electronics_objects = [obj for obj in detected_objects if obj["class_id"] in electronics_ids] | |
| if electronics_objects: | |
| electronics_names = [obj["class_name"] for obj in electronics_objects[:3]] | |
| return ", ".join(set(electronics_names)) | |
| return "electronic devices" | |
| elif placeholder == "people_count": | |
| # Count people | |
| people_count = len([obj for obj in detected_objects if obj["class_id"] == 0]) | |
| if people_count == 0: | |
| return "no people" | |
| elif people_count == 1: | |
| return "one person" | |
| elif people_count < 5: | |
| return f"{people_count} people" | |
| else: | |
| return "several people" | |
| elif placeholder == "seating": | |
| # Extract seating items | |
| seating_ids = [56, 57] # chair, sofa | |
| seating_objects = [obj for obj in detected_objects if obj["class_id"] in seating_ids] | |
| if seating_objects: | |
| seating_names = [obj["class_name"] for obj in seating_objects[:2]] | |
| return ", ".join(set(seating_names)) | |
| return "seating arrangements" | |
| # Default case - empty string | |
| return "" | |
| def _generate_basic_details(self, scene_type: str, detected_objects: List[Dict]) -> str: | |
| """ | |
| Generate basic details when templates aren't available. | |
| Args: | |
| scene_type: Identified scene type | |
| detected_objects: List of detected objects | |
| Returns: | |
| str: Basic scene details | |
| """ | |
| # Handle specific scene types with custom logic | |
| if scene_type == "living_room": | |
| tv_objs = [obj for obj in detected_objects if obj["class_id"] == 62] # TV | |
| sofa_objs = [obj for obj in detected_objects if obj["class_id"] == 57] # Sofa | |
| if tv_objs and sofa_objs: | |
| tv_region = tv_objs[0]["region"] | |
| sofa_region = sofa_objs[0]["region"] | |
| arrangement = f"The TV is in the {tv_region.replace('_', ' ')} of the image, " | |
| arrangement += f"while the sofa is in the {sofa_region.replace('_', ' ')}. " | |
| return f"{arrangement}This appears to be a space designed for relaxation and entertainment." | |
| elif scene_type == "bedroom": | |
| bed_objs = [obj for obj in detected_objects if obj["class_id"] == 59] # Bed | |
| if bed_objs: | |
| bed_region = bed_objs[0]["region"] | |
| extra_items = [] | |
| for obj in detected_objects: | |
| if obj["class_id"] == 74: # Clock | |
| extra_items.append("clock") | |
| elif obj["class_id"] == 73: # Book | |
| extra_items.append("book") | |
| extras = "" | |
| if extra_items: | |
| extras = f" There is also a {' and a '.join(extra_items)} visible." | |
| return f"The bed is located in the {bed_region.replace('_', ' ')} of the image.{extras}" | |
| elif scene_type in ["dining_area", "kitchen"]: | |
| # Count food and dining-related items | |
| food_items = [] | |
| for obj in detected_objects: | |
| if obj["class_id"] in [39, 41, 42, 43, 44, 45]: # Kitchen items | |
| food_items.append(obj["class_name"]) | |
| food_str = "" | |
| if food_items: | |
| unique_items = list(set(food_items)) | |
| if len(unique_items) <= 3: | |
| food_str = f" with {', '.join(unique_items)}" | |
| else: | |
| food_str = f" with {', '.join(unique_items[:3])} and other items" | |
| return f"{food_str}." | |
| elif scene_type == "city_street": | |
| # Count people and vehicles | |
| people_count = len([obj for obj in detected_objects if obj["class_id"] == 0]) | |
| vehicle_count = len([obj for obj in detected_objects | |
| if obj["class_id"] in [1, 2, 3, 5, 7]]) # Bicycle, car, motorbike, bus, truck | |
| traffic_desc = "" | |
| if people_count > 0 and vehicle_count > 0: | |
| traffic_desc = f" with {people_count} {'people' if people_count > 1 else 'person'} and " | |
| traffic_desc += f"{vehicle_count} {'vehicles' if vehicle_count > 1 else 'vehicle'}" | |
| elif people_count > 0: | |
| traffic_desc = f" with {people_count} {'people' if people_count > 1 else 'person'}" | |
| elif vehicle_count > 0: | |
| traffic_desc = f" with {vehicle_count} {'vehicles' if vehicle_count > 1 else 'vehicle'}" | |
| return f"{traffic_desc}." | |
| # Handle more specialized scenes | |
| elif scene_type == "asian_commercial_street": | |
| # Look for key urban elements | |
| people_count = len([obj for obj in detected_objects if obj["class_id"] == 0]) | |
| vehicle_count = len([obj for obj in detected_objects if obj["class_id"] in [1, 2, 3]]) | |
| # Analyze pedestrian distribution | |
| people_positions = [] | |
| for obj in detected_objects: | |
| if obj["class_id"] == 0: # Person | |
| people_positions.append(obj["normalized_center"]) | |
| # Check if people are distributed along a line (indicating a walking path) | |
| structured_path = False | |
| if len(people_positions) >= 3: | |
| # Simplified check - see if y-coordinates are similar for multiple people | |
| y_coords = [pos[1] for pos in people_positions] | |
| y_mean = sum(y_coords) / len(y_coords) | |
| y_variance = sum((y - y_mean)**2 for y in y_coords) / len(y_coords) | |
| if y_variance < 0.05: # Low variance indicates linear arrangement | |
| structured_path = True | |
| street_desc = "A commercial street with " | |
| if people_count > 0: | |
| street_desc += f"{people_count} {'pedestrians' if people_count > 1 else 'pedestrian'}" | |
| if vehicle_count > 0: | |
| street_desc += f" and {vehicle_count} {'vehicles' if vehicle_count > 1 else 'vehicle'}" | |
| elif vehicle_count > 0: | |
| street_desc += f"{vehicle_count} {'vehicles' if vehicle_count > 1 else 'vehicle'}" | |
| else: | |
| street_desc += "various commercial elements" | |
| if structured_path: | |
| street_desc += ". The pedestrians appear to be following a defined walking path" | |
| # Add cultural elements | |
| street_desc += ". The signage and architectural elements suggest an Asian urban setting." | |
| return street_desc | |
| # Default general description | |
| return "The scene contains various elements characteristic of this environment." | |
| def _detect_viewpoint(self, detected_objects: List[Dict]) -> str: | |
| """ | |
| 改進視角檢測,特別加強對空中俯視視角的識別。 | |
| Args: | |
| detected_objects: 檢測到的物體列表 | |
| Returns: | |
| str: 檢測到的視角類型 | |
| """ | |
| if not detected_objects: | |
| return "eye_level" # default | |
| # extract space and size | |
| top_region_count = 0 | |
| bottom_region_count = 0 | |
| total_objects = len(detected_objects) | |
| # 追蹤大小分布以檢測空中視角 | |
| sizes = [] | |
| # 垂直大小比例用於低角度檢測 | |
| height_width_ratios = [] | |
| # 用於檢測規則圖案的變數 | |
| people_positions = [] | |
| crosswalk_pattern_detected = False | |
| for obj in detected_objects: | |
| # 計算頂部or底部區域中的物體 | |
| region = obj["region"] | |
| if "top" in region: | |
| top_region_count += 1 | |
| elif "bottom" in region: | |
| bottom_region_count += 1 | |
| # 計算標準化大小(Area) | |
| if "normalized_area" in obj: | |
| sizes.append(obj["normalized_area"]) | |
| # 計算高度or寬度比例 | |
| if "normalized_size" in obj: | |
| width, height = obj["normalized_size"] | |
| if width > 0: | |
| height_width_ratios.append(height / width) | |
| # 收集人的位置 | |
| if obj["class_id"] == 0: # 人 | |
| if "normalized_center" in obj: | |
| people_positions.append(obj["normalized_center"]) | |
| # 專門為斑馬線的十字路口添加檢測邏輯 | |
| # 檢查是否有明顯的垂直和水平行人分布 | |
| people_objs = [obj for obj in detected_objects if obj["class_id"] == 0] # 人 | |
| if len(people_objs) >= 8: # 需要足夠多的人才能形成十字路口模式 | |
| # 檢查是否有斑馬線模式 - 新增功能 | |
| if len(people_positions) >= 4: | |
| # 對位置進行聚類分析,尋找線性分布 | |
| x_coords = [pos[0] for pos in people_positions] | |
| y_coords = [pos[1] for pos in people_positions] | |
| # 計算 x 和 y 坐標的變異數和範圍 | |
| x_variance = np.var(x_coords) if len(x_coords) > 1 else 0 | |
| y_variance = np.var(y_coords) if len(y_coords) > 1 else 0 | |
| x_range = max(x_coords) - min(x_coords) | |
| y_range = max(y_coords) - min(y_coords) | |
| # 嘗試檢測十字形分布 | |
| # 如果 x 和 y 方向都有較大範圍,且範圍相似,就有可能是十字路口 | |
| if x_range > 0.5 and y_range > 0.5 and 0.7 < (x_range / y_range) < 1.3: | |
| # 計算到中心點的距離 | |
| center_x = np.mean(x_coords) | |
| center_y = np.mean(y_coords) | |
| # 將點映射到十字架的軸上(水平和垂直) | |
| x_axis_distance = [abs(x - center_x) for x in x_coords] | |
| y_axis_distance = [abs(y - center_y) for y in y_coords] | |
| # 點應該接近軸線(水平或垂直) | |
| # 對於每個點,檢查它是否接近水平或垂直軸線 | |
| close_to_axis_count = 0 | |
| for i in range(len(x_coords)): | |
| if x_axis_distance[i] < 0.1 or y_axis_distance[i] < 0.1: | |
| close_to_axis_count += 1 | |
| # 如果足夠多的點接近軸線,認為是十字路口 | |
| if close_to_axis_count >= len(x_coords) * 0.6: | |
| crosswalk_pattern_detected = True | |
| # 如果沒有檢測到十字形,嘗試檢測線性聚類分布 | |
| if not crosswalk_pattern_detected: | |
| # 檢查 x 和 y 方向的聚類 | |
| x_clusters = self._detect_linear_clusters(x_coords) | |
| y_clusters = self._detect_linear_clusters(y_coords) | |
| # 如果在 x 和 y 方向上都有多個聚類,可能是交叉的斑馬線 | |
| if len(x_clusters) >= 2 and len(y_clusters) >= 2: | |
| crosswalk_pattern_detected = True | |
| # 檢測斑馬線模式 - 優先判斷 | |
| if crosswalk_pattern_detected: | |
| return "aerial" | |
| # 檢測行人分布情況 | |
| if len(people_objs) >= 10: | |
| people_region_counts = {} | |
| for obj in people_objs: | |
| region = obj["region"] | |
| if region not in people_region_counts: | |
| people_region_counts[region] = 0 | |
| people_region_counts[region] += 1 | |
| # 計算不同區域中的行人數量 | |
| region_count = len([r for r, c in people_region_counts.items() if c >= 2]) | |
| # 如果行人分布在多個區域中,可能是空中視角 | |
| if region_count >= 4: | |
| # 檢查行人分布的模式 | |
| # 特別是檢查不同區域中行人數量的差異 | |
| region_counts = list(people_region_counts.values()) | |
| region_counts_variance = np.var(region_counts) if len(region_counts) > 1 else 0 | |
| region_counts_mean = np.mean(region_counts) if region_counts else 0 | |
| # 如果行人分布較為均勻(變異係數小),可能是空中視角 | |
| if region_counts_mean > 0: | |
| variation_coefficient = region_counts_variance / region_counts_mean | |
| if variation_coefficient < 0.5: | |
| return "aerial" | |
| # 計算指標 | |
| top_ratio = top_region_count / total_objects if total_objects > 0 else 0 | |
| bottom_ratio = bottom_region_count / total_objects if total_objects > 0 else 0 | |
| # 大小變異數(標準化) | |
| size_variance = 0 | |
| if sizes: | |
| mean_size = sum(sizes) / len(sizes) | |
| size_variance = sum((s - mean_size) ** 2 for s in sizes) / len(sizes) | |
| size_variance = size_variance / (mean_size ** 2) # 標準化 | |
| # 平均高度/寬度比例 | |
| avg_height_width_ratio = sum(height_width_ratios) / len(height_width_ratios) if height_width_ratios else 1.0 | |
| # 空中視角:低大小差異,物體均勻分布,底部很少或沒有物體 | |
| if (size_variance < self.viewpoint_params["aerial_size_variance_threshold"] and | |
| bottom_ratio < 0.3 and top_ratio > self.viewpoint_params["aerial_threshold"]): | |
| return "aerial" | |
| # 低角度視角:物體傾向於比寬高,頂部較多物體 | |
| elif (avg_height_width_ratio > self.viewpoint_params["vertical_size_ratio_threshold"] and | |
| top_ratio > self.viewpoint_params["low_angle_threshold"]): | |
| return "low_angle" | |
| # 高視角:底部較多物體,頂部較少 | |
| elif (bottom_ratio > self.viewpoint_params["elevated_threshold"] and | |
| top_ratio < self.viewpoint_params["elevated_top_threshold"]): | |
| return "elevated" | |
| # 默認:平視角 | |
| return "eye_level" | |
| def _detect_linear_clusters(self, coords, threshold=0.05): | |
| """ | |
| 檢測坐標中的線性聚類 | |
| Args: | |
| coords: 一維坐標列表 | |
| threshold: 聚類閾值 | |
| Returns: | |
| list: 聚類列表 | |
| """ | |
| if not coords: | |
| return [] | |
| # 排序坐標 | |
| sorted_coords = sorted(coords) | |
| clusters = [] | |
| current_cluster = [sorted_coords[0]] | |
| for i in range(1, len(sorted_coords)): | |
| # 如果當前坐標與前一個接近,添加到當前聚類 | |
| if sorted_coords[i] - sorted_coords[i-1] < threshold: | |
| current_cluster.append(sorted_coords[i]) | |
| else: | |
| # 否則開始新的聚類 | |
| if len(current_cluster) >= 2: # 至少需要2個點形成聚類 | |
| clusters.append(current_cluster) | |
| current_cluster = [sorted_coords[i]] | |
| # 添加最後一個cluster | |
| if len(current_cluster) >= 2: | |
| clusters.append(current_cluster) | |
| return clusters | |
| def _detect_cultural_context(self, scene_type: str, detected_objects: List[Dict]) -> Optional[str]: | |
| """ | |
| Detect the likely cultural context of the scene. | |
| Args: | |
| scene_type: Identified scene type | |
| detected_objects: List of detected objects | |
| Returns: | |
| Optional[str]: Detected cultural context (asian, european, etc.) or None | |
| """ | |
| # Scene types with explicit cultural contexts | |
| cultural_scene_mapping = { | |
| "asian_commercial_street": "asian", | |
| "asian_night_market": "asian", | |
| "asian_temple_area": "asian", | |
| "european_plaza": "european" | |
| } | |
| # Check if scene type directly indicates cultural context | |
| if scene_type in cultural_scene_mapping: | |
| return cultural_scene_mapping[scene_type] | |
| # No specific cultural context detected | |
| return None | |
| def _generate_cultural_elements(self, cultural_context: str) -> str: | |
| """ | |
| Generate description of cultural elements for the detected context. | |
| Args: | |
| cultural_context: Detected cultural context | |
| Returns: | |
| str: Description of cultural elements | |
| """ | |
| # Get template for this cultural context | |
| cultural_templates = self.templates.get("cultural_templates", {}) | |
| if cultural_context in cultural_templates: | |
| template = cultural_templates[cultural_context] | |
| elements = template.get("elements", []) | |
| if elements: | |
| # Select 1-2 random elements | |
| num_elements = min(len(elements), random.randint(1, 2)) | |
| selected_elements = random.sample(elements, num_elements) | |
| # Format elements list | |
| elements_text = " and ".join(selected_elements) if num_elements == 2 else selected_elements[0] | |
| # Fill template | |
| return template.get("description", "").format(elements=elements_text) | |
| return "" | |
| def _optimize_object_description(self, description: str) -> str: | |
| """ | |
| 優化物品描述,避免重複列舉相同物品 | |
| """ | |
| import re | |
| # 處理床鋪重複描述 | |
| if "bed in the room" in description: | |
| description = description.replace("a bed in the room", "a bed") | |
| # 處理重複的物品列表 | |
| object_lists = re.findall(r'with ([^\.]+?)(?:\.|\band\b)', description) | |
| for obj_list in object_lists: | |
| # 計算每個物品出現次數 | |
| items = re.findall(r'([a-zA-Z\s]+)(?:,|\band\b|$)', obj_list) | |
| item_counts = {} | |
| for item in items: | |
| item = item.strip() | |
| if item and item not in ["and", "with"]: | |
| if item not in item_counts: | |
| item_counts[item] = 0 | |
| item_counts[item] += 1 | |
| # 生成優化後的物品列表 | |
| if item_counts: | |
| new_items = [] | |
| for item, count in item_counts.items(): | |
| if count > 1: | |
| new_items.append(f"{count} {item}s") | |
| else: | |
| new_items.append(item) | |
| # 格式化新列表 | |
| if len(new_items) == 1: | |
| new_list = new_items[0] | |
| elif len(new_items) == 2: | |
| new_list = f"{new_items[0]} and {new_items[1]}" | |
| else: | |
| new_list = ", ".join(new_items[:-1]) + f", and {new_items[-1]}" | |
| # 替換原始列表 | |
| description = description.replace(obj_list, new_list) | |
| return description | |
| def _describe_functional_zones(self, functional_zones: Dict) -> str: | |
| """ | |
| 生成場景功能區域的描述,優化處理行人區域、人數統計和物品重複問題。 | |
| Args: | |
| functional_zones: 識別出的功能區域字典 | |
| Returns: | |
| str: 功能區域描述 | |
| """ | |
| if not functional_zones: | |
| return "" | |
| # 處理不同類型的 functional_zones 參數 | |
| if isinstance(functional_zones, list): | |
| # 如果是列表,轉換為字典格式 | |
| zones_dict = {} | |
| for i, zone in enumerate(functional_zones): | |
| if isinstance(zone, dict) and 'name' in zone: | |
| zone_name = zone['name'] | |
| else: | |
| zone_name = f"zone_{i}" | |
| zones_dict[zone_name] = zone if isinstance(zone, dict) else {"description": str(zone)} | |
| functional_zones = zones_dict | |
| elif not isinstance(functional_zones, dict): | |
| return "" | |
| # 計算場景中的總人數 | |
| total_people_count = 0 | |
| people_by_zone = {} | |
| # 計算每個區域的人數並累計總人數 | |
| for zone_name, zone_info in functional_zones.items(): | |
| if "objects" in zone_info: | |
| zone_people_count = zone_info["objects"].count("person") | |
| people_by_zone[zone_name] = zone_people_count | |
| total_people_count += zone_people_count | |
| # 分類區域為行人區域和其他區域 | |
| pedestrian_zones = [] | |
| other_zones = [] | |
| for zone_name, zone_info in functional_zones.items(): | |
| # 檢查是否是行人相關區域 | |
| if any(keyword in zone_name.lower() for keyword in ["pedestrian", "crossing", "people"]): | |
| pedestrian_zones.append((zone_name, zone_info)) | |
| else: | |
| other_zones.append((zone_name, zone_info)) | |
| # 獲取最重要的行人區域和其他區域 | |
| main_pedestrian_zones = sorted(pedestrian_zones, | |
| key=lambda z: people_by_zone.get(z[0], 0), | |
| reverse=True)[:1] # 最多1個主要行人區域 | |
| top_other_zones = sorted(other_zones, | |
| key=lambda z: len(z[1].get("objects", [])), | |
| reverse=True)[:2] # 最多2個其他區域 | |
| # 合併區域 | |
| top_zones = main_pedestrian_zones + top_other_zones | |
| if not top_zones: | |
| return "" | |
| # 生成匯總描述 | |
| summary = "" | |
| max_mentioned_people = 0 # track已經提到的最大人數 | |
| # 如果總人數顯著且還沒在主描述中提到,添加總人數描述 | |
| if total_people_count > 5: | |
| summary = f"The scene contains a significant number of pedestrians ({total_people_count} people). " | |
| max_mentioned_people = total_people_count # update已提到的最大人數 | |
| # 處理每個區域的描述,確保人數信息的一致性 | |
| processed_zones = [] | |
| for zone_name, zone_info in top_zones: | |
| zone_desc = zone_info.get("description", "a functional zone") | |
| zone_people_count = people_by_zone.get(zone_name, 0) | |
| # 檢查描述中是否包含人數資訊 | |
| contains_people_info = "with" in zone_desc and ("person" in zone_desc.lower() or "people" in zone_desc.lower()) | |
| # 如果描述包含人數信息,且人數較小(小於已提到的最大人數),則修改描述 | |
| if contains_people_info and zone_people_count < max_mentioned_people: | |
| parts = zone_desc.split("with") | |
| if len(parts) > 1: | |
| # 移除人數部分 | |
| zone_desc = parts[0].strip() + " area" | |
| processed_zones.append((zone_name, {"description": zone_desc})) | |
| # 根據處理後的區域數量生成最終描述 | |
| final_desc = "" | |
| if len(processed_zones) == 1: | |
| _, zone_info = processed_zones[0] | |
| zone_desc = zone_info["description"] | |
| final_desc = summary + f"The scene includes {zone_desc}." | |
| elif len(processed_zones) == 2: | |
| _, zone1_info = processed_zones[0] | |
| _, zone2_info = processed_zones[1] | |
| zone1_desc = zone1_info["description"] | |
| zone2_desc = zone2_info["description"] | |
| final_desc = summary + f"The scene is divided into two main areas: {zone1_desc} and {zone2_desc}." | |
| else: | |
| zones_desc = ["The scene contains multiple functional areas including"] | |
| zone_descriptions = [z[1]["description"] for z in processed_zones] | |
| # 格式化最終的多區域描述 | |
| if len(zone_descriptions) == 3: | |
| formatted_desc = f"{zone_descriptions[0]}, {zone_descriptions[1]}, and {zone_descriptions[2]}" | |
| else: | |
| formatted_desc = ", ".join(zone_descriptions[:-1]) + f", and {zone_descriptions[-1]}" | |
| final_desc = summary + f"{zones_desc[0]} {formatted_desc}." | |
| return self._optimize_object_description(final_desc) | |
