import os
import re
import json
import random
import numpy as np
from typing import Dict, List, Tuple, Any, Optional

from scene_type import SCENE_TYPES
from scene_detail_templates import SCENE_DETAIL_TEMPLATES
from object_template_fillers import OBJECT_TEMPLATE_FILLERS
from lighting_conditions import LIGHTING_CONDITIONS
from viewpoint_templates import VIEWPOINT_TEMPLATES
from cultural_templates import CULTURAL_TEMPLATES
# NOTE(review): the module name is spelled "confifence" here; confirm it
# matches the file on disk before "correcting" it.
from confifence_templates import CONFIDENCE_TEMPLATES


class EnhancedSceneDescriber:
    """
    Enhanced scene description generator with improved template handling,
    viewpoint awareness, and cultural context recognition.

    Provides detailed natural language descriptions of scenes based on
    detection results and scene classification.
    """

    # Class id used for "person" throughout this class.
    _PERSON_CLASS_ID = 0

    # Plural forms that the simple suffix rules in _pluralize get wrong.
    # presumably class names follow the COCO label set - TODO confirm
    _IRREGULAR_PLURALS = {
        "person": "people",
        "knife": "knives",
        "mouse": "mice",
        "sheep": "sheep",
        "skis": "skis",
        "scissors": "scissors",
    }

    # Fallback text for template placeholders that have neither a filler list
    # nor scene-specific logic.
    _DEFAULT_PLACEHOLDER_TEXT = {
        # Indoor
        "furniture": "various furniture pieces",
        "seating": "comfortable seating",
        "electronics": "entertainment devices",
        "bed_type": "a bed",
        "bed_location": "room",
        "bed_description": "sleeping arrangements",
        "extras": "personal items",
        "table_setup": "a dining table and chairs",
        "table_description": "a dining surface",
        "dining_items": "dining furniture and tableware",
        "appliances": "kitchen appliances",
        "kitchen_items": "cooking utensils and dishware",
        "cooking_equipment": "cooking equipment",
        "office_equipment": "work-related furniture and devices",
        "desk_setup": "a desk and chair",
        "computer_equipment": "electronic devices",
        # Outdoor / urban
        "traffic_description": "vehicles and pedestrians",
        "people_and_vehicles": "people and various vehicles",
        "street_elements": "urban infrastructure",
        "park_features": "benches and greenery",
        "outdoor_elements": "natural features",
        "park_description": "outdoor amenities",
        "store_elements": "merchandise displays",
        "shopping_activity": "customers browse and shop",
        "store_items": "products for sale",
        # Upscale restaurant
        "design_elements": "elegant decor",
        "lighting": "stylish lighting fixtures",
        # Asian commercial street
        "storefront_features": "compact shops",
        "pedestrian_flow": "people walking",
        "asian_elements": "distinctive cultural elements",
        "cultural_elements": "traditional design features",
        "signage": "colorful signs",
        "street_activities": "busy urban activity",
        # Financial district
        # ("architectural_elements" used to be listed here as
        # "modern architecture" as well; the later duplicate key always won,
        # so only that effective entry is kept, in the cultural section.)
        "buildings": "tall buildings",
        "traffic_elements": "vehicles",
        "skyscrapers": "high-rise buildings",
        "road_features": "wide streets",
        "city_landmarks": "prominent structures",
        # Intersection
        "crossing_pattern": "marked pedestrian crossings",
        "pedestrian_behavior": "careful walking",
        "pedestrian_density": "groups of pedestrians",
        "traffic_pattern": "regulated traffic flow",
        # Transit hub
        "transit_vehicles": "public transportation vehicles",
        "passenger_activity": "commuter movement",
        "transportation_modes": "various transit options",
        "passenger_needs": "waiting areas",
        "transit_infrastructure": "transit facilities",
        "passenger_movement": "commuter flow",
        # Shopping district
        "retail_elements": "shops and displays",
        "store_types": "various retail establishments",
        "walkway_features": "pedestrian pathways",
        "commercial_signage": "store signs",
        "consumer_behavior": "shopping activities",
        # Aerial viewpoint
        "commercial_layout": "organized retail areas",
        "pedestrian_pattern": "people movement patterns",
        "gathering_features": "public gathering spaces",
        "movement_pattern": "crowd flow patterns",
        "urban_elements": "city infrastructure",
        "public_activity": "social interaction",
        # Culture-specific elements
        "stall_elements": "vendor booths",
        "lighting_features": "decorative lights",
        "food_elements": "food offerings",
        "vendor_stalls": "market stalls",
        "nighttime_activity": "evening commerce",
        "cultural_lighting": "traditional lighting",
        "night_market_sounds": "lively market sounds",
        "evening_crowd_behavior": "nighttime social activity",
        "architectural_elements": "cultural buildings",
        "religious_structures": "sacred buildings",
        "decorative_features": "ornamental designs",
        "cultural_practices": "traditional activities",
        "temple_architecture": "religious structures",
        "sensory_elements": "atmospheric elements",
        "visitor_activities": "cultural experiences",
        "ritual_activities": "ceremonial practices",
        "cultural_symbols": "meaningful symbols",
        "architectural_style": "historical buildings",
        "historic_elements": "traditional architecture",
        "urban_design": "city planning elements",
        "social_behaviors": "public interactions",
        "european_features": "European architectural details",
        "tourist_activities": "visitor activities",
        "local_customs": "regional practices",
        # Time-of-day specific elements
        "lighting_effects": "artificial lighting",
        "shadow_patterns": "light and shadow",
        "urban_features": "city elements",
        "illuminated_elements": "lit structures",
        "evening_activities": "nighttime activities",
        "light_sources": "lighting points",
        "lit_areas": "illuminated spaces",
        "shadowed_zones": "darker areas",
        "illuminated_signage": "bright signs",
        "colorful_lighting": "multicolored lights",
        "neon_elements": "neon signs",
        "night_crowd_behavior": "evening social patterns",
        "light_displays": "lighting installations",
        "building_features": "architectural elements",
        "nightlife_activities": "evening entertainment",
        "lighting_modifier": "bright",
        # Mixed indoor/outdoor environments
        "transitional_elements": "connecting features",
        "indoor_features": "interior elements",
        "outdoor_setting": "exterior spaces",
        "interior_amenities": "inside comforts",
        "exterior_features": "outside elements",
        "inside_elements": "interior design",
        "outside_spaces": "outdoor areas",
        "dual_environment_benefits": "combined settings",
        "passenger_activities": "waiting behaviors",
        "transportation_types": "transit vehicles",
        "sheltered_elements": "covered areas",
        "exposed_areas": "open sections",
        "waiting_behaviors": "passenger activities",
        "indoor_facilities": "inside services",
        "platform_features": "transit platform elements",
        "transit_routines": "transportation procedures",
        # Specialized venues
        "seating_arrangement": "spectator seating",
        "playing_surface": "athletic field",
        "sporting_activities": "sports events",
        "spectator_facilities": "viewer accommodations",
        "competition_space": "sports arena",
        "sports_events": "athletic competitions",
        "viewing_areas": "audience sections",
        "field_elements": "field markings and equipment",
        "game_activities": "competitive play",
        "construction_equipment": "building machinery",
        "building_materials": "construction supplies",
        "construction_activities": "building work",
        "work_elements": "construction tools",
        "structural_components": "building structures",
        "site_equipment": "construction gear",
        "raw_materials": "building supplies",
        "construction_process": "building phases",
        "medical_elements": "healthcare equipment",
        "clinical_activities": "medical procedures",
        "facility_design": "healthcare layout",
        "healthcare_features": "medical facilities",
        "patient_interactions": "care activities",
        "equipment_types": "medical devices",
        "care_procedures": "health services",
        "treatment_spaces": "clinical areas",
        "educational_furniture": "learning furniture",
        "learning_activities": "educational practices",
        "instructional_design": "teaching layout",
        "classroom_elements": "school equipment",
        "teaching_methods": "educational approaches",
        "student_engagement": "learning participation",
        "learning_spaces": "educational areas",
        "educational_tools": "teaching resources",
        "knowledge_transfer": "learning exchanges",
    }

    def __init__(self, templates_db: Optional[Dict] = None, scene_types: Optional[Dict] = None):
        """
        Initialize the enhanced scene describer.

        Args:
            templates_db: Optional custom templates database; when falsy the
                templates are loaded from the imported template modules.
            scene_types: Dictionary of scene type definitions; when falsy the
                imported SCENE_TYPES is used.
        """
        self.scene_types = scene_types or self._load_default_scene_types()
        self.templates = templates_db or self._load_templates()
        self._initialize_viewpoint_parameters()

    def _load_default_scene_types(self) -> Dict:
        """
        Load default scene types.

        Returns:
            Dict: Scene type definitions (the imported SCENE_TYPES).
        """
        return SCENE_TYPES

    def _load_templates(self) -> Dict:
        """
        Load description templates from the imported Python modules.

        Returns:
            Dict: Template collections for different description components
        """
        templates = {
            # Templates taken directly from the imported template modules.
            "scene_detail_templates": SCENE_DETAIL_TEMPLATES,
            "object_template_fillers": OBJECT_TEMPLATE_FILLERS,
            "viewpoint_templates": VIEWPOINT_TEMPLATES,
            "cultural_templates": CULTURAL_TEMPLATES,
            # Lighting templates: the "general" text of every time description.
            "lighting_templates": {
                key: data["general"]
                for key, data in LIGHTING_CONDITIONS.get("time_descriptions", {}).items()
            },
            # Built-in confidence templates.
            "confidence_templates": {
                "high": "{description} {details}",
                "medium": "This appears to be {description} {details}",
                "low": "This might be {description}, but the confidence is low. {details}"
            },
        }

        # Fill in anything that is still missing.
        self._initialize_default_templates(templates)
        return templates

    def _initialize_default_templates(self, templates: Dict):
        """
        Check the template dictionary and fill in any missing default templates.

        Since the templates moved into dedicated modules, this mainly serves as
        a safety net so the describer keeps working even when a template
        collection was not provided externally. Existing keys are left
        untouched; the dictionary is updated in place.

        Args:
            templates: The template dictionary to check and update
        """
        # Confidence templates - control the tone of the description
        templates.setdefault("confidence_templates", {
            "high": "{description} {details}",
            "medium": "This appears to be {description} {details}",
            "low": "This might be {description}, but the confidence is low. {details}"
        })

        # Scene detail templates
        templates.setdefault("scene_detail_templates", {
            "default": ["A space with various objects."]
        })

        # Object fillers - used to build object descriptions
        templates.setdefault("object_template_fillers", {
            "default": ["various items"]
        })

        # Viewpoint templates - simplified backup of the dedicated module
        templates.setdefault("viewpoint_templates", {
            "eye_level": {
                "prefix": "From eye level, ",
                "observation": "the scene is viewed straight on."
            },
            "aerial": {
                "prefix": "From above, ",
                "observation": "the scene is viewed from a bird's-eye perspective."
            }
        })

        # Cultural templates
        templates.setdefault("cultural_templates", {
            "asian": {
                "elements": ["cultural elements"],
                "description": "The scene has Asian characteristics."
            },
            "european": {
                "elements": ["architectural features"],
                "description": "The scene has European characteristics."
            }
        })

        # Lighting templates - describe the lighting conditions
        templates.setdefault("lighting_templates", {
            "day_clear": "The scene is captured during daylight.",
            "night": "The scene is captured at night.",
            "unknown": "The lighting conditions are not easily determined."
        })

    def _initialize_viewpoint_parameters(self):
        """
        Initialize parameters used for viewpoint detection.
        """
        self.viewpoint_params = {
            # Parameters for detecting aerial views
            "aerial_threshold": 0.7,  # High object density viewed from top
            "aerial_size_variance_threshold": 0.15,  # Low size variance in aerial views
            # Parameters for detecting low angle views
            "low_angle_threshold": 0.3,  # Bottom-heavy object distribution
            "vertical_size_ratio_threshold": 1.8,  # Vertical objects appear taller
            # Parameters for detecting elevated views
            "elevated_threshold": 0.6,  # Objects mostly in middle/bottom
            "elevated_top_threshold": 0.3  # Few objects at top of frame
        }

    def generate_description(self,
                             scene_type: str,
                             detected_objects: List[Dict],
                             confidence: float,
                             lighting_info: Optional[Dict] = None,
                             functional_zones: Optional[Dict] = None) -> str:
        """
        Generate enhanced scene description based on detection results,
        scene type, and additional contextual information.

        This is the main entry point that replaces the original
        _generate_scene_description.

        Args:
            scene_type: Identified scene type
            detected_objects: List of detected objects (dicts; "class_id" is
                read from every entry)
            confidence: Scene classification confidence
            lighting_info: Optional lighting condition information
            functional_zones: Optional identified functional zones

        Returns:
            str: Natural language description of the scene
        """
        # Unknown scene type or very low confidence: generic description only.
        if scene_type == "unknown" or confidence < 0.4:
            return self._format_final_description(
                self._generate_generic_description(detected_objects, lighting_info)
            )

        viewpoint = self._detect_viewpoint(detected_objects)
        is_aerial = viewpoint == "aerial"

        # Aerial views are mapped onto dedicated aerial scene types.
        if is_aerial:
            scene_type = self._resolve_aerial_scene_type(scene_type, detected_objects)

        # Cultural context is only detected for non-aerial viewpoints.
        cultural_context = None
        if not is_aerial:
            cultural_context = self._detect_cultural_context(scene_type, detected_objects)

        # NOTE: the confidence value only gates the generic fallback above;
        # the "confidence_templates" are not applied to the description here.

        scene_definition = self.scene_types.get(scene_type) if scene_type in self.scene_types else None

        # Base description for the scene type.
        if is_aerial:
            base_description = "An aerial view showing the layout and movement patterns from above"
        elif scene_definition is not None:
            base_description = scene_definition.get("description", "A scene")
        else:
            base_description = "A scene"

        # Detailed scene information (generated before the other fragments,
        # appended further below).
        scene_details = self._generate_scene_details(
            scene_type, detected_objects, lighting_info, viewpoint
        )

        description = base_description

        # Secondary description from the scene type definition, if any.
        if scene_definition is not None and "secondary_description" in scene_definition:
            secondary_desc = scene_definition["secondary_description"]
            if secondary_desc:
                description = self._smart_append(description, secondary_desc)

        # People count, unless the text already talks about people.
        people_count = len(
            [obj for obj in detected_objects if obj["class_id"] == self._PERSON_CLASS_ID]
        )
        if people_count:
            if people_count > 5:
                people_phrase = f"numerous people ({people_count})"
            else:
                people_phrase = f"{people_count} {'people' if people_count > 1 else 'person'}"

            lowered = description.lower()
            if "people" not in lowered and "pedestrian" not in lowered:
                description = self._smart_append(description, f"The scene includes {people_phrase}")

        # Cultural context (never set for aerial viewpoints).
        if cultural_context and not is_aerial:
            cultural_elements = self._generate_cultural_elements(cultural_context)
            if cultural_elements:
                description = self._smart_append(description, cultural_elements)

        if scene_details:
            description = self._smart_append(description, scene_details)

        # Lighting information.
        lighting_description = ""
        if lighting_info and "time_of_day" in lighting_info:
            lighting_templates = self.templates.get("lighting_templates", {})
            lighting_type = lighting_info["time_of_day"]
            if lighting_type in lighting_templates:
                lighting_description = lighting_templates[lighting_type]

        if lighting_description and lighting_description not in description:
            description = self._smart_append(description, lighting_description)

        # Viewpoint information (nothing is added for plain eye-level views).
        viewpoint_templates = self.templates.get("viewpoint_templates", {})
        if viewpoint != "eye_level" and viewpoint in viewpoint_templates:
            viewpoint_template = viewpoint_templates[viewpoint]

            # The prefix is a phrase like "From above, " that precedes the text.
            prefix = viewpoint_template.get('prefix', '')
            if prefix and not description.startswith(prefix):
                if description and description[0].isupper():
                    # Keep the flow by lowercasing the first letter after the prefix
                    description = prefix + description[0].lower() + description[1:]
                else:
                    description = prefix + description

            if is_aerial:
                scene_elements = "the crossing patterns and pedestrian movement"
            else:
                scene_elements = "objects and layout"

            viewpoint_desc = viewpoint_template.get("observation", "").format(
                scene_elements=scene_elements
            )
            if viewpoint_desc and viewpoint_desc not in description:
                description = self._smart_append(description, viewpoint_desc)

        # Functional zones.
        if functional_zones:
            zones_desc = self._describe_functional_zones(functional_zones)
            if zones_desc:
                description = self._smart_append(description, zones_desc)

        # With a crowd in the frame, drop sentences that quote a smaller count.
        if people_count > 5:
            description = self._drop_understated_people_counts(description, people_count)

        # Final formatting: punctuation, capitalization and spacing.
        return self._format_final_description(description)

    def _resolve_aerial_scene_type(self, scene_type: str, detected_objects: List[Dict]) -> str:
        """Map a scene type onto its aerial-view counterpart."""
        if "intersection" in scene_type or self._is_intersection(detected_objects):
            return "aerial_view_intersection"
        if any(keyword in scene_type for keyword in ["commercial", "shopping", "retail"]):
            return "aerial_view_commercial_area"
        if any(keyword in scene_type for keyword in ["plaza", "square"]):
            return "aerial_view_plaza"
        # Anything else seen from above is treated as an intersection.
        return "aerial_view_intersection"

    @staticmethod
    def _drop_understated_people_counts(description: str, people_count: int) -> str:
        """Remove sentences that mention fewer people than were detected."""
        small_people_patterns = [
            r"Area with \d+ people\.",
            r"Area with \d+ person\.",
            r"with \d+ people",
            r"with \d+ person"
        ]

        filtered_description = description
        for pattern in small_people_patterns:
            for match in re.findall(pattern, filtered_description):
                number_match = re.search(r'\d+', match)
                if not number_match:
                    continue
                if int(number_match.group()) < people_count:
                    # Drop every sentence that contains the understated count.
                    sentences = re.split(r'(?<=[.!?])\s+', filtered_description)
                    filtered_description = " ".join(
                        sentence for sentence in sentences if match not in sentence
                    )

        # Never let the clean-up wipe out the whole description.
        return filtered_description if filtered_description.strip() else description

    def _smart_append(self, current_text: str, new_fragment: str) -> str:
        """
        Intelligently append a new text fragment to the current text,
        handling punctuation and capitalization correctly.

        Args:
            current_text: The existing text to append to
            new_fragment: The new text fragment to append

        Returns:
            str: The combined text with proper formatting
        """
        if not new_fragment:
            return current_text

        if not current_text:
            # The very first fragment starts with an uppercase letter.
            return new_fragment[0].upper() + new_fragment[1:]

        current_text = current_text.rstrip()
        capitalized_fragment = new_fragment[0].upper() + new_fragment[1:]

        ends_with_sentence = current_text.endswith(('.', '!', '?'))
        ends_with_comma = current_text.endswith(',')

        # "A xxx" followed by "A yyy": two separate descriptions, so they
        # become two sentences. Existing terminal punctuation is respected so
        # no ".." or ",." sequence is produced.
        articles = ("A ", "An ")
        if current_text.startswith(articles) and new_fragment.startswith(articles):
            if ends_with_sentence:
                return current_text + " " + new_fragment
            return current_text.rstrip(",").rstrip() + ". " + new_fragment

        if ends_with_sentence:
            # After a finished sentence start a new, capitalized one.
            return current_text + " " + capitalized_fragment

        if ends_with_comma:
            # After a comma the fragment continues the sentence unchanged.
            return current_text + " " + new_fragment

        lowered_fragment = new_fragment.lower()
        if "scene is" in lowered_fragment or "scene includes" in lowered_fragment:
            # A new statement about the scene becomes its own sentence.
            return current_text + ". " + new_fragment

        if self._is_related_phrases(current_text, new_fragment):
            return current_text + ", " + new_fragment

        # Unrelated phrases are separated by a period.
        return current_text + ". " + capitalized_fragment

    def _is_related_phrases(self, text1: str, text2: str) -> bool:
        """
        Determine if two phrases are related and should be connected with a
        comma rather than separated with a period.

        Args:
            text1: The first text fragment
            text2: The second text fragment to be appended

        Returns:
            bool: Whether the phrases appear to be related
        """
        # Two fragments that both start with "A"/"An" are separate descriptions.
        articles = ("A ", "An ")
        if text1.startswith(articles) and text2.startswith(articles):
            return False

        # The second phrase starts with a connecting word.
        connecting_words = {"which", "where", "who", "whom", "whose", "with", "without",
                            "this", "these", "that", "those", "and", "or", "but"}
        words = text2.split()  # empty for a whitespace-only fragment
        if words and words[0].lower() in connecting_words:
            return True

        # The first phrase ends with something that suggests continuity.
        ending_patterns = ("such as", "including", "like", "especially", "particularly",
                           "for example", "for instance", "namely", "specifically")
        if text1.lower().endswith(ending_patterns):
            return True

        # Everything else, including two separate statements about the scene,
        # is treated as unrelated.
        return False

    def _format_final_description(self, text: str) -> str:
        """
        Format the final description text to ensure correct punctuation,
        capitalization, and spacing.

        Args:
            text: The text to format

        Returns:
            str: The properly formatted text ("" for empty input)
        """
        if not text:
            return ""

        # 1. Split consecutive fragments that each start with a capital
        #    "A"/"An". This is case-sensitive and anchored at a word boundary
        #    on purpose: matching lowercase articles (or a word that merely
        #    ends in "a") would cut ordinary sentences such as
        #    "A kitchen with a table" in half.
        text = re.sub(r'\b(An?\s[^.!?]+?)\s+(An?\s)', r'\1. \2', text)

        # 2. Make sure the first letter is capitalized
        text = text[0].upper() + text[1:] if text else ""

        # 3. Fix spacing between words
        text = re.sub(r'\s{2,}', ' ', text)  # collapse runs of whitespace
        text = re.sub(r'([a-z])([A-Z])', r'\1 \2', text)  # space between lowercase and uppercase

        # 4. Fix known fused words such as "plantsand". A generic
        #    "<letter>and" / "<letter>with" rule is deliberately not used: it
        #    breaks real words ("sandwich", "handbag", "standing").
        text = re.sub(r'plants(and|with|or)', r'plants \1', text)

        # 5. Capitalize the first letter after a period
        text = re.sub(r'\.(\s+)([a-z])', lambda m: f'.{m.group(1)}{m.group(2).upper()}', text)

        # 6. Lowercase capitalized words that follow a comma
        def fix_capitalization_after_comma(match):
            word = match.group(2)
            # Exceptions: keep the first-person pronoun capitalized
            if word in ["I", "I'm", "I've", "I'd", "I'll"]:
                return match.group(0)
            # Keep months and weekdays capitalized
            proper_nouns = ["January", "February", "March", "April", "May", "June", "July",
                            "August", "September", "October", "November", "December",
                            "Monday", "Tuesday", "Wednesday", "Thursday", "Friday",
                            "Saturday", "Sunday"]
            if word in proper_nouns:
                return match.group(0)
            # Everything else gets a lowercase first letter
            return match.group(1) + word[0].lower() + word[1:]

        text = re.sub(r'(,\s+)([A-Z][a-zA-Z]*)', fix_capitalization_after_comma, text)

        # Lowercase zone names that appear mid-sentence. The lookbehind
        # requires a lowercase letter or clause punctuation before the phrase,
        # so a phrase that starts a sentence keeps its capital letter.
        common_phrases = [
            (r'Social or seating area', r'social or seating area'),
            (r'Sleeping area', r'sleeping area'),
            (r'Dining area', r'dining area'),
            (r'Living space', r'living space')
        ]
        for phrase, replacement in common_phrases:
            text = re.sub(r'(?<=[a-z,;:]\s)' + phrase, replacement, text)

        # 7. Spacing around punctuation
        text = re.sub(r'\s+([.,;:!?])', r'\1', text)  # no space before punctuation
        text = re.sub(r'([.,;:!?])([a-zA-Z0-9])', r'\1 \2', text)  # one space after punctuation

        # 8. Collapse repeated punctuation
        text = re.sub(r'\.{2,}', '.', text)
        text = re.sub(r',{2,}', ',', text)

        # 9. Make sure the text ends with punctuation
        if text and text[-1] not in '.!?':
            text += '.'

        return text

    def _is_intersection(self, detected_objects: List[Dict]) -> bool:
        """
        Decide whether the scene is an intersection by analysing how the
        pedestrians are distributed.

        Args:
            detected_objects: List of detected objects

        Returns:
            bool: True when at least 8 people are spread widely, and about
                equally, along both axes
        """
        pedestrians = [obj for obj in detected_objects if obj["class_id"] == self._PERSON_CLASS_ID]

        # Enough pedestrians are needed to form a crossing pattern.
        if len(pedestrians) < 8:
            return False

        positions = [obj.get("normalized_center", (0, 0)) for obj in pedestrians]
        x_coords = [pos[0] for pos in positions]
        y_coords = [pos[1] for pos in positions]

        x_range = max(x_coords) - min(x_coords)
        y_range = max(y_coords) - min(y_coords)

        # A large and similar spread in both directions suggests an
        # intersection. The y_range check short-circuits before the division,
        # so the ratio is never computed with a zero denominator.
        return bool(x_range > 0.5 and y_range > 0.5 and 0.7 < (x_range / y_range) < 1.3)

    @classmethod
    def _pluralize(cls, name: str) -> str:
        """Return a plural form of an object class name."""
        if name in cls._IRREGULAR_PLURALS:
            return cls._IRREGULAR_PLURALS[name]
        if name.endswith(("s", "x", "z", "ch", "sh")):
            return name + "es"
        if len(name) > 1 and name.endswith("y") and name[-2] not in "aeiou":
            return name[:-1] + "ies"
        return name + "s"

    @staticmethod
    def _join_as_list(items: List[str]) -> str:
        """Join items as "a", "a and b" or "a, b, and c"."""
        if not items:
            return ""
        if len(items) == 1:
            return items[0]
        if len(items) == 2:
            return f"{items[0]} and {items[1]}"
        return ", ".join(items[:-1]) + f", and {items[-1]}"

    def _generate_generic_description(self, detected_objects: List[Dict], lighting_info: Optional[Dict] = None) -> str:
        """
        Generate a generic description when scene type is unknown or confidence is very low.

        Args:
            detected_objects: List of detected objects ("class_name" is read
                from every entry)
            lighting_info: Optional lighting condition information

        Returns:
            str: Generic description based on detected objects
        """
        # Count object occurrences (insertion order is kept for ties).
        obj_counts = {}
        for obj in detected_objects:
            class_name = obj["class_name"]
            obj_counts[class_name] = obj_counts.get(class_name, 0) + 1

        # The five most frequent classes.
        top_objects = sorted(obj_counts.items(), key=lambda item: item[1], reverse=True)[:5]

        if not top_objects:
            base_desc = "No clearly identifiable objects are visible in this scene."
        else:
            objects_text = [
                f"{count} {self._pluralize(name)}" if count > 1 else name
                for name, count in top_objects
            ]
            base_desc = f"This scene contains {self._join_as_list(objects_text)}."

        # Add lighting information if available
        if lighting_info and "time_of_day" in lighting_info:
            lighting_templates = self.templates.get("lighting_templates", {})
            lighting_type = lighting_info["time_of_day"]
            if lighting_type in lighting_templates:
                base_desc += f" {lighting_templates[lighting_type]}"

        return base_desc

    def _generate_scene_details(self,
                                scene_type: str,
                                detected_objects: List[Dict],
                                lighting_info: Optional[Dict] = None,
                                viewpoint: str = "eye_level") -> str:
        """
        Generate detailed description based on scene type and detected objects.

        Args:
            scene_type: Identified scene type
            detected_objects: List of detected objects
            lighting_info: Optional lighting condition information (not used
                by this method)
            viewpoint: Detected viewpoint (aerial, eye_level, etc.)

        Returns:
            str: Detailed scene description ("" when the selected template
                list is empty)
        """
        scene_templates = self.templates.get("scene_detail_templates", {})

        if scene_type in scene_templates:
            # Prefer a viewpoint-specific template list when one exists.
            viewpoint_key = f"{scene_type}_{viewpoint}"
            if viewpoint_key in scene_templates:
                templates_list = scene_templates[viewpoint_key]
            else:
                templates_list = scene_templates[scene_type]

            if not templates_list:
                return ""

            detail_template = random.choice(templates_list)
            return self._fill_detail_template(detail_template, detected_objects, scene_type)

        # No scene-specific templates: use the default ones when there are any.
        default_templates = scene_templates.get("default")
        if default_templates:
            detail_template = random.choice(default_templates)
            return self._fill_detail_template(detail_template, detected_objects, "default")

        # Last resort: hand-written basic details.
        return self._generate_basic_details(scene_type, detected_objects)

    def _fill_detail_template(self, template: str, detected_objects: List[Dict], scene_type: str) -> str:
        """
        Fill a template with specific details based on detected objects.

        Each {placeholder} is resolved in this order: a random selection of
        1-3 entries from the matching filler list, scene-specific logic, the
        built-in default text, and finally "various items".

        Args:
            template: Template string with placeholders
            detected_objects: List of detected objects
            scene_type: Identified scene type

        Returns:
            str: Filled template
        """
        # Placeholders use the simple {placeholder} syntax.
        placeholders = re.findall(r'\{([^}]+)\}', template)
        fillers = self.templates.get("object_template_fillers", {})

        filled_template = template
        for placeholder in placeholders:
            options = fillers.get(placeholder)
            if options:
                # Pick 1-3 entries from the filler list.
                num_items = min(len(options), random.randint(1, 3))
                selected_items = random.sample(options, num_items)
                replacement = self._join_as_list(selected_items)
            else:
                # No filler list, or an empty one: fall back so the raw
                # "{placeholder}" never leaks into the description.
                replacement = self._generate_placeholder_content(
                    placeholder, detected_objects, scene_type
                )
                if not replacement:
                    replacement = self._DEFAULT_PLACEHOLDER_TEXT.get(placeholder, "various items")

            filled_template = filled_template.replace(f"{{{placeholder}}}", replacement)

        return filled_template

    @staticmethod
    def _unique_class_names(detected_objects: List[Dict], class_ids: List[int], limit: int) -> str:
        """Join the distinct class names of the first `limit` objects in `class_ids`."""
        matching = [obj for obj in detected_objects if obj["class_id"] in class_ids]
        names = [obj["class_name"] for obj in matching[:limit]]
        # dict.fromkeys removes duplicates while keeping detection order, so
        # the text is the same on every run (a set would not guarantee that).
        return ", ".join(dict.fromkeys(names))

    def _generate_placeholder_content(self, placeholder: str, detected_objects: List[Dict], scene_type: str) -> str:
        """
        Generate content for a template placeholder based on scene-specific logic.

        Args:
            placeholder: Template placeholder
            detected_objects: List of detected objects
            scene_type: Identified scene type (not used by the current rules)

        Returns:
            str: Content for the placeholder, or "" when there is no rule for it
        """
        if placeholder == "furniture":
            furniture_ids = [56, 57, 58, 59, 60, 61]  # Example furniture IDs
            return (self._unique_class_names(detected_objects, furniture_ids, 3)
                    or "various furniture items")

        if placeholder == "electronics":
            electronics_ids = [62, 63, 64, 65, 66, 67, 68, 69, 70]  # Example electronics IDs
            return (self._unique_class_names(detected_objects, electronics_ids, 3)
                    or "electronic devices")

        if placeholder == "people_count":
            people_count = len(
                [obj for obj in detected_objects if obj["class_id"] == self._PERSON_CLASS_ID]
            )
            if people_count == 0:
                return "no people"
            if people_count == 1:
                return "one person"
            if people_count < 5:
                return f"{people_count} people"
            return "several people"

        if placeholder == "seating":
            seating_ids = [56, 57]  # chair, sofa
            return (self._unique_class_names(detected_objects, seating_ids, 2)
                    or "seating arrangements")

        # No rule for this placeholder
        return ""

    def _generate_basic_details(self, scene_type: str, detected_objects: List[Dict]) -> str:
        """
        Generate basic details when templates aren't available.

        Args:
            scene_type: Identified scene type
            detected_objects: List of detected objects

        Returns:
            str: Basic scene details ("" when a kitchen/dining or street scene
                has nothing worth mentioning)
        """
        person_id = self._PERSON_CLASS_ID

        if scene_type == "living_room":
            tv_objs = [obj for obj in detected_objects if obj["class_id"] == 62]  # TV
            sofa_objs = [obj for obj in detected_objects if obj["class_id"] == 57]  # Sofa

            if tv_objs and sofa_objs:
                tv_region = tv_objs[0]["region"]
                sofa_region = sofa_objs[0]["region"]

                arrangement = f"The TV is in the {tv_region.replace('_', ' ')} of the image, "
                arrangement += f"while the sofa is in the {sofa_region.replace('_', ' ')}. "

                return f"{arrangement}This appears to be a space designed for relaxation and entertainment."

        elif scene_type == "bedroom":
            bed_objs = [obj for obj in detected_objects if obj["class_id"] == 59]  # Bed

            if bed_objs:
                bed_region = bed_objs[0]["region"]

                extra_names = {74: "clock", 73: "book"}
                # Each extra is mentioned once, in detection order.
                extra_items = list(dict.fromkeys(
                    extra_names[obj["class_id"]]
                    for obj in detected_objects
                    if obj["class_id"] in extra_names
                ))

                extras = ""
                if extra_items:
                    extras = f" There is also a {' and a '.join(extra_items)} visible."

                return f"The bed is located in the {bed_region.replace('_', ' ')} of the image.{extras}"

        elif scene_type in ["dining_area", "kitchen"]:
            # Food and dining-related items, deduplicated in detection order
            # so the wording does not change between runs.
            kitchen_ids = [39, 41, 42, 43, 44, 45]  # Kitchen items
            unique_items = list(dict.fromkeys(
                obj["class_name"] for obj in detected_objects if obj["class_id"] in kitchen_ids
            ))

            if not unique_items:
                # Nothing to add; a bare "." is not a useful detail.
                return ""

            if len(unique_items) <= 3:
                food_str = f" with {', '.join(unique_items)}"
            else:
                food_str = f" with {', '.join(unique_items[:3])} and other items"

            return f"{food_str}."

        elif scene_type == "city_street":
            people_count = len([obj for obj in detected_objects if obj["class_id"] == person_id])
            # Bicycle, car, motorbike, bus, truck
            vehicle_count = len([obj for obj in detected_objects
                                 if obj["class_id"] in [1, 2, 3, 5, 7]])

            people_text = f"{people_count} {'people' if people_count > 1 else 'person'}"
            vehicle_text = f"{vehicle_count} {'vehicles' if vehicle_count > 1 else 'vehicle'}"

            if people_count > 0 and vehicle_count > 0:
                return f" with {people_text} and {vehicle_text}."
            if people_count > 0:
                return f" with {people_text}."
            if vehicle_count > 0:
                return f" with {vehicle_text}."
            # Empty street: nothing to add.
            return ""

        # More specialized scenes
        elif scene_type == "asian_commercial_street":
            people_count = len([obj for obj in detected_objects if obj["class_id"] == person_id])
            vehicle_count = len([obj for obj in detected_objects if obj["class_id"] in [1, 2, 3]])

            # Positions of the people that carry one; entries without a
            # center are skipped instead of raising.
            people_positions = [
                obj["normalized_center"]
                for obj in detected_objects
                if obj["class_id"] == person_id and "normalized_center" in obj
            ]

            # People lined up at a similar height indicate a walking path.
            structured_path = False
            if len(people_positions) >= 3:
                y_coords = [pos[1] for pos in people_positions]
                y_mean = sum(y_coords) / len(y_coords)
                y_variance = sum((y - y_mean) ** 2 for y in y_coords) / len(y_coords)
                # Low variance indicates a linear arrangement
                structured_path = y_variance < 0.05

            street_desc = "A commercial street with "

            if people_count > 0:
                street_desc += f"{people_count} {'pedestrians' if people_count > 1 else 'pedestrian'}"
                if vehicle_count > 0:
                    street_desc += f" and {vehicle_count} {'vehicles' if vehicle_count > 1 else 'vehicle'}"
            elif vehicle_count > 0:
                street_desc += f"{vehicle_count} {'vehicles' if vehicle_count > 1 else 'vehicle'}"
            else:
                street_desc += "various commercial elements"

            if structured_path:
                street_desc += ". The pedestrians appear to be following a defined walking path"

            # Add cultural elements
            street_desc += ". The signage and architectural elements suggest an Asian urban setting."

            return street_desc

        # Default general description
        return "The scene contains various elements characteristic of this environment."

def _detect_viewpoint(self, detected_objects: List[Dict]) -> str: """ 改進視角檢測,特別加強對空中俯視視角的識別。 Args: detected_objects: 檢測到的物體列表 Returns: str: 檢測到的視角類型 """ if not detected_objects: return "eye_level" # default # 提取物體位置和大小 top_region_count = 0 bottom_region_count = 0 total_objects = len(detected_objects) # 追蹤大小分布以檢測空中視角 sizes = [] # 垂直大小比例用於低角度檢測 height_width_ratios = [] # 用於檢測規則圖案的變數 people_positions = [] crosswalk_pattern_detected = False for obj in detected_objects: # 計算頂部/底部區域中的物體 region = obj["region"] if "top" in region: top_region_count += 1 elif "bottom" in region: bottom_region_count += 1 # 計算標準化大小(面積) if "normalized_area" in obj: sizes.append(obj["normalized_area"]) # 計算高度/寬度比例 if "normalized_size" in obj: width, height = obj["normalized_size"] if width > 0: height_width_ratios.append(height / width) # 收集人的位置用於圖案檢測 if obj["class_id"] == 0: # 人 if "normalized_center" in obj: people_positions.append(obj["normalized_center"]) # 專門為斑馬線十字路口添加檢測邏輯 # 檢查是否有明顯的垂直和水平行人分布 people_objs = [obj for obj in detected_objects if obj["class_id"] == 0] # 人 if len(people_objs) >= 8: # 需要足夠多的人才能形成十字路口模式 # 檢查是否有斑馬線模式 - 新增功能 if len(people_positions) >= 4: # 對位置進行聚類分析,尋找線性分布 x_coords = [pos[0] for pos in people_positions] y_coords = [pos[1] for pos in people_positions] # 計算 x 和 y 坐標的變異數和範圍 x_variance = np.var(x_coords) if len(x_coords) > 1 else 0 y_variance = np.var(y_coords) if len(y_coords) > 1 else 0 x_range = max(x_coords) - min(x_coords) y_range = max(y_coords) - min(y_coords) # 嘗試檢測十字形分布 # 如果 x 和 y 方向都有較大範圍,且範圍相似,可能是十字路口 if x_range > 0.5 and y_range > 0.5 and 0.7 < (x_range / y_range) < 1.3: # 計算到中心點的距離 center_x = np.mean(x_coords) center_y = np.mean(y_coords) # 將點映射到十字架的軸上(水平和垂直) x_axis_distance = [abs(x - center_x) for x in x_coords] y_axis_distance = [abs(y - center_y) for y in y_coords] # 點應該接近軸線(水平或垂直) # 對於每個點,檢查它是否接近水平或垂直軸線 close_to_axis_count = 0 for i in range(len(x_coords)): if x_axis_distance[i] < 0.1 or y_axis_distance[i] < 0.1: close_to_axis_count += 1 # 
如果足夠多的點接近軸線,認為是十字路口 if close_to_axis_count >= len(x_coords) * 0.6: crosswalk_pattern_detected = True # 如果沒有檢測到十字形,嘗試檢測線性聚類分布 if not crosswalk_pattern_detected: # 檢查 x 和 y 方向的聚類 x_clusters = self._detect_linear_clusters(x_coords) y_clusters = self._detect_linear_clusters(y_coords) # 如果在 x 和 y 方向上都有多個聚類,可能是交叉的斑馬線 if len(x_clusters) >= 2 and len(y_clusters) >= 2: crosswalk_pattern_detected = True # 檢測斑馬線模式 - 優先判斷 if crosswalk_pattern_detected: return "aerial" # 檢測行人分布情況 if len(people_objs) >= 10: people_region_counts = {} for obj in people_objs: region = obj["region"] if region not in people_region_counts: people_region_counts[region] = 0 people_region_counts[region] += 1 # 計算不同區域中的行人數量 region_count = len([r for r, c in people_region_counts.items() if c >= 2]) # 如果行人分布在多個區域中,可能是空中視角 if region_count >= 4: # 檢查行人分布的模式 # 特別是檢查不同區域中行人數量的差異 region_counts = list(people_region_counts.values()) region_counts_variance = np.var(region_counts) if len(region_counts) > 1 else 0 region_counts_mean = np.mean(region_counts) if region_counts else 0 # 如果行人分布較為均勻(變異係數小),可能是空中視角 if region_counts_mean > 0: variation_coefficient = region_counts_variance / region_counts_mean if variation_coefficient < 0.5: return "aerial" # 計算指標 top_ratio = top_region_count / total_objects if total_objects > 0 else 0 bottom_ratio = bottom_region_count / total_objects if total_objects > 0 else 0 # 大小變異數(標準化) size_variance = 0 if sizes: mean_size = sum(sizes) / len(sizes) size_variance = sum((s - mean_size) ** 2 for s in sizes) / len(sizes) size_variance = size_variance / (mean_size ** 2) # 標準化 # 平均高度/寬度比例 avg_height_width_ratio = sum(height_width_ratios) / len(height_width_ratios) if height_width_ratios else 1.0 # 空中視角:低大小差異,物體均勻分布,底部很少或沒有物體 if (size_variance < self.viewpoint_params["aerial_size_variance_threshold"] and bottom_ratio < 0.3 and top_ratio > self.viewpoint_params["aerial_threshold"]): return "aerial" # 低角度視角:物體傾向於比寬高,頂部較多物體 elif (avg_height_width_ratio > 
self.viewpoint_params["vertical_size_ratio_threshold"] and top_ratio > self.viewpoint_params["low_angle_threshold"]): return "low_angle" # 高視角:底部較多物體,頂部較少 elif (bottom_ratio > self.viewpoint_params["elevated_threshold"] and top_ratio < self.viewpoint_params["elevated_top_threshold"]): return "elevated" # 默認:平視角 return "eye_level" def _detect_linear_clusters(self, coords, threshold=0.05): """ 檢測坐標中的線性聚類 Args: coords: 一維坐標列表 threshold: 聚類閾值 Returns: list: 聚類列表 """ if not coords: return [] # 排序坐標 sorted_coords = sorted(coords) clusters = [] current_cluster = [sorted_coords[0]] for i in range(1, len(sorted_coords)): # 如果當前坐標與前一個接近,添加到當前聚類 if sorted_coords[i] - sorted_coords[i-1] < threshold: current_cluster.append(sorted_coords[i]) else: # 否則開始新的聚類 if len(current_cluster) >= 2: # 至少需要2個點形成聚類 clusters.append(current_cluster) current_cluster = [sorted_coords[i]] # 添加最後一個cluster if len(current_cluster) >= 2: clusters.append(current_cluster) return clusters def _detect_cultural_context(self, scene_type: str, detected_objects: List[Dict]) -> Optional[str]: """ Detect the likely cultural context of the scene. Args: scene_type: Identified scene type detected_objects: List of detected objects Returns: Optional[str]: Detected cultural context (asian, european, etc.) or None """ # Scene types with explicit cultural contexts cultural_scene_mapping = { "asian_commercial_street": "asian", "asian_night_market": "asian", "asian_temple_area": "asian", "european_plaza": "european" } # Check if scene type directly indicates cultural context if scene_type in cultural_scene_mapping: return cultural_scene_mapping[scene_type] # No specific cultural context detected return None def _generate_cultural_elements(self, cultural_context: str) -> str: """ Generate description of cultural elements for the detected context. 
Args: cultural_context: Detected cultural context Returns: str: Description of cultural elements """ # Get template for this cultural context cultural_templates = self.templates.get("cultural_templates", {}) if cultural_context in cultural_templates: template = cultural_templates[cultural_context] elements = template.get("elements", []) if elements: # Select 1-2 random elements num_elements = min(len(elements), random.randint(1, 2)) selected_elements = random.sample(elements, num_elements) # Format elements list elements_text = " and ".join(selected_elements) if num_elements == 2 else selected_elements[0] # Fill template return template.get("description", "").format(elements=elements_text) return "" def _optimize_object_description(self, description: str) -> str: """ 優化物品描述,避免重複列舉相同物品 """ import re # 處理床鋪重複描述 if "bed in the room" in description: description = description.replace("a bed in the room", "a bed") # 處理重複的物品列表 # 尋找格式如 "item, item, item" 的模式 object_lists = re.findall(r'with ([^\.]+?)(?:\.|\band\b)', description) for obj_list in object_lists: # 計算每個物品出現次數 items = re.findall(r'([a-zA-Z\s]+)(?:,|\band\b|$)', obj_list) item_counts = {} for item in items: item = item.strip() if item and item not in ["and", "with"]: if item not in item_counts: item_counts[item] = 0 item_counts[item] += 1 # 生成優化後的物品列表 if item_counts: new_items = [] for item, count in item_counts.items(): if count > 1: new_items.append(f"{count} {item}s") else: new_items.append(item) # 格式化新列表 if len(new_items) == 1: new_list = new_items[0] elif len(new_items) == 2: new_list = f"{new_items[0]} and {new_items[1]}" else: new_list = ", ".join(new_items[:-1]) + f", and {new_items[-1]}" # 替換原始列表 description = description.replace(obj_list, new_list) return description def _describe_functional_zones(self, functional_zones: Dict) -> str: """ 生成場景功能區域的描述,優化處理行人區域、人數統計和物品重複問題。 Args: functional_zones: 識別出的功能區域字典 Returns: str: 功能區域描述 """ if not functional_zones: return "" # 計算場景中的總人數 total_people_count = 0 
people_by_zone = {} # 計算每個區域的人數並累計總人數 for zone_name, zone_info in functional_zones.items(): if "objects" in zone_info: zone_people_count = zone_info["objects"].count("person") people_by_zone[zone_name] = zone_people_count total_people_count += zone_people_count # 分類區域為行人區域和其他區域 pedestrian_zones = [] other_zones = [] for zone_name, zone_info in functional_zones.items(): # 檢查是否是行人相關區域 if any(keyword in zone_name.lower() for keyword in ["pedestrian", "crossing", "people"]): pedestrian_zones.append((zone_name, zone_info)) else: other_zones.append((zone_name, zone_info)) # 獲取最重要的行人區域和其他區域 main_pedestrian_zones = sorted(pedestrian_zones, key=lambda z: people_by_zone.get(z[0], 0), reverse=True)[:1] # 最多1個主要行人區域 top_other_zones = sorted(other_zones, key=lambda z: len(z[1].get("objects", [])), reverse=True)[:2] # 最多2個其他區域 # 合併區域 top_zones = main_pedestrian_zones + top_other_zones if not top_zones: return "" # 生成匯總描述 summary = "" max_mentioned_people = 0 # 跟踪已經提到的最大人數 # 如果總人數顯著且還沒在主描述中提到,添加總人數描述 if total_people_count > 5: summary = f"The scene contains a significant number of pedestrians ({total_people_count} people). " max_mentioned_people = total_people_count # 更新已提到的最大人數 # 處理每個區域的描述,確保人數信息的一致性 processed_zones = [] for zone_name, zone_info in top_zones: zone_desc = zone_info.get("description", "a functional zone") zone_people_count = people_by_zone.get(zone_name, 0) # 檢查描述中是否包含人數信息 contains_people_info = "with" in zone_desc and ("person" in zone_desc.lower() or "people" in zone_desc.lower()) # 如果描述包含人數信息,且人數較小(小於已提到的最大人數),則修改描述 if contains_people_info and zone_people_count < max_mentioned_people: parts = zone_desc.split("with") if len(parts) > 1: # 移除人數部分 zone_desc = parts[0].strip() + " area" processed_zones.append((zone_name, {"description": zone_desc})) # 根據處理後的區域數量生成最終描述 final_desc = "" if len(processed_zones) == 1: _, zone_info = processed_zones[0] zone_desc = zone_info["description"] final_desc = summary + f"The scene includes {zone_desc}." 
elif len(processed_zones) == 2: _, zone1_info = processed_zones[0] _, zone2_info = processed_zones[1] zone1_desc = zone1_info["description"] zone2_desc = zone2_info["description"] final_desc = summary + f"The scene is divided into two main areas: {zone1_desc} and {zone2_desc}." else: zones_desc = ["The scene contains multiple functional areas including"] zone_descriptions = [z[1]["description"] for z in processed_zones] # 格式化最終的多區域描述 if len(zone_descriptions) == 3: formatted_desc = f"{zone_descriptions[0]}, {zone_descriptions[1]}, and {zone_descriptions[2]}" else: formatted_desc = ", ".join(zone_descriptions[:-1]) + f", and {zone_descriptions[-1]}" final_desc = summary + f"{zones_desc[0]} {formatted_desc}." return self._optimize_object_description(final_desc)