DawnC committed on
Commit
3172319
·
verified ·
1 Parent(s): de894d3

Upload 27 files

Browse files
.gitattributes CHANGED
@@ -37,3 +37,5 @@ room_01.jpg filter=lfs diff=lfs merge=lfs -text
37
  street_01.jpg filter=lfs diff=lfs merge=lfs -text
38
  street_02.jpg filter=lfs diff=lfs merge=lfs -text
39
  street_03.jpg filter=lfs diff=lfs merge=lfs -text
 
 
 
37
  street_01.jpg filter=lfs diff=lfs merge=lfs -text
38
  street_02.jpg filter=lfs diff=lfs merge=lfs -text
39
  street_03.jpg filter=lfs diff=lfs merge=lfs -text
40
+ room_02.jpg filter=lfs diff=lfs merge=lfs -text
41
+ street_04.jpg filter=lfs diff=lfs merge=lfs -text
activity_templates.py ADDED
@@ -0,0 +1,324 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ ACTIVITY_TEMPLATES = {
3
+ "living_room": [
4
+ "Watching TV",
5
+ "Relaxing on the sofa",
6
+ "Reading",
7
+ "Socializing"
8
+ ],
9
+ "bedroom": [
10
+ "Sleeping",
11
+ "Resting",
12
+ "Getting dressed",
13
+ "Reading in bed"
14
+ ],
15
+ "dining_area": [
16
+ "Eating a meal",
17
+ "Having a conversation",
18
+ "Working at table"
19
+ ],
20
+ "kitchen": [
21
+ "Cooking",
22
+ "Food preparation",
23
+ "Cleaning dishes"
24
+ ],
25
+ "office_workspace": [
26
+ "Working on computer",
27
+ "Office work",
28
+ "Virtual meetings",
29
+ "Reading documents"
30
+ ],
31
+ "meeting_room": [
32
+ "Group meeting",
33
+ "Presentation",
34
+ "Team discussion",
35
+ "Collaboration"
36
+ ],
37
+ "city_street": [
38
+ "Walking",
39
+ "Commuting",
40
+ "Shopping",
41
+ "Waiting for transportation"
42
+ ],
43
+ "parking_lot": [
44
+ "Parking vehicles",
45
+ "Loading/unloading items",
46
+ "Entering/exiting vehicles"
47
+ ],
48
+ "park_area": [
49
+ "Walking",
50
+ "Relaxing outdoors",
51
+ "Exercising",
52
+ "Social gathering"
53
+ ],
54
+ "retail_store": [
55
+ "Shopping",
56
+ "Browsing products",
57
+ "Purchasing items"
58
+ ],
59
+ "supermarket": [
60
+ "Grocery shopping",
61
+ "Selecting products",
62
+ "Checking out"
63
+ ],
64
+ "upscale_dining": [
65
+ "Fine dining",
66
+ "Social gathering",
67
+ "Special occasion meal",
68
+ "Family dinner",
69
+ "Business meeting",
70
+ "Celebratory meal"
71
+ ],
72
+ "asian_commercial_street": [
73
+ "Shopping",
74
+ "Sightseeing",
75
+ "Walking to destinations",
76
+ "Visiting local shops",
77
+ "Cultural exploration",
78
+ "Urban commuting",
79
+ "Meeting friends"
80
+ ],
81
+ "financial_district": [
82
+ "Commuting",
83
+ "Business travel",
84
+ "Urban transit",
85
+ "Sightseeing",
86
+ "City navigation",
87
+ "Professional activities",
88
+ "Corporate meetings"
89
+ ],
90
+ "urban_intersection": [
91
+ "Street crossing",
92
+ "Waiting for signals",
93
+ "Urban navigation",
94
+ "Commuting",
95
+ "Group movement",
96
+ "Following traffic patterns",
97
+ "Pedestrian coordination"
98
+ ],
99
+ "transit_hub": [
100
+ "Commuting",
101
+ "Waiting for transportation",
102
+ "Transferring between vehicles",
103
+ "Starting/ending journeys",
104
+ "Meeting travelers",
105
+ "Checking transit schedules",
106
+ "Urban transportation"
107
+ ],
108
+ "shopping_district": [
109
+ "Retail shopping",
110
+ "Window browsing",
111
+ "Social shopping",
112
+ "Product comparison",
113
+ "Making purchases",
114
+ "Brand exploration",
115
+ "Recreational shopping"
116
+ ],
117
+ "bus_stop": [
118
+ "Waiting for the bus",
119
+ "Checking schedules",
120
+ "Boarding or alighting",
121
+ "Standing under shelter"
122
+ ],
123
+ "bus_station": [
124
+ "Navigating between platforms",
125
+ "Handling luggage",
126
+ "Boarding buses",
127
+ "Gathering at waiting areas"
128
+ ],
129
+ "zoo": [
130
+ "Watching animal exhibits",
131
+ "Taking photos of wildlife",
132
+ "Walking along enclosures",
133
+ "Reading informational signs"
134
+ ],
135
+ "harbor": [
136
+ "Observing docked boats",
137
+ "Commuting by watercraft",
138
+ "Loading or unloading cargo",
139
+ "Strolling along the pier"
140
+ ],
141
+ "playground": [
142
+ "Playing ball games",
143
+ "Swinging or sliding",
144
+ "Running around",
145
+ "Socializing with friends"
146
+ ],
147
+ "sports_field": [
148
+ "Practicing ball drills",
149
+ "Competing in matches",
150
+ "Warming up or stretching",
151
+ "Team training sessions"
152
+ ],
153
+ "narrow_commercial_alley": [
154
+ "Walking through alley",
155
+ "Browsing storefronts",
156
+ "Navigating light traffic",
157
+ "Carrying shopping bags"
158
+ ],
159
+ "daytime_shopping_street": [
160
+ "Shopping",
161
+ "Window browsing",
162
+ "Street photography",
163
+ "Commuting by vehicle"
164
+ ],
165
+ "urban_pedestrian_crossing": [
166
+ "Crossing the street",
167
+ "Waiting for signal",
168
+ "Following traffic rules",
169
+ "Checking for vehicles"
170
+ ],
171
+ "aerial_view_intersection": [
172
+ "Crossing multiple directions",
173
+ "Following traffic signals",
174
+ "Navigating pedestrian paths",
175
+ "Traffic management",
176
+ "Multi-directional movement",
177
+ "Organized crossing patterns",
178
+ "Waiting at signals"
179
+ ],
180
+ "aerial_view_commercial_area": [
181
+ "Shopping district navigation",
182
+ "Retail browsing",
183
+ "Store-to-store movement",
184
+ "Commercial zone foot traffic",
185
+ "Shopping center traversal",
186
+ "Retail area engagement",
187
+ "Walking between stores"
188
+ ],
189
+ "aerial_view_plaza": [
190
+ "Public gathering",
191
+ "Open space traversal",
192
+ "Community congregation",
193
+ "Plaza navigation",
194
+ "Public square activities",
195
+ "Urban space utilization"
196
+ ],
197
+ "asian_night_market": [
198
+ "Street food sampling",
199
+ "Night market browsing",
200
+ "Evening shopping",
201
+ "Cultural food exploration",
202
+ "Vendor interaction",
203
+ "Social night dining",
204
+ "Market stall hopping"
205
+ ],
206
+ "asian_temple_area": [
207
+ "Temple visiting",
208
+ "Cultural site exploration",
209
+ "Spiritual observance",
210
+ "Traditional rituals",
211
+ "Historical site appreciation",
212
+ "Religious tourism",
213
+ "Cultural photography"
214
+ ],
215
+ "european_plaza": [
216
+ "Urban sightseeing",
217
+ "Historical appreciation",
218
+ "Tourist photography",
219
+ "Public space relaxation",
220
+ "Casual strolling"
221
+ ],
222
+ "nighttime_street": [
223
+ "Evening commuting",
224
+ "Night walking",
225
+ "After-hours travel",
226
+ "Nighttime navigation",
227
+ "Evening errands",
228
+ "Late-night transportation",
229
+ "Nocturnal urban movement"
230
+ ],
231
+ "nighttime_commercial_district": [
232
+ "Evening shopping",
233
+ "Nightlife participation",
234
+ "Nighttime entertainment",
235
+ "After-dark dining",
236
+ "Evening social gathering",
237
+ "Night market browsing",
238
+ "Illumination appreciation"
239
+ ],
240
+ "indoor_outdoor_cafe": [
241
+ "Al fresco dining",
242
+ "Sidewalk coffee enjoyment",
243
+ "Indoor-outdoor socializing",
244
+ "Patio relaxation",
245
+ "Open-air refreshment",
246
+ "Transitional space usage",
247
+ "Weather-dependent positioning"
248
+ ],
249
+ "transit_station_platform": [
250
+ "Transit waiting",
251
+ "Platform navigation",
252
+ "Boarding preparation",
253
+ "Arrival monitoring",
254
+ "Schedule checking",
255
+ "Departure positioning",
256
+ "Platform traversal"
257
+ ],
258
+ "sports_stadium": [
259
+ "Spectator viewing",
260
+ "Sports fan cheering",
261
+ "Game attendance",
262
+ "Stadium navigation",
263
+ "Athletic event watching",
264
+ "Audience participation",
265
+ "Sports appreciation"
266
+ ],
267
+ "construction_site": [
268
+ "Construction work",
269
+ "Building development",
270
+ "Site management",
271
+ "Material handling",
272
+ "Construction supervision",
273
+ "Safety monitoring",
274
+ "Building process"
275
+ ],
276
+ "medical_facility": [
277
+ "Healthcare consultation",
278
+ "Medical treatment",
279
+ "Patient waiting",
280
+ "Healthcare delivery",
281
+ "Medical examination",
282
+ "Professional care",
283
+ "Health monitoring"
284
+ ],
285
+ "educational_setting": [
286
+ "Classroom learning",
287
+ "Educational instruction",
288
+ "Student participation",
289
+ "Academic engagement",
290
+ "Knowledge acquisition",
291
+ "Educational discussion",
292
+ "Scholastic activities"
293
+ ],
294
+ "beach_water_recreation": [
295
+ "Surfing",
296
+ "Sunbathing",
297
+ "Beach volleyball",
298
+ "Swimming",
299
+ "Relaxing by the water",
300
+ "Flying beach kites",
301
+ "Beach picnicking",
302
+ "Coastal walking"
303
+ ],
304
+ "sports_venue": [
305
+ "Professional game playing",
306
+ "Sports competition",
307
+ "Athletic training",
308
+ "Team practice",
309
+ "Spectator viewing",
310
+ "Sports coaching",
311
+ "Tournament participation",
312
+ "Athletic performance"
313
+ ],
314
+ "professional_kitchen": [
315
+ "Professional cooking",
316
+ "Food preparation",
317
+ "Meal service coordination",
318
+ "Kitchen operations",
319
+ "Culinary production",
320
+ "Chef activities",
321
+ "Commercial food handling",
322
+ "Restaurant meal preparation"
323
+ ]
324
+ }
app.py CHANGED
@@ -63,48 +63,102 @@ def process_and_plot(image, model_name, confidence_threshold, filter_classes=Non
63
  filter_classes: Optional list of classes to filter results
64
 
65
  Returns:
66
- Tuple of (result_image, result_text, formatted_stats, plot_figure)
67
  """
68
- class_ids = None
69
- if filter_classes:
70
- class_ids = []
71
- for class_str in filter_classes:
72
- try:
73
- # Extract ID from format "id: name"
74
- class_id = int(class_str.split(":")[0].strip())
75
- class_ids.append(class_id)
76
- except:
77
- continue
78
-
79
- # Execute detection
80
- result_image, result_text, stats = image_processor.process_image(
81
- image,
82
- model_name,
83
- confidence_threshold,
84
- class_ids
85
- )
86
-
87
- # Format the statistics for better display
88
- formatted_stats = image_processor.format_json_for_display(stats)
89
-
90
- if not stats or "class_statistics" not in stats or not stats["class_statistics"]:
91
- # Create the table
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92
  fig, ax = plt.subplots(figsize=(8, 6))
93
- ax.text(0.5, 0.5, "No detection data available",
94
- ha='center', va='center', fontsize=14, fontfamily='Arial')
95
  ax.set_xlim(0, 1)
96
  ax.set_ylim(0, 1)
97
  ax.axis('off')
98
- plot_figure = fig
99
- else:
100
- # Prepare visualization data
101
- available_classes = dict(get_all_classes())
102
- viz_data = image_processor.prepare_visualization_data(stats, available_classes)
103
-
104
- # Create plot
105
- plot_figure = EvaluationMetrics.create_enhanced_stats_plot(viz_data)
106
 
107
- return result_image, result_text, formatted_stats, plot_figure
 
108
 
109
  def create_interface():
110
  """創建 Gradio 界面,包含美化的視覺效果"""
@@ -121,19 +175,43 @@ def create_interface():
121
 
122
  # 創建 Gradio Blocks 界面
123
  with gr.Blocks(css=css, theme=gr.themes.Soft(primary_hue="teal", secondary_hue="blue")) as demo:
124
- # 頁面頂部標題
125
  with gr.Group(elem_classes="app-header"):
126
- gr.HTML("""
127
- <div style="text-align: center; width: 100%;">
128
- <h1 class="app-title">VisionScout</h1>
129
- <h2 class="app-subtitle">Detect and identify objects in your images</h2>
130
- <div class="app-divider"></div>
131
- </div>
132
- """)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
133
 
134
  current_model = gr.State("yolov8m.pt") # use medium-size model as default
 
136
- # 主要內容區
137
  with gr.Row(equal_height=True):
138
  # 左側 - 輸入控制區(可上傳圖片)
139
  with gr.Column(scale=4, elem_classes="input-panel"):
@@ -208,8 +286,8 @@ def create_interface():
208
  # 文本框設置,讓顯示會更寬
209
  result_text = gr.Textbox(
210
  label=None,
211
- lines=12,
212
- max_lines=15,
213
  elem_classes="wide-result-text",
214
  elem_id="detection-details",
215
  container=False,
@@ -217,6 +295,57 @@ def create_interface():
217
  min_width=600
218
  )
219
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
220
  with gr.Tab("Statistics"):
221
  with gr.Row():
222
  with gr.Column(scale=3, elem_classes="plot-column"):
@@ -235,10 +364,14 @@ def create_interface():
235
  )
236
 
237
  detect_btn.click(
238
- fn=process_and_plot,
239
- inputs=[image_input, current_model, confidence, class_filter],
240
- outputs=[result_image, result_text, stats_json, plot_output]
241
- )
 
 
 
 
242
 
243
  # model option
244
  model_dropdown.change(
@@ -276,9 +409,9 @@ def create_interface():
276
 
277
  example_images = [
278
  "room_01.jpg",
279
- "street_01.jpg",
280
  "street_02.jpg",
281
- "street_03.jpg"
282
  ]
283
 
284
  # add example images
 
63
  filter_classes: Optional list of classes to filter results
64
 
65
  Returns:
66
+ Tuple of results including lighting conditions
67
  """
68
+ try:
69
+ class_ids = None
70
+ if filter_classes:
71
+ class_ids = []
72
+ for class_str in filter_classes:
73
+ try:
74
+ # Extract ID from format "id: name"
75
+ class_id = int(class_str.split(":")[0].strip())
76
+ class_ids.append(class_id)
77
+ except:
78
+ continue
79
+
80
+ # Execute detection
81
+ result_image, result_text, stats = image_processor.process_image(
82
+ image,
83
+ model_name,
84
+ confidence_threshold,
85
+ class_ids
86
+ )
87
+
88
+ # Format the statistics for better display
89
+ formatted_stats = image_processor.format_json_for_display(stats)
90
+
91
+ if not stats or "class_statistics" not in stats or not stats["class_statistics"]:
92
+ # Create the table
93
+ fig, ax = plt.subplots(figsize=(8, 6))
94
+ ax.text(0.5, 0.5, "No detection data available",
95
+ ha='center', va='center', fontsize=14, fontfamily='Arial')
96
+ ax.set_xlim(0, 1)
97
+ ax.set_ylim(0, 1)
98
+ ax.axis('off')
99
+ plot_figure = fig
100
+ else:
101
+ # Prepare visualization data
102
+ available_classes = dict(get_all_classes())
103
+ viz_data = image_processor.prepare_visualization_data(stats, available_classes)
104
+
105
+ # Create plot
106
+ plot_figure = EvaluationMetrics.create_enhanced_stats_plot(viz_data)
107
+
108
+ # Extract scene analysis info
109
+ scene_analysis = stats.get("scene_analysis", {})
110
+
111
+ scene_desc = scene_analysis.get("description", "No scene analysis available.")
112
+ scene_desc = scene_desc.strip()
113
+
114
+ # HTML format
115
+ scene_desc_html = f"""
116
+ <div id='scene-desc-container' style='width:100%; padding:20px; text-align:center; background-color:#f5f9fc; border-radius:8px; margin:10px auto; min-height:200px; max-height:none; overflow-y:auto;'>
117
+ <div style='width:100%; text-align:center; margin:0 auto; font-family:Arial, sans-serif; font-size:14px; line-height:1.8;'>
118
+ {scene_desc}
119
+ </div>
120
+ </div>
121
+ """
122
+
123
+ # Extract lighting conditions
124
+ lighting_conditions = scene_analysis.get("lighting_conditions",
125
+ {"time_of_day": "unknown", "confidence": 0.0})
126
+
127
+ # 準備活動列表
128
+ activities = scene_analysis.get("possible_activities", [])
129
+ if not activities:
130
+ activities_data = [["No activities detected"]]
131
+ else:
132
+ activities_data = [[activity] for activity in activities]
133
+
134
+ # 準備安全注意事項列表
135
+ safety_concerns = scene_analysis.get("safety_concerns", [])
136
+ if not safety_concerns:
137
+ safety_data = [["No safety concerns detected"]]
138
+ else:
139
+ safety_data = [[concern] for concern in safety_concerns]
140
+
141
+ # 功能區域
142
+ zones = scene_analysis.get("functional_zones", {})
143
+
144
+ return result_image, result_text, formatted_stats, plot_figure, scene_desc, activities_data, safety_data, zones, lighting_conditions
145
+
146
+ except Exception as e:
147
+ # 添加錯誤處理,確保即使出錯也能返回有效的數據
148
+ import traceback
149
+ error_msg = f"Error processing image: {str(e)}\n{traceback.format_exc()}"
150
+ print(error_msg)
151
+
152
+ # 創建一個簡單的錯誤圖
153
  fig, ax = plt.subplots(figsize=(8, 6))
154
+ ax.text(0.5, 0.5, f"Error: {str(e)}",
155
+ ha='center', va='center', fontsize=14, fontfamily='Arial', color='red')
156
  ax.set_xlim(0, 1)
157
  ax.set_ylim(0, 1)
158
  ax.axis('off')
 
 
 
 
 
 
 
 
159
 
160
+ # 返回有效的默認值
161
+ return None, error_msg, "{}", fig, "Error processing image", [["No activities"]], [["No safety concerns"]], {}, {"time_of_day": "unknown", "confidence": 0}
162
 
163
  def create_interface():
164
  """創建 Gradio 界面,包含美化的視覺效果"""
 
175
 
176
  # 創建 Gradio Blocks 界面
177
  with gr.Blocks(css=css, theme=gr.themes.Soft(primary_hue="teal", secondary_hue="blue")) as demo:
178
+ # 主頁頂部的標題
179
  with gr.Group(elem_classes="app-header"):
180
+ gr.HTML("""
181
+ <div style="text-align: center; width: 100%; padding: 2rem 0 3rem 0; background: linear-gradient(135deg, #f0f9ff, #e1f5fe);">
182
+ <h1 style="font-size: 3.5rem; margin-bottom: 0.5rem; background: linear-gradient(90deg, #38b2ac, #4299e1); -webkit-background-clip: text; -webkit-text-fill-color: transparent; font-weight: bold; font-family: 'Arial', sans-serif;">VisionScout</h1>
183
+
184
+ <h2 style="color: #4A5568; font-size: 1.2rem; font-weight: 400; margin-top: 0.5rem; margin-bottom: 1.5rem; font-family: 'Arial', sans-serif;">Detect and identify objects in your images</h2>
185
+
186
+ <div style="display: flex; justify-content: center; gap: 10px; margin: 0.5rem 0;">
187
+ <div style="height: 3px; width: 80px; background: linear-gradient(90deg, #38b2ac, #4299e1);"></div>
188
+ </div>
189
+
190
+ <div style="display: flex; justify-content: center; gap: 25px; margin-top: 1.5rem;">
191
+ <div style="padding: 8px 15px; border-radius: 20px; background: rgba(66, 153, 225, 0.15); color: #2b6cb0; font-weight: 500; font-size: 0.9rem;">
192
+ <span style="margin-right: 6px;">🔍</span> Object Detection
193
+ </div>
194
+ <div style="padding: 8px 15px; border-radius: 20px; background: rgba(56, 178, 172, 0.15); color: #2b6cb0; font-weight: 500; font-size: 0.9rem;">
195
+ <span style="margin-right: 6px;">🌐</span> Scene Understanding
196
+ </div>
197
+ <div style="padding: 8px 15px; border-radius: 20px; background: rgba(66, 153, 225, 0.15); color: #2b6cb0; font-weight: 500; font-size: 0.9rem;">
198
+ <span style="margin-right: 6px;">📊</span> Visual Analysis
199
+ </div>
200
+ </div>
201
+
202
+ <div style="margin-top: 20px; padding: 10px 15px; background-color: rgba(255, 248, 230, 0.9); border-left: 3px solid #f6ad55; border-radius: 6px; max-width: 600px; margin-left: auto; margin-right: auto; text-align: left;">
203
+ <p style="margin: 0; font-size: 0.9rem; color: #805ad5; font-weight: 500;">
204
+ <span style="margin-right: 5px;">📱</span> iPhone users: HEIC images are not supported.
205
+ <a href="https://cloudconvert.com/heic-to-jpg" target="_blank" style="color: #3182ce; text-decoration: underline;">Convert HEIC to JPG here</a> before uploading.
206
+ </p>
207
+ </div>
208
+ </div>
209
+ """)
210
+
211
 
212
  current_model = gr.State("yolov8m.pt") # use medium size model as defualt
213
 
214
+ # 主要內容區
215
  with gr.Row(equal_height=True):
216
  # 左側 - 輸入控制區(可上傳圖片)
217
  with gr.Column(scale=4, elem_classes="input-panel"):
 
286
  # 文本框設置,讓顯示會更寬
287
  result_text = gr.Textbox(
288
  label=None,
289
+ lines=15,
290
+ max_lines=20,
291
  elem_classes="wide-result-text",
292
  elem_id="detection-details",
293
  container=False,
 
295
  min_width=600
296
  )
297
 
298
+ # Scene Analysis
299
+ with gr.Tab("Scene Understanding", elem_classes="scene-understanding-tab"):
300
+ with gr.Group(elem_classes="result-details-box"):
301
+ gr.HTML("""
302
+ <div class="section-heading">Scene Analysis</div>
303
+ <details class="info-details" style="margin: 5px 0 15px 0;">
304
+ <summary style="padding: 8px; background-color: #f0f7ff; border-radius: 6px; border-left: 3px solid #4299e1; font-weight: bold; cursor: pointer; color: #2b6cb0;">
305
+ 🔍 The AI Vision Scout Report: Click for important notes about this analysis
306
+ </summary>
307
+ <div style="margin-top: 8px; padding: 10px; background-color: #f8f9fa; border-radius: 6px; border: 1px solid #e2e8f0;">
308
+ <p style="font-size: 13px; color: #718096; margin: 0;">
309
+ <b>About this analysis:</b> This analysis is the model's best guess based on visible objects.
310
+ Like human scouts, it sometimes gets lost or sees things that aren't there (but don't we all?).
311
+ Consider this an educated opinion rather than absolute truth. For critical applications, always verify with human eyes! 🧐
312
+ </p>
313
+ </div>
314
+ </details>
315
+ """)
316
+
317
+ # 使用更適合長文本的容器
318
+ with gr.Group(elem_classes="scene-description-container"):
319
+ scene_description = gr.HTML(
320
+ value="<div id='scene-desc-container'></div>",
321
+ label="Scene Description"
322
+ )
323
+
324
+ with gr.Row():
325
+ with gr.Column(scale=2):
326
+ activities_list = gr.Dataframe(
327
+ headers=["Activities"],
328
+ datatype=["str"],
329
+ col_count=1,
330
+ row_count=5,
331
+ elem_classes="full-width-element"
332
+ )
333
+
334
+ with gr.Column(scale=2):
335
+ safety_list = gr.Dataframe(
336
+ headers=["Safety Concerns"],
337
+ datatype=["str"],
338
+ col_count=1,
339
+ row_count=5,
340
+ elem_classes="full-width-element"
341
+ )
342
+
343
+ gr.HTML('<div class="section-heading">Functional Zones</div>')
344
+ zones_json = gr.JSON(label=None, elem_classes="json-box")
345
+
346
+ gr.HTML('<div class="section-heading">Lighting Conditions</div>')
347
+ lighting_info = gr.JSON(label=None, elem_classes="json-box")
348
+
349
  with gr.Tab("Statistics"):
350
  with gr.Row():
351
  with gr.Column(scale=3, elem_classes="plot-column"):
 
364
  )
365
 
366
  detect_btn.click(
367
+ fn=process_and_plot,
368
+ inputs=[image_input, current_model, confidence, class_filter],
369
+ outputs=[
370
+ result_image, result_text, stats_json, plot_output,
371
+ scene_description, activities_list, safety_list, zones_json,
372
+ lighting_info
373
+ ]
374
+ )
375
 
376
  # model option
377
  model_dropdown.change(
 
409
 
410
  example_images = [
411
  "room_01.jpg",
412
+ "room_02.jpg",
413
  "street_02.jpg",
414
+ "street_04.jpg"
415
  ]
416
 
417
  # add example images
clip_analyzer.py ADDED
@@ -0,0 +1,389 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import clip
3
+ import numpy as np
4
+ from PIL import Image
5
+ from typing import Dict, List, Tuple, Any, Optional, Union
6
+ from clip_prompts import (
7
+ SCENE_TYPE_PROMPTS,
8
+ CULTURAL_SCENE_PROMPTS,
9
+ COMPARATIVE_PROMPTS,
10
+ LIGHTING_CONDITION_PROMPTS,
11
+ SPECIALIZED_SCENE_PROMPTS,
12
+ VIEWPOINT_PROMPTS,
13
+ OBJECT_COMBINATION_PROMPTS,
14
+ ACTIVITY_PROMPTS
15
+ )
16
+
17
+ class CLIPAnalyzer:
18
+ """
19
+ Use CLIP to integrate scene-understanding functionality.
20
+ """
21
+
22
+ def __init__(self, model_name: str = "ViT-B/32", device: str = None):
23
+ """
24
+ 初始化 CLIP 分析器。
25
+
26
+ Args:
27
+ model_name: CLIP Model name, "ViT-B/32"、"ViT-B/16"、"ViT-L/14"
28
+ device: Use GPU if it can use
29
+ """
30
+ # 自動選擇設備
31
+ if device is None:
32
+ self.device = "cuda" if torch.cuda.is_available() else "cpu"
33
+ else:
34
+ self.device = device
35
+
36
+ print(f"Loading CLIP model {model_name} on {self.device}...")
37
+ try:
38
+ self.model, self.preprocess = clip.load(model_name, device=self.device)
39
+ print(f"CLIP model loaded successfully.")
40
+ except Exception as e:
41
+ print(f"Error loading CLIP model: {e}")
42
+ raise
43
+
44
+ self.scene_type_prompts = SCENE_TYPE_PROMPTS
45
+ self.cultural_scene_prompts = CULTURAL_SCENE_PROMPTS
46
+ self.comparative_prompts = COMPARATIVE_PROMPTS
47
+ self.lighting_condition_prompts = LIGHTING_CONDITION_PROMPTS
48
+ self.specialized_scene_prompts = SPECIALIZED_SCENE_PROMPTS
49
+ self.viewpoint_prompts = VIEWPOINT_PROMPTS
50
+ self.object_combination_prompts = OBJECT_COMBINATION_PROMPTS
51
+ self.activity_prompts = ACTIVITY_PROMPTS
52
+
53
+ # turn to CLIP format
54
+ self._prepare_text_prompts()
55
+
56
+ def _prepare_text_prompts(self):
57
+ """準備所有文本提示的 CLIP 特徵"""
58
+ # base prompt
59
+ scene_texts = [self.scene_type_prompts[scene_type] for scene_type in self.scene_type_prompts]
60
+ self.scene_type_tokens = clip.tokenize(scene_texts).to(self.device)
61
+
62
+ # cultural
63
+ self.cultural_tokens_dict = {}
64
+ for scene_type, prompts in self.cultural_scene_prompts.items():
65
+ self.cultural_tokens_dict[scene_type] = clip.tokenize(prompts).to(self.device)
66
+
67
+ # Light
68
+ lighting_texts = [self.lighting_condition_prompts[cond] for cond in self.lighting_condition_prompts]
69
+ self.lighting_tokens = clip.tokenize(lighting_texts).to(self.device)
70
+
71
+ # specializes_status
72
+ self.specialized_tokens_dict = {}
73
+ for scene_type, prompts in self.specialized_scene_prompts.items():
74
+ self.specialized_tokens_dict[scene_type] = clip.tokenize(prompts).to(self.device)
75
+
76
+ # view point
77
+ viewpoint_texts = [self.viewpoint_prompts[viewpoint] for viewpoint in self.viewpoint_prompts]
78
+ self.viewpoint_tokens = clip.tokenize(viewpoint_texts).to(self.device)
79
+
80
+ # object combination
81
+ object_combination_texts = [self.object_combination_prompts[combo] for combo in self.object_combination_prompts]
82
+ self.object_combination_tokens = clip.tokenize(object_combination_texts).to(self.device)
83
+
84
+ # activicty prompt
85
+ activity_texts = [self.activity_prompts[activity] for activity in self.activity_prompts]
86
+ self.activity_tokens = clip.tokenize(activity_texts).to(self.device)
87
+
88
+ def analyze_image(self, image, include_cultural_analysis: bool = True) -> Dict[str, Any]:
89
+ """
90
+ 分析圖像,預測場景類型和光照條件。
91
+
92
+ Args:
93
+ image: 輸入圖像 (PIL Image 或 numpy array)
94
+ include_cultural_analysis: 是否包含文化場景的詳細分析
95
+
96
+ Returns:
97
+ Dict: 包含場景類型預測和光照條件的分析結果
98
+ """
99
+ try:
100
+ # 確保圖像是 PIL 格式
101
+ if not isinstance(image, Image.Image):
102
+ if isinstance(image, np.ndarray):
103
+ image = Image.fromarray(image)
104
+ else:
105
+ raise ValueError("Unsupported image format. Expected PIL Image or numpy array.")
106
+
107
+ # 預處理圖像
108
+ image_input = self.preprocess(image).unsqueeze(0).to(self.device)
109
+
110
+ # 獲取圖像特徵
111
+ with torch.no_grad():
112
+ image_features = self.model.encode_image(image_input)
113
+ image_features = image_features / image_features.norm(dim=-1, keepdim=True)
114
+
115
+ # 分析場景類型
116
+ scene_scores = self._analyze_scene_type(image_features)
117
+
118
+ # 分析光照條件
119
+ lighting_scores = self._analyze_lighting_condition(image_features)
120
+
121
+ # 文化場景的增強分析
122
+ cultural_analysis = {}
123
+ if include_cultural_analysis:
124
+ for scene_type in self.cultural_scene_prompts:
125
+ if scene_type in scene_scores and scene_scores[scene_type] > 0.2:
126
+ cultural_analysis[scene_type] = self._analyze_cultural_scene(
127
+ image_features, scene_type
128
+ )
129
+
130
+ specialized_analysis = {}
131
+ for scene_type in self.specialized_scene_prompts:
132
+ if scene_type in scene_scores and scene_scores[scene_type] > 0.2:
133
+ specialized_analysis[scene_type] = self._analyze_specialized_scene(
134
+ image_features, scene_type
135
+ )
136
+
137
+ viewpoint_scores = self._analyze_viewpoint(image_features)
138
+
139
+ object_combination_scores = self._analyze_object_combinations(image_features)
140
+
141
+ activity_scores = self._analyze_activities(image_features)
142
+
143
+ # display results
144
+ result = {
145
+ "scene_scores": scene_scores,
146
+ "top_scene": max(scene_scores.items(), key=lambda x: x[1]),
147
+ "lighting_condition": max(lighting_scores.items(), key=lambda x: x[1]),
148
+ "embedding": image_features.cpu().numpy().tolist()[0] if self.device == "cuda" else image_features.numpy().tolist()[0],
149
+ "viewpoint": max(viewpoint_scores.items(), key=lambda x: x[1]),
150
+ "object_combinations": sorted(object_combination_scores.items(), key=lambda x: x[1], reverse=True)[:3],
151
+ "activities": sorted(activity_scores.items(), key=lambda x: x[1], reverse=True)[:3]
152
+ }
153
+
154
+ if cultural_analysis:
155
+ result["cultural_analysis"] = cultural_analysis
156
+
157
+ if specialized_analysis:
158
+ result["specialized_analysis"] = specialized_analysis
159
+
160
+ return result
161
+
162
+ except Exception as e:
163
+ print(f"Error analyzing image with CLIP: {e}")
164
+ import traceback
165
+ traceback.print_exc()
166
+ return {"error": str(e)}
167
+
168
+ def _analyze_scene_type(self, image_features: torch.Tensor) -> Dict[str, float]:
169
+ """分析圖像特徵與各場景類型的相似度"""
170
+ with torch.no_grad():
171
+ # 計算場景類型文本特徵
172
+ text_features = self.model.encode_text(self.scene_type_tokens)
173
+ text_features = text_features / text_features.norm(dim=-1, keepdim=True)
174
+
175
+ # 計算相似度分數
176
+ similarity = (100 * image_features @ text_features.T).softmax(dim=-1)
177
+ similarity = similarity.cpu().numpy()[0] if self.device == "cuda" else similarity.numpy()[0]
178
+
179
+ # 建立場景分數字典
180
+ scene_scores = {}
181
+ for i, scene_type in enumerate(self.scene_type_prompts.keys()):
182
+ scene_scores[scene_type] = float(similarity[i])
183
+
184
+ return scene_scores
185
+
186
+ def _analyze_lighting_condition(self, image_features: torch.Tensor) -> Dict[str, float]:
187
+ """分析圖像的光照條件"""
188
+ with torch.no_grad():
189
+ # 計算光照條件文本特徵
190
+ text_features = self.model.encode_text(self.lighting_tokens)
191
+ text_features = text_features / text_features.norm(dim=-1, keepdim=True)
192
+
193
+ # 計算相似度分數
194
+ similarity = (100 * image_features @ text_features.T).softmax(dim=-1)
195
+ similarity = similarity.cpu().numpy()[0] if self.device == "cuda" else similarity.numpy()[0]
196
+
197
+ # 建立光照條件分數字典
198
+ lighting_scores = {}
199
+ for i, lighting_type in enumerate(self.lighting_condition_prompts.keys()):
200
+ lighting_scores[lighting_type] = float(similarity[i])
201
+
202
+ return lighting_scores
203
+
204
+ def _analyze_cultural_scene(self, image_features: torch.Tensor, scene_type: str) -> Dict[str, Any]:
205
+ """針對特定文化場景進行深入分析"""
206
+ if scene_type not in self.cultural_tokens_dict:
207
+ return {"error": f"No cultural analysis available for {scene_type}"}
208
+
209
+ with torch.no_grad():
210
+ # 獲取特定文化場景的文本特徵
211
+ cultural_tokens = self.cultural_tokens_dict[scene_type]
212
+ text_features = self.model.encode_text(cultural_tokens)
213
+ text_features = text_features / text_features.norm(dim=-1, keepdim=True)
214
+
215
+ # 計算相似度分數
216
+ similarity = (100 * image_features @ text_features.T)
217
+ similarity = similarity.cpu().numpy()[0] if self.device == "cuda" else similarity.numpy()[0]
218
+
219
+ # 找到最匹配的文化描述
220
+ prompts = self.cultural_scene_prompts[scene_type]
221
+ scores = [(prompts[i], float(similarity[i])) for i in range(len(prompts))]
222
+ scores.sort(key=lambda x: x[1], reverse=True)
223
+
224
+ return {
225
+ "best_description": scores[0][0],
226
+ "confidence": scores[0][1],
227
+ "all_matches": scores
228
+ }
229
+
230
+ def _analyze_specialized_scene(self, image_features: torch.Tensor, scene_type: str) -> Dict[str, Any]:
231
+ """針對特定專門場景進行深入分析"""
232
+ if scene_type not in self.specialized_tokens_dict:
233
+ return {"error": f"No specialized analysis available for {scene_type}"}
234
+
235
+ with torch.no_grad():
236
+ # 獲取特定專門場景的文本特徵
237
+ specialized_tokens = self.specialized_tokens_dict[scene_type]
238
+ text_features = self.model.encode_text(specialized_tokens)
239
+ text_features = text_features / text_features.norm(dim=-1, keepdim=True)
240
+
241
+ # 計算相似度分數
242
+ similarity = (100 * image_features @ text_features.T)
243
+ similarity = similarity.cpu().numpy()[0] if self.device == "cuda" else similarity.numpy()[0]
244
+
245
+ # 找到最匹配的專門描述
246
+ prompts = self.specialized_scene_prompts[scene_type]
247
+ scores = [(prompts[i], float(similarity[i])) for i in range(len(prompts))]
248
+ scores.sort(key=lambda x: x[1], reverse=True)
249
+
250
+ return {
251
+ "best_description": scores[0][0],
252
+ "confidence": scores[0][1],
253
+ "all_matches": scores
254
+ }
255
+
256
+ def _analyze_viewpoint(self, image_features: torch.Tensor) -> Dict[str, float]:
257
+ """分析圖像的拍攝視角"""
258
+ with torch.no_grad():
259
+ # 計算視角文本特徵
260
+ text_features = self.model.encode_text(self.viewpoint_tokens)
261
+ text_features = text_features / text_features.norm(dim=-1, keepdim=True)
262
+
263
+ # 計算相似度分數
264
+ similarity = (100 * image_features @ text_features.T).softmax(dim=-1)
265
+ similarity = similarity.cpu().numpy()[0] if self.device == "cuda" else similarity.numpy()[0]
266
+
267
+ # 建立視角分數字典
268
+ viewpoint_scores = {}
269
+ for i, viewpoint in enumerate(self.viewpoint_prompts.keys()):
270
+ viewpoint_scores[viewpoint] = float(similarity[i])
271
+
272
+ return viewpoint_scores
273
+
274
+ def _analyze_object_combinations(self, image_features: torch.Tensor) -> Dict[str, float]:
275
+ """分析圖像中的物體組合"""
276
+ with torch.no_grad():
277
+ # 計算物體組合文本特徵
278
+ text_features = self.model.encode_text(self.object_combination_tokens)
279
+ text_features = text_features / text_features.norm(dim=-1, keepdim=True)
280
+
281
+ # 計算相似度分數
282
+ similarity = (100 * image_features @ text_features.T).softmax(dim=-1)
283
+ similarity = similarity.cpu().numpy()[0] if self.device == "cuda" else similarity.numpy()[0]
284
+
285
+ # 建立物體組合分數字典
286
+ combination_scores = {}
287
+ for i, combination in enumerate(self.object_combination_prompts.keys()):
288
+ combination_scores[combination] = float(similarity[i])
289
+
290
+ return combination_scores
291
+
292
+ def _analyze_activities(self, image_features: torch.Tensor) -> Dict[str, float]:
293
+ """分析圖像中的活動"""
294
+ with torch.no_grad():
295
+ # 計算活動文本特徵
296
+ text_features = self.model.encode_text(self.activity_tokens)
297
+ text_features = text_features / text_features.norm(dim=-1, keepdim=True)
298
+
299
+ # 計算相似度分數
300
+ similarity = (100 * image_features @ text_features.T).softmax(dim=-1)
301
+ similarity = similarity.cpu().numpy()[0] if self.device == "cuda" else similarity.numpy()[0]
302
+
303
+ # 建立活動分數字典
304
+ activity_scores = {}
305
+ for i, activity in enumerate(self.activity_prompts.keys()):
306
+ activity_scores[activity] = float(similarity[i])
307
+
308
+ return activity_scores
309
+
310
+ def get_image_embedding(self, image) -> np.ndarray:
311
+ """
312
+ 獲取圖像的 CLIP 嵌入表示
313
+
314
+ Args:
315
+ image: PIL Image 或 numpy array
316
+
317
+ Returns:
318
+ np.ndarray: 圖像的 CLIP 特徵向量
319
+ """
320
+ # 確保圖像是 PIL 格式
321
+ if not isinstance(image, Image.Image):
322
+ if isinstance(image, np.ndarray):
323
+ image = Image.fromarray(image)
324
+ else:
325
+ raise ValueError("Unsupported image format. Expected PIL Image or numpy array.")
326
+
327
+ # 預處理並編碼
328
+ image_input = self.preprocess(image).unsqueeze(0).to(self.device)
329
+
330
+ with torch.no_grad():
331
+ image_features = self.model.encode_image(image_input)
332
+ image_features = image_features / image_features.norm(dim=-1, keepdim=True)
333
+
334
+ # 轉換為 numpy 並返回
335
+ return image_features.cpu().numpy()[0] if self.device == "cuda" else image_features.numpy()[0]
336
+
337
+ def text_to_embedding(self, text: str) -> np.ndarray:
338
+ """
339
+ 將文本轉換為 CLIP 嵌入表示
340
+
341
+ Args:
342
+ text: 輸入文本
343
+
344
+ Returns:
345
+ np.ndarray: 文本的 CLIP 特徵向量
346
+ """
347
+ text_token = clip.tokenize([text]).to(self.device)
348
+
349
+ with torch.no_grad():
350
+ text_features = self.model.encode_text(text_token)
351
+ text_features = text_features / text_features.norm(dim=-1, keepdim=True)
352
+
353
+ return text_features.cpu().numpy()[0] if self.device == "cuda" else text_features.numpy()[0]
354
+
355
+ def calculate_similarity(self, image, text_queries: List[str]) -> Dict[str, float]:
356
+ """
357
+ 計算圖像與多個文本查詢的相似度
358
+
359
+ Args:
360
+ image: PIL Image 或 numpy array
361
+ text_queries: 文本查詢列表
362
+
363
+ Returns:
364
+ Dict: 每個查詢的相似度分數
365
+ """
366
+ # 獲取圖像嵌入
367
+ if isinstance(image, np.ndarray) and len(image.shape) == 1:
368
+ # 已經是嵌入向量
369
+ image_features = torch.tensor(image).unsqueeze(0).to(self.device)
370
+ else:
371
+ # 是圖像,需要提取嵌入
372
+ image_features = torch.tensor(self.get_image_embedding(image)).unsqueeze(0).to(self.device)
373
+
374
+ # calulate similarity
375
+ text_tokens = clip.tokenize(text_queries).to(self.device)
376
+
377
+ with torch.no_grad():
378
+ text_features = self.model.encode_text(text_tokens)
379
+ text_features = text_features / text_features.norm(dim=-1, keepdim=True)
380
+
381
+ similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1)
382
+ similarity = similarity.cpu().numpy()[0] if self.device == "cuda" else similarity.numpy()[0]
383
+
384
+ # display results
385
+ result = {}
386
+ for i, query in enumerate(text_queries):
387
+ result[query] = float(similarity[i])
388
+
389
+ return result
clip_prompts.py ADDED
@@ -0,0 +1,265 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
# Scene-type prompts: one representative CLIP text prompt per scene label,
# used for zero-shot scene classification against image embeddings.
SCENE_TYPE_PROMPTS = {
    # Basic indoor scenes
    "living_room": "A photo of a living room with furniture and entertainment systems.",
    "bedroom": "A photo of a bedroom with a bed and personal items.",
    "dining_area": "A photo of a dining area with a table and chairs for meals.",
    "kitchen": "A photo of a kitchen with cooking appliances and food preparation areas.",
    "office_workspace": "A photo of an office workspace with desk, computer and work equipment.",
    "meeting_room": "A photo of a meeting room with a conference table and multiple chairs.",

    # Basic outdoor / urban scenes
    "city_street": "A photo of a city street with traffic, pedestrians and urban buildings.",
    "parking_lot": "A photo of a parking lot with multiple parked vehicles.",
    "park_area": "A photo of a park or recreational area with greenery and outdoor facilities.",
    "retail_store": "A photo of a retail store with merchandise displays and shopping areas.",
    "supermarket": "A photo of a supermarket with food items, aisles and shopping carts.",

    # Special indoor scenes
    "upscale_dining": "A photo of an upscale dining area with elegant furniture and refined decor.",
    "conference_room": "A photo of a professional conference room with presentation equipment and seating.",
    "classroom": "A photo of a classroom with desks, chairs and educational equipment.",
    "library": "A photo of a library with bookshelves, reading areas and study spaces.",

    # Asian-specific scenes
    "asian_commercial_street": "A photo of an Asian commercial street with dense signage, shops and pedestrians.",
    "asian_night_market": "A photo of an Asian night market with food stalls, crowds and colorful lights.",
    "asian_temple_area": "A photo of an Asian temple with traditional architecture and cultural elements.",

    # Traffic-related scenes
    "financial_district": "A photo of a financial district with tall office buildings and business activity.",
    "urban_intersection": "A photo of an urban intersection with crosswalks, traffic lights and pedestrians crossing.",
    "transit_hub": "A photo of a transportation hub with multiple modes of public transit and passengers.",
    "bus_stop": "A photo of a bus stop with people waiting and buses arriving or departing.",
    "bus_station": "A photo of a bus terminal with multiple buses and traveler facilities.",
    "train_station": "A photo of a train station with platforms, trains and passenger activity.",
    "airport": "A photo of an airport with planes, terminals and traveler activity.",

    # Commercial scenes
    "shopping_district": "A photo of a shopping district with multiple retail stores and consumer activity.",
    "cafe": "A photo of a cafe with coffee service, seating and casual dining.",
    "restaurant": "A photo of a restaurant with dining tables, food service and eating areas.",

    # Aerial-view scenes
    "aerial_view_intersection": "An aerial view of an intersection showing crosswalks and traffic patterns from above.",
    "aerial_view_commercial_area": "An aerial view of a commercial area showing shopping districts from above.",
    "aerial_view_plaza": "An aerial view of a public plaza or square showing patterns of people movement from above.",

    # Entertainment scenes
    "zoo": "A photo of a zoo with animal enclosures, exhibits and visitors.",
    "playground": "A photo of a playground with recreational equipment and children playing.",
    "sports_field": "A photo of a sports field with playing surfaces and athletic equipment.",
    "sports_stadium": "A photo of a sports stadium with spectator seating and athletic facilities.",

    # Water-related scenes
    "harbor": "A photo of a harbor with boats, docks and waterfront activity.",
    "beach_water_recreation": "A photo of a beach area with water activities, sand and recreational equipment like surfboards.",

    # Culture- and time-specific scenes
    "nighttime_street": "A photo of a street at night with artificial lighting and evening activity.",
    "nighttime_commercial_district": "A photo of a commercial district at night with illuminated signs and evening shopping.",
    "european_plaza": "A photo of a European-style plaza with historic architecture and public gathering spaces.",

    # Mixed indoor/outdoor scenes
    "indoor_outdoor_cafe": "A photo of a cafe with both indoor seating and outdoor patio areas.",
    "transit_station_platform": "A photo of a transit station platform with waiting areas and arriving vehicles.",

    # Work scenes
    "construction_site": "A photo of a construction site with building materials, equipment and workers.",
    "medical_facility": "A photo of a medical facility with healthcare equipment and professional staff.",
    "educational_setting": "A photo of an educational setting with learning spaces and academic resources.",
    "professional_kitchen": "A photo of a professional commercial kitchen with industrial cooking equipment and food preparation stations."
}
74
+
75
# Culture-specific scene prompts: several alternative phrasings per scene so
# the best-matching description (and its score) can be reported.
CULTURAL_SCENE_PROMPTS = {
    "asian_commercial_street": [
        "A busy Asian shopping street with neon signs and dense storefronts.",
        "A commercial street in Asia with multi-level signage and narrow walkways.",
        "A street scene in Taiwan or Hong Kong with vertical signage and compact shops.",
        "A crowded commercial alley in an Asian city with signs in Chinese characters.",
        "A narrow shopping street in Asia with small shops on both sides.",
        "An outdoor shopping district in an East Asian city with electronic billboards.",
        "A bustling commercial street in Taiwan with food vendors and retail shops.",
        "A pedestrian shopping area with Korean or Chinese signs and storefronts.",
        "A daytime shopping street in an Asian urban center with vertical development."
    ],
    "asian_night_market": [
        "A vibrant night market in Asia with food stalls and large crowds.",
        "An evening street market in Taiwan with street food vendors and bright lights.",
        "A busy night bazaar in Asia with illuminated stalls and local food.",
        "A crowded night street food market in an Asian city with vendor carts.",
        "An Asian night market with steam from cooking food and hanging lanterns.",
        "A nocturnal food street in East Asia with vendor canopies and neon lights.",
        "A bustling evening market with rows of food stalls and plastic stools.",
        "A lively Asian street food scene at night with cooking stations and crowds."
    ],
    "asian_temple_area": [
        "A traditional Asian temple with ornate roof details and religious symbols.",
        "A Buddhist temple complex in East Asia with multiple pavilions and prayer areas.",
        "A sacred site in Asia with incense burners and ceremonial elements.",
        "A temple courtyard with stone statues and traditional Asian architecture.",
        "A spiritual center in East Asia with pagoda-style structures and visitors.",
        "An ancient temple site with Asian architectural elements and cultural symbols.",
        "A religious compound with characteristic Asian roof curves and decorative features."
    ],
    "european_plaza": [
        "A historic European city square with classical architecture and cafes.",
        "An old-world plaza in Europe with cobblestone paving and historic buildings.",
        "A public square in a European city with fountains and surrounding architecture.",
        "A central plaza in Europe with outdoor seating areas and historic monuments.",
        "A traditional European town square with surrounding shops and restaurants.",
        "A historic gathering space in Europe with distinctive architecture and pedestrians."
    ]
}
116
+
117
# Comparative category prompts: paired contrasting descriptions used to
# disambiguate scene attributes (each list alternates between the two sides
# of the contrast).
COMPARATIVE_PROMPTS = {
    "indoor_vs_outdoor": [
        "An indoor shopping mall corridor with controlled lighting and storefronts.",
        "An outdoor commercial street with natural lighting and urban storefronts.",
        "An enclosed shopping gallery with artificial lighting and climate control.",
        "An open-air market street with natural light and weather exposure."
    ],
    "professional_vs_home": [
        "A professional commercial kitchen with stainless steel equipment and workstations.",
        "A home kitchen with residential appliances and family cooking space.",
        "A restaurant kitchen with multiple cooking stations and chef activity.",
        "A family kitchen with standard household equipment and personal touches."
    ],
    "sports_venue_vs_park": [
        "A professional sports stadium with designated playing areas and audience seating.",
        "A public park with casual recreation space and community greenery.",
        "An athletic venue with specialized sports equipment and competitive playing surfaces.",
        "An outdoor community space with general purpose areas and natural elements."
    ],
    "asian_vs_western_commercial": [
        "An Asian shopping street with vertical signage and compact multi-level shops.",
        "A Western commercial street with horizontal storefronts and wider sidewalks.",
        "An East Asian retail area with dense signage in Asian scripts and narrow walkways.",
        "A Western shopping district with uniform building heights and Latin alphabetic signs."
    ],
    "daytime_vs_nighttime": [
        "A daytime urban scene with natural sunlight illuminating streets and buildings.",
        "A nighttime city scene with artificial lighting from stores, signs and streetlights.",
        "A commercial district during daylight hours with natural shadows and visibility.",
        "An evening urban setting with illuminated storefronts and light patterns on streets."
    ],
    "aerial_vs_street_level": [
        "An aerial view showing urban patterns and layouts from above.",
        "A street-level view showing pedestrian perspective and immediate surroundings.",
        "A bird's-eye view of city organization and movement patterns from high above.",
        "An eye-level perspective showing direct human interaction with urban elements."
    ]
}
156
+
157
# Lighting-condition text prompts: one prompt per lighting category, used to
# estimate illumination conditions from an image embedding.
LIGHTING_CONDITION_PROMPTS = {
    "day_clear": "A photo taken during daytime with clear skies and direct sunlight.",
    "day_cloudy": "A photo taken during daytime with overcast conditions and diffused light.",
    # NOTE(review): this key contains a "/"; callers indexing by key must use
    # the exact string "sunset/sunrise".
    "sunset/sunrise": "A photo taken during sunset or sunrise with warm golden lighting and long shadows.",
    "night": "A photo taken at night with minimal natural light and artificial illumination.",
    "indoor_bright": "An indoor photo with bright, even artificial lighting throughout the space.",
    "indoor_moderate": "An indoor photo with moderate lighting creating a balanced indoor atmosphere.",
    "indoor_dim": "An indoor photo with low lighting levels creating a subdued environment.",
    "neon_night": "A night scene with colorful neon lighting creating vibrant illumination patterns.",
    "indoor_commercial": "An indoor retail environment with directed display lighting highlighting products.",
    "indoor_restaurant": "An indoor dining space with ambient mood lighting for atmosphere.",
    "stadium_lighting": "A sports venue with powerful floodlights creating intense, even illumination.",
    "mixed_lighting": "A scene with combined natural and artificial light sources creating transition zones.",
    "beach_daylight": "A photo taken at a beach with bright natural sunlight and reflections from water.",
    "sports_arena_lighting": "A photo of a sports venue illuminated by powerful overhead lighting systems.",
    "kitchen_task_lighting": "A photo of a kitchen with focused lighting concentrated on work surfaces."
}
175
+
176
# Specialized prompts for newer scene types: several alternative phrasings per
# scene so the best-matching description (and its score) can be reported.
SPECIALIZED_SCENE_PROMPTS = {
    "beach_water_recreation": [
        "A coastal beach scene with people surfing and sunbathing on sandy shores.",
        "Active water sports participants at a beach with surfboards and swimming areas.",
        "A sunny beach destination with recreational water equipment and beachgoers.",
        "A shoreline recreation area with surf gear and coastal activities.",
        "An oceanfront scene with people engaging in water sports and beach leisure.",
        "A popular beach spot with swimming areas and surfing zones.",
        "A coastal recreation setting with beach umbrellas and water activities."
    ],
    "sports_venue": [
        "An indoor sports arena with professional equipment and competition spaces.",
        "A sports stadium with marked playing areas and spectator seating arrangement.",
        "A specialized athletic venue with competition equipment and performance areas.",
        "A professional sports facility with game-related apparatus and audience zones.",
        "An organized sports center with competitive play areas and athletic equipment.",
        "A competition venue with sport-specific markings and professional setup.",
        "A formal athletic facility with standardized equipment and playing surfaces."
    ],
    "professional_kitchen": [
        "A commercial restaurant kitchen with multiple cooking stations and food prep areas.",
        "A professional culinary workspace with industrial appliances and chef activity.",
        "A busy restaurant back-of-house with stainless steel equipment and meal preparation.",
        "A commercial food service kitchen with chef workstations and specialized zones.",
        "An industrial kitchen facility with specialized cooking equipment and prep surfaces.",
        "A high-volume food production kitchen with professional-grade appliances.",
        "A restaurant kitchen with distinct cooking areas and culinary workflow design."
    ],
    "urban_intersection": [
        "A city intersection with crosswalks and traffic signals controlling movement.",
        "A busy urban crossroad with pedestrian crossings and vehicle traffic.",
        "A regulated street intersection with crosswalk markings and waiting pedestrians.",
        "A metropolitan junction with traffic lights and pedestrian crossing zones.",
        "A city street crossing with safety features for pedestrians and traffic flow.",
        "A controlled urban intersection with movement patterns for vehicles and people.",
        "A city center crossroad with traffic management features and pedestrian areas."
    ],
    "financial_district": [
        "A downtown business area with tall office buildings and commercial activity.",
        "An urban financial center with skyscrapers and professional environment.",
        "A city's business district with corporate headquarters and office towers.",
        "A metropolitan financial zone with high-rise buildings and business traffic.",
        "A corporate district in a city center with professional architecture.",
        "An urban area dominated by office buildings and business establishments.",
        "A city's economic center with banking institutions and corporate offices."
    ],
    "aerial_view_intersection": [
        "A bird's-eye view of a city intersection showing crossing patterns from above.",
        "An overhead perspective of an urban crossroad showing traffic organization.",
        "A top-down view of a street intersection revealing pedestrian crosswalks.",
        "An aerial shot of a city junction showing the layout of roads and crossings.",
        "A high-angle view of an intersection showing traffic and pedestrian flow patterns.",
        "A drone perspective of urban crossing design viewed from directly above.",
        "A vertical view of a street intersection showing crossing infrastructure."
    ]
}
233
+
234
# Viewpoint prompts: one prompt per camera-perspective category, used to
# classify the shooting angle of an image.
VIEWPOINT_PROMPTS = {
    "eye_level": "A photo taken from normal human eye level showing a direct frontal perspective.",
    "aerial": "A photo taken from high above looking directly down at the scene below.",
    "elevated": "A photo taken from a higher than normal position looking down at an angle.",
    "low_angle": "A photo taken from a low position looking upward at the scene.",
    "bird_eye": "A photo taken from very high above showing a complete overhead perspective.",
    "street_level": "A photo taken from the perspective of someone standing on the street.",
    "interior": "A photo taken from inside a building showing the internal environment.",
    "vehicular": "A photo taken from inside or mounted on a moving vehicle."
}
244
+
245
# Object-combination prompts: one prompt per characteristic grouping of
# objects, used to recognize functional arrangements within a scene.
OBJECT_COMBINATION_PROMPTS = {
    "dining_setting": "A scene with tables, chairs, plates, and eating utensils arranged for meals.",
    "office_setup": "A scene with desks, chairs, computers, and office supplies for work.",
    "living_space": "A scene with sofas, coffee tables, TVs, and comfortable seating arrangements.",
    "transportation_hub": "A scene with vehicles, waiting areas, passengers, and transit information.",
    "retail_environment": "A scene with merchandise displays, shoppers, and store fixtures.",
    "crosswalk_scene": "A scene with street markings, pedestrians crossing, and traffic signals.",
    "cooking_area": "A scene with stoves, prep surfaces, cooking utensils, and food items.",
    "recreational_space": "A scene with sports equipment, play areas, and activity participants."
}
255
+
256
# Activity prompts: one prompt per human-activity category, used to infer
# what people in the scene are doing.
ACTIVITY_PROMPTS = {
    "shopping": "People looking at merchandise, carrying shopping bags, and browsing stores.",
    "dining": "People eating food, sitting at tables, and using dining utensils.",
    "commuting": "People waiting for transportation, boarding vehicles, and traveling.",
    "working": "People using computers, attending meetings, and engaged in professional tasks.",
    "exercising": "People engaged in physical activities, using sports equipment, and training.",
    "cooking": "People preparing food, using kitchen equipment, and creating meals.",
    "crossing_street": "People walking across designated crosswalks and navigating intersections.",
    "recreational_activity": "People engaged in leisure activities, games, and social recreation."
}
color_mapper.py CHANGED
@@ -6,7 +6,7 @@ class ColorMapper:
6
  A class for consistent color mapping of object detection classes
7
  Provides color schemes for visualization in both RGB and hex formats
8
  """
9
-
10
  # Class categories for better organization
11
  CATEGORIES = {
12
  "person": [0],
@@ -21,8 +21,9 @@ class ColorMapper:
21
  "electronics": [62, 63, 64, 65, 66, 67, 68, 69, 70],
22
  "household": [71, 72, 73, 74, 75, 76, 77, 78, 79]
23
  }
24
-
25
  # Base colors for each category (in HSV for easier variation)
 
26
  CATEGORY_COLORS = {
27
  "person": (0, 0.8, 0.9), # Red
28
  "vehicles": (210, 0.8, 0.9), # Blue
@@ -36,43 +37,43 @@ class ColorMapper:
36
  "electronics": (240, 0.6, 0.9), # Light Blue
37
  "household": (60, 0.6, 0.9) # Yellow
38
  }
39
-
40
    def __init__(self):
        """Initialize the ColorMapper with COCO class mappings."""
        # id -> class-name lookup for the 80 standard COCO categories.
        self.class_names = self._get_coco_classes()
        # Precomputed per-class and per-category colors (RGB and hex).
        self.color_map = self._generate_color_map()
44
-
45
  def _get_coco_classes(self) -> Dict[int, str]:
46
  """Get the standard COCO class names with their IDs"""
47
  return {
48
  0: 'person', 1: 'bicycle', 2: 'car', 3: 'motorcycle', 4: 'airplane',
49
  5: 'bus', 6: 'train', 7: 'truck', 8: 'boat', 9: 'traffic light',
50
- 10: 'fire hydrant', 11: 'stop sign', 12: 'parking meter', 13: 'bench',
51
  14: 'bird', 15: 'cat', 16: 'dog', 17: 'horse', 18: 'sheep', 19: 'cow',
52
  20: 'elephant', 21: 'bear', 22: 'zebra', 23: 'giraffe', 24: 'backpack',
53
  25: 'umbrella', 26: 'handbag', 27: 'tie', 28: 'suitcase', 29: 'frisbee',
54
  30: 'skis', 31: 'snowboard', 32: 'sports ball', 33: 'kite', 34: 'baseball bat',
55
- 35: 'baseball glove', 36: 'skateboard', 37: 'surfboard', 38: 'tennis racket',
56
  39: 'bottle', 40: 'wine glass', 41: 'cup', 42: 'fork', 43: 'knife',
57
- 44: 'spoon', 45: 'bowl', 46: 'banana', 47: 'apple', 48: 'sandwich',
58
  49: 'orange', 50: 'broccoli', 51: 'carrot', 52: 'hot dog', 53: 'pizza',
59
- 54: 'donut', 55: 'cake', 56: 'chair', 57: 'couch', 58: 'potted plant',
60
  59: 'bed', 60: 'dining table', 61: 'toilet', 62: 'tv', 63: 'laptop',
61
- 64: 'mouse', 65: 'remote', 66: 'keyboard', 67: 'cell phone', 68: 'microwave',
62
  69: 'oven', 70: 'toaster', 71: 'sink', 72: 'refrigerator', 73: 'book',
63
- 74: 'clock', 75: 'vase', 76: 'scissors', 77: 'teddy bear', 78: 'hair drier',
64
  79: 'toothbrush'
65
  }
66
-
67
  def _hsv_to_rgb(self, h: float, s: float, v: float) -> Tuple[int, int, int]:
68
  """
69
  Convert HSV color to RGB
70
-
71
  Args:
72
  h: Hue (0-360)
73
  s: Saturation (0-1)
74
  v: Value (0-1)
75
-
76
  Returns:
77
  Tuple of (R, G, B) values (0-255)
78
  """
@@ -82,7 +83,7 @@ class ColorMapper:
82
  p = v * (1 - s)
83
  q = v * (1 - s * f)
84
  t = v * (1 - s * (1 - f))
85
-
86
  if i == 0:
87
  r, g, b = v, t, p
88
  elif i == 1:
@@ -95,28 +96,28 @@ class ColorMapper:
95
  r, g, b = t, p, v
96
  else:
97
  r, g, b = v, p, q
98
-
99
  return (int(r * 255), int(g * 255), int(b * 255))
100
-
101
  def _rgb_to_hex(self, rgb: Tuple[int, int, int]) -> str:
102
  """
103
  Convert RGB color to hex color code
104
-
105
  Args:
106
  rgb: Tuple of (R, G, B) values (0-255)
107
-
108
  Returns:
109
  Hex color code (e.g. '#FF0000')
110
  """
111
  return f'#{rgb[0]:02x}{rgb[1]:02x}{rgb[2]:02x}'
112
-
113
  def _find_category(self, class_id: int) -> str:
114
  """
115
  Find the category for a given class ID
116
-
117
  Args:
118
  class_id: Class ID (0-79)
119
-
120
  Returns:
121
  Category name
122
  """
@@ -124,11 +125,11 @@ class ColorMapper:
124
  if class_id in ids:
125
  return category
126
  return "other" # Fallback
127
-
128
  def _generate_color_map(self) -> Dict:
129
  """
130
  Generate a color map for all 80 COCO classes
131
-
132
  Returns:
133
  Dictionary mapping class IDs and names to color values
134
  """
@@ -137,7 +138,7 @@ class ColorMapper:
137
  'by_name': {}, # Map class name to RGB and hex
138
  'categories': {} # Map category to base color
139
  }
140
-
141
  # Generate colors for categories
142
  for category, hsv in self.CATEGORY_COLORS.items():
143
  rgb = self._hsv_to_rgb(hsv[0], hsv[1], hsv[2])
@@ -146,54 +147,54 @@ class ColorMapper:
146
  'rgb': rgb,
147
  'hex': hex_color
148
  }
149
-
150
  # Generate variations for each class within a category
151
  for class_id, class_name in self.class_names.items():
152
  category = self._find_category(class_id)
153
  base_hsv = self.CATEGORY_COLORS.get(category, (0, 0, 0.8)) # Default gray
154
-
155
  # Slightly vary the hue and saturation within the category
156
  ids_in_category = self.CATEGORIES.get(category, [])
157
  if ids_in_category:
158
  position = ids_in_category.index(class_id) if class_id in ids_in_category else 0
159
  variation = position / max(1, len(ids_in_category) - 1) # 0 to 1
160
-
161
  # Vary hue slightly (±15°) and saturation
162
  h_offset = 30 * variation - 15 # -15 to +15
163
  s_offset = 0.2 * variation # 0 to 0.2
164
-
165
  h = (base_hsv[0] + h_offset) % 360
166
  s = min(1.0, base_hsv[1] + s_offset)
167
  v = base_hsv[2]
168
  else:
169
  h, s, v = base_hsv
170
-
171
  rgb = self._hsv_to_rgb(h, s, v)
172
  hex_color = self._rgb_to_hex(rgb)
173
-
174
  # Store in both mappings
175
  color_map['by_id'][class_id] = {
176
  'rgb': rgb,
177
  'hex': hex_color,
178
  'category': category
179
  }
180
-
181
  color_map['by_name'][class_name] = {
182
  'rgb': rgb,
183
  'hex': hex_color,
184
  'category': category
185
  }
186
-
187
  return color_map
188
-
189
  def get_color(self, class_identifier: Union[int, str], format: str = 'hex') -> Any:
190
  """
191
  Get color for a specific class
192
-
193
  Args:
194
  class_identifier: Class ID (int) or name (str)
195
  format: Color format ('hex', 'rgb', or 'bgr')
196
-
197
  Returns:
198
  Color in requested format
199
  """
@@ -202,11 +203,11 @@ class ColorMapper:
202
  color_info = self.color_map['by_id'].get(class_identifier)
203
  else:
204
  color_info = self.color_map['by_name'].get(class_identifier)
205
-
206
  if not color_info:
207
  # Fallback color if not found
208
  return '#CCCCCC' if format == 'hex' else (204, 204, 204)
209
-
210
  if format == 'hex':
211
  return color_info['hex']
212
  elif format == 'rgb':
@@ -217,14 +218,14 @@ class ColorMapper:
217
  return (b, g, r)
218
  else:
219
  return color_info['rgb']
220
-
221
  def get_all_colors(self, format: str = 'hex') -> Dict:
222
  """
223
  Get all colors in the specified format
224
-
225
  Args:
226
  format: Color format ('hex', 'rgb', or 'bgr')
227
-
228
  Returns:
229
  Dictionary mapping class names to colors
230
  """
@@ -232,14 +233,14 @@ class ColorMapper:
232
  for class_id, class_name in self.class_names.items():
233
  result[class_name] = self.get_color(class_id, format)
234
  return result
235
-
236
  def get_category_colors(self, format: str = 'hex') -> Dict:
237
  """
238
  Get base colors for each category
239
-
240
  Args:
241
  format: Color format ('hex', 'rgb', or 'bgr')
242
-
243
  Returns:
244
  Dictionary mapping categories to colors
245
  """
@@ -253,14 +254,14 @@ class ColorMapper:
253
  else:
254
  result[category] = color_info['rgb']
255
  return result
256
-
257
  def get_category_for_class(self, class_identifier: Union[int, str]) -> str:
258
  """
259
  Get the category for a specific class
260
-
261
  Args:
262
  class_identifier: Class ID (int) or name (str)
263
-
264
  Returns:
265
  Category name
266
  """
 
6
  A class for consistent color mapping of object detection classes
7
  Provides color schemes for visualization in both RGB and hex formats
8
  """
9
+
10
  # Class categories for better organization
11
  CATEGORIES = {
12
  "person": [0],
 
21
  "electronics": [62, 63, 64, 65, 66, 67, 68, 69, 70],
22
  "household": [71, 72, 73, 74, 75, 76, 77, 78, 79]
23
  }
24
+
25
  # Base colors for each category (in HSV for easier variation)
26
+ # HSV: Hue, Saturation, Value
27
  CATEGORY_COLORS = {
28
  "person": (0, 0.8, 0.9), # Red
29
  "vehicles": (210, 0.8, 0.9), # Blue
 
37
  "electronics": (240, 0.6, 0.9), # Light Blue
38
  "household": (60, 0.6, 0.9) # Yellow
39
  }
40
+
41
  def __init__(self):
42
  """Initialize the ColorMapper with COCO class mappings"""
43
  self.class_names = self._get_coco_classes()
44
  self.color_map = self._generate_color_map()
45
+
46
  def _get_coco_classes(self) -> Dict[int, str]:
47
  """Get the standard COCO class names with their IDs"""
48
  return {
49
  0: 'person', 1: 'bicycle', 2: 'car', 3: 'motorcycle', 4: 'airplane',
50
  5: 'bus', 6: 'train', 7: 'truck', 8: 'boat', 9: 'traffic light',
51
+ 10: 'fire hydrant', 11: 'stop sign', 12: 'parking meter', 13: 'bench',
52
  14: 'bird', 15: 'cat', 16: 'dog', 17: 'horse', 18: 'sheep', 19: 'cow',
53
  20: 'elephant', 21: 'bear', 22: 'zebra', 23: 'giraffe', 24: 'backpack',
54
  25: 'umbrella', 26: 'handbag', 27: 'tie', 28: 'suitcase', 29: 'frisbee',
55
  30: 'skis', 31: 'snowboard', 32: 'sports ball', 33: 'kite', 34: 'baseball bat',
56
+ 35: 'baseball glove', 36: 'skateboard', 37: 'surfboard', 38: 'tennis racket',
57
  39: 'bottle', 40: 'wine glass', 41: 'cup', 42: 'fork', 43: 'knife',
58
+ 44: 'spoon', 45: 'bowl', 46: 'banana', 47: 'apple', 48: 'sandwich',
59
  49: 'orange', 50: 'broccoli', 51: 'carrot', 52: 'hot dog', 53: 'pizza',
60
+ 54: 'donut', 55: 'cake', 56: 'chair', 57: 'couch', 58: 'potted plant',
61
  59: 'bed', 60: 'dining table', 61: 'toilet', 62: 'tv', 63: 'laptop',
62
+ 64: 'mouse', 65: 'remote', 66: 'keyboard', 67: 'cell phone', 68: 'microwave',
63
  69: 'oven', 70: 'toaster', 71: 'sink', 72: 'refrigerator', 73: 'book',
64
+ 74: 'clock', 75: 'vase', 76: 'scissors', 77: 'teddy bear', 78: 'hair drier',
65
  79: 'toothbrush'
66
  }
67
+
68
  def _hsv_to_rgb(self, h: float, s: float, v: float) -> Tuple[int, int, int]:
69
  """
70
  Convert HSV color to RGB
71
+
72
  Args:
73
  h: Hue (0-360)
74
  s: Saturation (0-1)
75
  v: Value (0-1)
76
+
77
  Returns:
78
  Tuple of (R, G, B) values (0-255)
79
  """
 
83
  p = v * (1 - s)
84
  q = v * (1 - s * f)
85
  t = v * (1 - s * (1 - f))
86
+
87
  if i == 0:
88
  r, g, b = v, t, p
89
  elif i == 1:
 
96
  r, g, b = t, p, v
97
  else:
98
  r, g, b = v, p, q
99
+
100
  return (int(r * 255), int(g * 255), int(b * 255))
101
+
102
  def _rgb_to_hex(self, rgb: Tuple[int, int, int]) -> str:
103
  """
104
  Convert RGB color to hex color code
105
+
106
  Args:
107
  rgb: Tuple of (R, G, B) values (0-255)
108
+
109
  Returns:
110
  Hex color code (e.g. '#FF0000')
111
  """
112
  return f'#{rgb[0]:02x}{rgb[1]:02x}{rgb[2]:02x}'
113
+
114
  def _find_category(self, class_id: int) -> str:
115
  """
116
  Find the category for a given class ID
117
+
118
  Args:
119
  class_id: Class ID (0-79)
120
+
121
  Returns:
122
  Category name
123
  """
 
125
  if class_id in ids:
126
  return category
127
  return "other" # Fallback
128
+
129
  def _generate_color_map(self) -> Dict:
130
  """
131
  Generate a color map for all 80 COCO classes
132
+
133
  Returns:
134
  Dictionary mapping class IDs and names to color values
135
  """
 
138
  'by_name': {}, # Map class name to RGB and hex
139
  'categories': {} # Map category to base color
140
  }
141
+
142
  # Generate colors for categories
143
  for category, hsv in self.CATEGORY_COLORS.items():
144
  rgb = self._hsv_to_rgb(hsv[0], hsv[1], hsv[2])
 
147
  'rgb': rgb,
148
  'hex': hex_color
149
  }
150
+
151
  # Generate variations for each class within a category
152
  for class_id, class_name in self.class_names.items():
153
  category = self._find_category(class_id)
154
  base_hsv = self.CATEGORY_COLORS.get(category, (0, 0, 0.8)) # Default gray
155
+
156
  # Slightly vary the hue and saturation within the category
157
  ids_in_category = self.CATEGORIES.get(category, [])
158
  if ids_in_category:
159
  position = ids_in_category.index(class_id) if class_id in ids_in_category else 0
160
  variation = position / max(1, len(ids_in_category) - 1) # 0 to 1
161
+
162
  # Vary hue slightly (±15°) and saturation
163
  h_offset = 30 * variation - 15 # -15 to +15
164
  s_offset = 0.2 * variation # 0 to 0.2
165
+
166
  h = (base_hsv[0] + h_offset) % 360
167
  s = min(1.0, base_hsv[1] + s_offset)
168
  v = base_hsv[2]
169
  else:
170
  h, s, v = base_hsv
171
+
172
  rgb = self._hsv_to_rgb(h, s, v)
173
  hex_color = self._rgb_to_hex(rgb)
174
+
175
  # Store in both mappings
176
  color_map['by_id'][class_id] = {
177
  'rgb': rgb,
178
  'hex': hex_color,
179
  'category': category
180
  }
181
+
182
  color_map['by_name'][class_name] = {
183
  'rgb': rgb,
184
  'hex': hex_color,
185
  'category': category
186
  }
187
+
188
  return color_map
189
+
190
  def get_color(self, class_identifier: Union[int, str], format: str = 'hex') -> Any:
191
  """
192
  Get color for a specific class
193
+
194
  Args:
195
  class_identifier: Class ID (int) or name (str)
196
  format: Color format ('hex', 'rgb', or 'bgr')
197
+
198
  Returns:
199
  Color in requested format
200
  """
 
203
  color_info = self.color_map['by_id'].get(class_identifier)
204
  else:
205
  color_info = self.color_map['by_name'].get(class_identifier)
206
+
207
  if not color_info:
208
  # Fallback color if not found
209
  return '#CCCCCC' if format == 'hex' else (204, 204, 204)
210
+
211
  if format == 'hex':
212
  return color_info['hex']
213
  elif format == 'rgb':
 
218
  return (b, g, r)
219
  else:
220
  return color_info['rgb']
221
+
222
  def get_all_colors(self, format: str = 'hex') -> Dict:
223
  """
224
  Get all colors in the specified format
225
+
226
  Args:
227
  format: Color format ('hex', 'rgb', or 'bgr')
228
+
229
  Returns:
230
  Dictionary mapping class names to colors
231
  """
 
233
  for class_id, class_name in self.class_names.items():
234
  result[class_name] = self.get_color(class_id, format)
235
  return result
236
+
237
  def get_category_colors(self, format: str = 'hex') -> Dict:
238
  """
239
  Get base colors for each category
240
+
241
  Args:
242
  format: Color format ('hex', 'rgb', or 'bgr')
243
+
244
  Returns:
245
  Dictionary mapping categories to colors
246
  """
 
254
  else:
255
  result[category] = color_info['rgb']
256
  return result
257
+
258
  def get_category_for_class(self, class_identifier: Union[int, str]) -> str:
259
  """
260
  Get the category for a specific class
261
+
262
  Args:
263
  class_identifier: Class ID (int) or name (str)
264
+
265
  Returns:
266
  Category name
267
  """
confifence_templates.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+
2
+ CONFIDENCE_TEMPLATES = {
3
+ "high": "{description} {details}",
4
+ "medium": "This appears to be {description} {details}",
5
+ "low": "This might be {description}, but the confidence is low. {details}"
6
+ }
cultural_templates.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ CULTURAL_TEMPLATES = {
3
+ "asian": {
4
+ "elements": ["character signage", "lanterns", "dense urban layout"],
5
+ "description": "The scene shows distinctive Asian cultural elements such as {elements}."
6
+ },
7
+ "european": {
8
+ "elements": ["classical architecture", "cobblestone streets", "café terraces"],
9
+ "description": "The environment has European characteristics including {elements}."
10
+ },
11
+ "middle_eastern": {
12
+ "elements": ["ornate archways", "geometric patterns", "domed structures"],
13
+ "description": "The scene contains Middle Eastern architectural features such as {elements}."
14
+ },
15
+ "north_american": {
16
+ "elements": ["grid street pattern", "modern skyscrapers", "wide boulevards"],
17
+ "description": "The layout shows typical North American urban design with {elements}."
18
+ }
19
+ }
detection_model.py CHANGED
@@ -6,7 +6,7 @@ import os
6
 
7
  class DetectionModel:
8
  """Core detection model class for object detection using YOLOv8"""
9
-
10
  # Model information dictionary
11
  MODEL_INFO = {
12
  "yolov8n.pt": {
@@ -28,11 +28,11 @@ class DetectionModel:
28
  "inference_speed": "Slower"
29
  }
30
  }
31
-
32
- def __init__(self, model_name: str = 'yolov8m.pt', confidence: float = 0.25, iou: float = 0.45):
33
  """
34
  Initialize the detection model
35
-
36
  Args:
37
  model_name: Model name or path, default is yolov8m.pt
38
  confidence: Confidence threshold, default is 0.25
@@ -44,10 +44,10 @@ class DetectionModel:
44
  self.model = None
45
  self.class_names = {}
46
  self.is_model_loaded = False
47
-
48
  # Load model on initialization
49
  self._load_model()
50
-
51
  def _load_model(self):
52
  """Load the YOLO model"""
53
  try:
@@ -60,57 +60,57 @@ class DetectionModel:
60
  except Exception as e:
61
  print(f"Error occurred when loading the model: {e}")
62
  self.is_model_loaded = False
63
-
64
  def change_model(self, new_model_name: str) -> bool:
65
  """
66
  Change the currently loaded model
67
-
68
  Args:
69
  new_model_name: Name of the new model to load
70
-
71
  Returns:
72
  bool: True if model changed successfully, False otherwise
73
  """
74
  if self.model_name == new_model_name and self.is_model_loaded:
75
  print(f"Model {new_model_name} is already loaded")
76
  return True
77
-
78
  print(f"Changing model from {self.model_name} to {new_model_name}")
79
-
80
  # Unload current model to free memory
81
  if self.model is not None:
82
  del self.model
83
  self.model = None
84
-
85
  # Clean GPU memory if available
86
  if torch.cuda.is_available():
87
  torch.cuda.empty_cache()
88
-
89
  # Update model name and load new model
90
  self.model_name = new_model_name
91
  self._load_model()
92
-
93
  return self.is_model_loaded
94
-
95
  def reload_model(self):
96
  """Reload the model (useful for changing model or after error)"""
97
  if self.model is not None:
98
  del self.model
99
  self.model = None
100
-
101
  # Clean GPU memory if available
102
  if torch.cuda.is_available():
103
  torch.cuda.empty_cache()
104
-
105
  self._load_model()
106
-
107
  def detect(self, image_input: Any) -> Optional[Any]:
108
  """
109
  Perform object detection on a single image
110
-
111
  Args:
112
  image_input: Image path (str), PIL Image, or numpy array
113
-
114
  Returns:
115
  Detection result object or None if error occurred
116
  """
@@ -120,27 +120,27 @@ class DetectionModel:
120
  if self.model is None or not self.is_model_loaded:
121
  print("Failed to load model. Cannot perform detection.")
122
  return None
123
-
124
  try:
125
  results = self.model(image_input, conf=self.confidence, iou=self.iou)
126
  return results[0]
127
  except Exception as e:
128
  print(f"Error occurred during detection: {e}")
129
  return None
130
-
131
  def get_class_names(self, class_id: int) -> str:
132
  """Get class name for a given class ID"""
133
  return self.class_names.get(class_id, "Unknown Class")
134
-
135
  def get_supported_classes(self) -> Dict[int, str]:
136
  """Get all supported classes as a dictionary of {id: class_name}"""
137
  return self.class_names
138
-
139
  @classmethod
140
  def get_available_models(cls) -> List[Dict]:
141
  """
142
  Get list of available models with their information
143
-
144
  Returns:
145
  List of dictionaries containing model information
146
  """
@@ -154,7 +154,7 @@ class DetectionModel:
154
  "inference_speed": info["inference_speed"]
155
  })
156
  return models
157
-
158
  @classmethod
159
  def get_model_description(cls, model_name: str) -> str:
160
  """Get description for a specific model"""
 
6
 
7
  class DetectionModel:
8
  """Core detection model class for object detection using YOLOv8"""
9
+
10
  # Model information dictionary
11
  MODEL_INFO = {
12
  "yolov8n.pt": {
 
28
  "inference_speed": "Slower"
29
  }
30
  }
31
+
32
+ def __init__(self, model_name: str = 'yolov8m.pt', confidence: float = 0.25, iou: float = 0.25):
33
  """
34
  Initialize the detection model
35
+
36
  Args:
37
  model_name: Model name or path, default is yolov8m.pt
38
  confidence: Confidence threshold, default is 0.25
 
44
  self.model = None
45
  self.class_names = {}
46
  self.is_model_loaded = False
47
+
48
  # Load model on initialization
49
  self._load_model()
50
+
51
  def _load_model(self):
52
  """Load the YOLO model"""
53
  try:
 
60
  except Exception as e:
61
  print(f"Error occurred when loading the model: {e}")
62
  self.is_model_loaded = False
63
+
64
  def change_model(self, new_model_name: str) -> bool:
65
  """
66
  Change the currently loaded model
67
+
68
  Args:
69
  new_model_name: Name of the new model to load
70
+
71
  Returns:
72
  bool: True if model changed successfully, False otherwise
73
  """
74
  if self.model_name == new_model_name and self.is_model_loaded:
75
  print(f"Model {new_model_name} is already loaded")
76
  return True
77
+
78
  print(f"Changing model from {self.model_name} to {new_model_name}")
79
+
80
  # Unload current model to free memory
81
  if self.model is not None:
82
  del self.model
83
  self.model = None
84
+
85
  # Clean GPU memory if available
86
  if torch.cuda.is_available():
87
  torch.cuda.empty_cache()
88
+
89
  # Update model name and load new model
90
  self.model_name = new_model_name
91
  self._load_model()
92
+
93
  return self.is_model_loaded
94
+
95
  def reload_model(self):
96
  """Reload the model (useful for changing model or after error)"""
97
  if self.model is not None:
98
  del self.model
99
  self.model = None
100
+
101
  # Clean GPU memory if available
102
  if torch.cuda.is_available():
103
  torch.cuda.empty_cache()
104
+
105
  self._load_model()
106
+
107
  def detect(self, image_input: Any) -> Optional[Any]:
108
  """
109
  Perform object detection on a single image
110
+
111
  Args:
112
  image_input: Image path (str), PIL Image, or numpy array
113
+
114
  Returns:
115
  Detection result object or None if error occurred
116
  """
 
120
  if self.model is None or not self.is_model_loaded:
121
  print("Failed to load model. Cannot perform detection.")
122
  return None
123
+
124
  try:
125
  results = self.model(image_input, conf=self.confidence, iou=self.iou)
126
  return results[0]
127
  except Exception as e:
128
  print(f"Error occurred during detection: {e}")
129
  return None
130
+
131
  def get_class_names(self, class_id: int) -> str:
132
  """Get class name for a given class ID"""
133
  return self.class_names.get(class_id, "Unknown Class")
134
+
135
  def get_supported_classes(self) -> Dict[int, str]:
136
  """Get all supported classes as a dictionary of {id: class_name}"""
137
  return self.class_names
138
+
139
  @classmethod
140
  def get_available_models(cls) -> List[Dict]:
141
  """
142
  Get list of available models with their information
143
+
144
  Returns:
145
  List of dictionaries containing model information
146
  """
 
154
  "inference_speed": info["inference_speed"]
155
  })
156
  return models
157
+
158
  @classmethod
159
  def get_model_description(cls, model_name: str) -> str:
160
  """Get description for a specific model"""
enhance_scene_describer.py ADDED
@@ -0,0 +1,1314 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import json
4
+ import random
5
+ import numpy as np
6
+ from typing import Dict, List, Tuple, Any, Optional
7
+
8
+ from scene_type import SCENE_TYPES
9
+ from scene_detail_templates import SCENE_DETAIL_TEMPLATES
10
+ from object_template_fillers import OBJECT_TEMPLATE_FILLERS
11
+ from lighting_conditions import LIGHTING_CONDITIONS
12
+ from viewpoint_templates import VIEWPOINT_TEMPLATES
13
+ from cultural_templates import CULTURAL_TEMPLATES
14
+ from confifence_templates import CONFIDENCE_TEMPLATES
15
+
16
+ class EnhancedSceneDescriber:
17
+ """
18
+ Enhanced scene description generator with improved template handling,
19
+ viewpoint awareness, and cultural context recognition.
20
+ Provides detailed natural language descriptions of scenes based on
21
+ detection results and scene classification.
22
+ """
23
+
24
+ def __init__(self, templates_db: Optional[Dict] = None, scene_types: Optional[Dict] = None):
25
+ """
26
+ Initialize the enhanced scene describer.
27
+
28
+ Args:
29
+ templates_db: Optional custom templates database
30
+ scene_types: Dictionary of scene type definitions
31
+ """
32
+ # Load or use provided scene types
33
+ self.scene_types = scene_types or self._load_default_scene_types()
34
+
35
+ # Load templates database
36
+ self.templates = templates_db or self._load_templates()
37
+
38
+ # Initialize viewpoint detection parameters
39
+ self._initialize_viewpoint_parameters()
40
+
41
+ def _load_default_scene_types(self) -> Dict:
42
+ """
43
+ Load default scene types.
44
+
45
+ Returns:
46
+ Dict: Scene type definitions
47
+ """
48
+
49
+ return SCENE_TYPES
50
+
51
+ def _load_templates(self) -> Dict:
52
+ """
53
+ Load description templates from imported Python modules.
54
+
55
+ Returns:
56
+ Dict: Template collections for different description components
57
+ """
58
+ templates = {}
59
+
60
+ # 直接從導入的 Python 模組中獲取模板
61
+ templates["scene_detail_templates"] = SCENE_DETAIL_TEMPLATES
62
+ templates["object_template_fillers"] = OBJECT_TEMPLATE_FILLERS
63
+ templates["viewpoint_templates"] = VIEWPOINT_TEMPLATES
64
+ templates["cultural_templates"] = CULTURAL_TEMPLATES
65
+
66
+ # 從 LIGHTING_CONDITIONS 獲取照明模板
67
+ templates["lighting_templates"] = {
68
+ key: data["general"] for key, data in LIGHTING_CONDITIONS.get("time_descriptions", {}).items()
69
+ }
70
+
71
+ # 設置默認的置信度模板
72
+ templates["confidence_templates"] = {
73
+ "high": "{description} {details}",
74
+ "medium": "This appears to be {description} {details}",
75
+ "low": "This might be {description}, but the confidence is low. {details}"
76
+ }
77
+
78
+ # 初始化其他必要的模板(現在這個函數簡化了很多)
79
+ self._initialize_default_templates(templates)
80
+
81
+ return templates
82
+
83
+ def _initialize_default_templates(self, templates: Dict):
84
+ """
85
+ 檢查模板字典並填充任何缺失的默認模板。
86
+
87
+ 在將模板移至專門的模組後,此方法主要作為安全機制,
88
+ 確保即使導入失敗或某些模板未在外部定義,系統仍能正常運行。
89
+
90
+ Args:
91
+ templates: 要檢查和更新的模板字典
92
+ """
93
+ # 檢查關鍵模板類型是否存在,如果不存在則添加默認值
94
+
95
+ # 置信度模板 - 用於控制描述的語氣
96
+ if "confidence_templates" not in templates:
97
+ templates["confidence_templates"] = {
98
+ "high": "{description} {details}",
99
+ "medium": "This appears to be {description} {details}",
100
+ "low": "This might be {description}, but the confidence is low. {details}"
101
+ }
102
+
103
+ # 場景細節模板 - 如果未從外部導入
104
+ if "scene_detail_templates" not in templates:
105
+ templates["scene_detail_templates"] = {
106
+ "default": ["A space with various objects."]
107
+ }
108
+
109
+ # 物體填充模板 - 用於生成物體描述
110
+ if "object_template_fillers" not in templates:
111
+ templates["object_template_fillers"] = {
112
+ "default": ["various items"]
113
+ }
114
+
115
+ # 視角模板 - 雖然我們現在從專門模組導入,但作為備份
116
+ if "viewpoint_templates" not in templates:
117
+ # 使用簡化版的默認視角模板
118
+ templates["viewpoint_templates"] = {
119
+ "eye_level": {
120
+ "prefix": "From eye level, ",
121
+ "observation": "the scene is viewed straight on."
122
+ },
123
+ "aerial": {
124
+ "prefix": "From above, ",
125
+ "observation": "the scene is viewed from a bird's-eye perspective."
126
+ }
127
+ }
128
+
129
+ # 文化模板
130
+ if "cultural_templates" not in templates:
131
+ templates["cultural_templates"] = {
132
+ "asian": {
133
+ "elements": ["cultural elements"],
134
+ "description": "The scene has Asian characteristics."
135
+ },
136
+ "european": {
137
+ "elements": ["architectural features"],
138
+ "description": "The scene has European characteristics."
139
+ }
140
+ }
141
+
142
+ # 照明模板 - 用於描述光照條件
143
+ if "lighting_templates" not in templates:
144
+ templates["lighting_templates"] = {
145
+ "day_clear": "The scene is captured during daylight.",
146
+ "night": "The scene is captured at night.",
147
+ "unknown": "The lighting conditions are not easily determined."
148
+ }
149
+
150
+ def _initialize_viewpoint_parameters(self):
151
+ """
152
+ Initialize parameters used for viewpoint detection.
153
+ """
154
+ self.viewpoint_params = {
155
+ # Parameters for detecting aerial views
156
+ "aerial_threshold": 0.7, # High object density viewed from top
157
+ "aerial_size_variance_threshold": 0.15, # Low size variance in aerial views
158
+
159
+ # Parameters for detecting low angle views
160
+ "low_angle_threshold": 0.3, # Bottom-heavy object distribution
161
+ "vertical_size_ratio_threshold": 1.8, # Vertical objects appear taller
162
+
163
+ # Parameters for detecting elevated views
164
+ "elevated_threshold": 0.6, # Objects mostly in middle/bottom
165
+ "elevated_top_threshold": 0.3 # Few objects at top of frame
166
+ }
167
+
168
+
169
+ def generate_description(self,
170
+ scene_type: str,
171
+ detected_objects: List[Dict],
172
+ confidence: float,
173
+ lighting_info: Optional[Dict] = None,
174
+ functional_zones: Optional[Dict] = None) -> str:
175
+ """
176
+ Generate enhanced scene description based on detection results, scene type,
177
+ and additional contextual information.
178
+
179
+ This is the main entry point that replaces the original _generate_scene_description.
180
+
181
+ Args:
182
+ scene_type: Identified scene type
183
+ detected_objects: List of detected objects
184
+ confidence: Scene classification confidence
185
+ lighting_info: Optional lighting condition information
186
+ functional_zones: Optional identified functional zones
187
+
188
+ Returns:
189
+ str: Natural language description of the scene
190
+ """
191
+ # Handle unknown scene type or very low confidence
192
+ if scene_type == "unknown" or confidence < 0.4:
193
+ return self._generate_generic_description(detected_objects, lighting_info)
194
+
195
+ # Detect viewpoint
196
+ viewpoint = self._detect_viewpoint(detected_objects)
197
+
198
+ if viewpoint == "aerial":
199
+ # 如果是十字路口相關的場景,確保使用正確的空中視角十字路口場景類型
200
+ if "intersection" in scene_type or self._is_intersection(detected_objects):
201
+ scene_type = "aerial_view_intersection"
202
+ # 如果是商業區相關的場景
203
+ elif any(keyword in scene_type for keyword in ["commercial", "shopping", "retail"]):
204
+ scene_type = "aerial_view_commercial_area"
205
+ # 如果是廣場相關的場景
206
+ elif any(keyword in scene_type for keyword in ["plaza", "square"]):
207
+ scene_type = "aerial_view_plaza"
208
+ # 其他空中視角場景,預設使用十字路口
209
+ else:
210
+ scene_type = "aerial_view_intersection"
211
+
212
+ # Detect cultural context - 只有在非空中視角時才檢測文化上下文
213
+ cultural_context = None
214
+ if viewpoint != "aerial":
215
+ cultural_context = self._detect_cultural_context(scene_type, detected_objects)
216
+
217
+ # Select appropriate template based on confidence
218
+ if confidence > 0.75:
219
+ confidence_level = "high"
220
+ elif confidence > 0.5:
221
+ confidence_level = "medium"
222
+ else:
223
+ confidence_level = "low"
224
+
225
+ # Get base description for the scene type
226
+ if viewpoint == "aerial":
227
+ # 空中視角時使用已設定的基本描述
228
+ if 'base_description' not in locals():
229
+ base_description = "An aerial view showing the layout and movement patterns from above"
230
+ elif scene_type in self.scene_types:
231
+ base_description = self.scene_types[scene_type].get("description", "A scene")
232
+ else:
233
+ base_description = "A scene"
234
+
235
+ # Generate detailed scene information
236
+ scene_details = self._generate_scene_details(
237
+ scene_type,
238
+ detected_objects,
239
+ lighting_info,
240
+ viewpoint
241
+ )
242
+
243
+ # 修正:根據人數改進描述
244
+ people_objs = [obj for obj in detected_objects if obj["class_id"] == 0] # 人
245
+ if people_objs:
246
+ people_count = len(people_objs)
247
+ if people_count > 5:
248
+ # 當人數很多��,用更精確的措辭
249
+ people_phrase = f"numerous people ({people_count})"
250
+ else:
251
+ people_phrase = f"{people_count} {'people' if people_count > 1 else 'person'}"
252
+
253
+ # 將人數信息加入到場景詳情中
254
+ if "people" not in scene_details.lower() and "pedestrian" not in scene_details.lower():
255
+ scene_details += f" The scene includes {people_phrase}."
256
+
257
+ # Apply cultural context if detected (只在非空中視角時應用)
258
+ if cultural_context and scene_details and viewpoint != "aerial":
259
+ cultural_elements = self._generate_cultural_elements(cultural_context)
260
+ if cultural_elements:
261
+ scene_details += f" {cultural_elements}"
262
+
263
+ # Include lighting information if available
264
+ lighting_description = ""
265
+ if lighting_info and "time_of_day" in lighting_info:
266
+ lighting_type = lighting_info["time_of_day"]
267
+ if lighting_type in self.templates.get("lighting_templates", {}):
268
+ lighting_description = self.templates["lighting_templates"][lighting_type]
269
+
270
+ # Apply confidence template
271
+ description_template = self.templates["confidence_templates"].get(
272
+ confidence_level, "{description} {details}"
273
+ )
274
+
275
+ # Fill the template
276
+ description = description_template.format(
277
+ description=base_description,
278
+ details=scene_details
279
+ )
280
+
281
+ # Add viewpoint observation if viewpoint is not standard
282
+ if viewpoint != "eye_level" and viewpoint in self.templates.get("viewpoint_templates", {}):
283
+ viewpoint_template = self.templates["viewpoint_templates"][viewpoint]
284
+
285
+ # 在空中視角時,確保觀察描述反映更多細節
286
+ if viewpoint == "aerial":
287
+ scene_elements = "the crossing patterns and pedestrian movement"
288
+ else:
289
+ scene_elements = "objects and layout"
290
+
291
+ viewpoint_desc = viewpoint_template.get("observation", "").format(
292
+ scene_elements=scene_elements
293
+ )
294
+
295
+ # Add viewpoint prefix if needed
296
+ if not description.startswith(viewpoint_template.get("prefix", "")):
297
+ description = f"{viewpoint_template.get('prefix', '')}{description}"
298
+
299
+ # Add viewpoint observation if not already included
300
+ if viewpoint_desc not in description:
301
+ description += f" {viewpoint_desc}"
302
+
303
+ # Add lighting description if available
304
+ if lighting_description and lighting_description not in description:
305
+ description += f" {lighting_description}"
306
+
307
+ # Add information about functional zones if available
308
+ if functional_zones and len(functional_zones) > 0:
309
+ zones_desc = self._describe_functional_zones(functional_zones)
310
+ if zones_desc:
311
+ description += f" {zones_desc}"
312
+
313
+ # 計算真實的人數
314
+ people_count = len([obj for obj in detected_objects if obj["class_id"] == 0])
315
+
316
+ # 檢查描述中是否有人數信息的矛盾
317
+ if people_count > 5:
318
+ # 識別可能含有較小人數信息的片段
319
+ small_people_patterns = [
320
+ r"Area with \d+ people\.",
321
+ r"Area with \d+ person\.",
322
+ r"with \d+ people",
323
+ r"with \d+ person"
324
+ ]
325
+ # 對每個模式檢查並移除
326
+ filtered_description = description
327
+ for pattern in small_people_patterns:
328
+ matches = re.findall(pattern, filtered_description)
329
+ for match in matches:
330
+ # 從匹配中提取人數
331
+ number_match = re.search(r'\d+', match)
332
+ if number_match:
333
+ try:
334
+ people_mentioned = int(number_match.group())
335
+ # 如果提到的人數小於總人數,移除整個句子
336
+ if people_mentioned < people_count:
337
+ # 將描述分割成句子
338
+ sentences = re.split(r'(?<=[.!?])\s+', filtered_description)
339
+ # 移除包含匹配片段的句子
340
+ filtered_sentences = []
341
+ for sentence in sentences:
342
+ if match not in sentence:
343
+ filtered_sentences.append(sentence)
344
+ # 重新組合描述
345
+ filtered_description = " ".join(filtered_sentences)
346
+ except ValueError:
347
+ # 數字轉換失敗,繼續處理
348
+ continue
349
+
350
+ # 使用過濾後的描述
351
+ description = filtered_description
352
+
353
+ return description
354
+
355
+ def _is_intersection(self, detected_objects: List[Dict]) -> bool:
356
+ """
357
+ 通過分析物體分佈來判斷場景是否為十字路口
358
+ """
359
+ # 檢查行人分佈模式
360
+ pedestrians = [obj for obj in detected_objects if obj["class_id"] == 0]
361
+
362
+ if len(pedestrians) >= 8: # 需要足夠的行人來形成十字路口
363
+ # 抓取行人位置
364
+ positions = [obj.get("normalized_center", (0, 0)) for obj in pedestrians]
365
+
366
+ # 分析 x 和 y 坐標分佈
367
+ x_coords = [pos[0] for pos in positions]
368
+ y_coords = [pos[1] for pos in positions]
369
+
370
+ # 計算 x 和 y 坐標的變異數
371
+ x_variance = np.var(x_coords) if len(x_coords) > 1 else 0
372
+ y_variance = np.var(y_coords) if len(y_coords) > 1 else 0
373
+
374
+ # 計算範圍
375
+ x_range = max(x_coords) - min(x_coords)
376
+ y_range = max(y_coords) - min(y_coords)
377
+
378
+ # 如果 x 和 y 方向都有較大範圍且範圍相似,那就有可能是十字路口
379
+ if x_range > 0.5 and y_range > 0.5 and 0.7 < (x_range / y_range) < 1.3:
380
+ return True
381
+
382
+ return False
383
+
384
+ def _generate_generic_description(self, detected_objects: List[Dict], lighting_info: Optional[Dict] = None) -> str:
385
+ """
386
+ Generate a generic description when scene type is unknown or confidence is very low.
387
+
388
+ Args:
389
+ detected_objects: List of detected objects
390
+ lighting_info: Optional lighting condition information
391
+
392
+ Returns:
393
+ str: Generic description based on detected objects
394
+ """
395
+ # Count object occurrences
396
+ obj_counts = {}
397
+ for obj in detected_objects:
398
+ class_name = obj["class_name"]
399
+ if class_name not in obj_counts:
400
+ obj_counts[class_name] = 0
401
+ obj_counts[class_name] += 1
402
+
403
+ # Get top objects by count
404
+ top_objects = sorted(obj_counts.items(), key=lambda x: x[1], reverse=True)[:5]
405
+
406
+ if not top_objects:
407
+ base_desc = "No clearly identifiable objects are visible in this scene."
408
+ else:
409
+ # Format object list
410
+ objects_text = []
411
+ for name, count in top_objects:
412
+ if count > 1:
413
+ objects_text.append(f"{count} {name}s")
414
+ else:
415
+ objects_text.append(name)
416
+
417
+ if len(objects_text) == 1:
418
+ objects_list = objects_text[0]
419
+ elif len(objects_text) == 2:
420
+ objects_list = f"{objects_text[0]} and {objects_text[1]}"
421
+ else:
422
+ objects_list = ", ".join(objects_text[:-1]) + f", and {objects_text[-1]}"
423
+
424
+ base_desc = f"This scene contains {objects_list}."
425
+
426
+ # Add lighting information if available
427
+ if lighting_info and "time_of_day" in lighting_info:
428
+ lighting_type = lighting_info["time_of_day"]
429
+ if lighting_type in self.templates.get("lighting_templates", {}):
430
+ lighting_desc = self.templates["lighting_templates"][lighting_type]
431
+ base_desc += f" {lighting_desc}"
432
+
433
+ return base_desc
434
+
435
+ def _generate_scene_details(self,
436
+ scene_type: str,
437
+ detected_objects: List[Dict],
438
+ lighting_info: Optional[Dict] = None,
439
+ viewpoint: str = "eye_level") -> str:
440
+ """
441
+ Generate detailed description based on scene type and detected objects.
442
+
443
+ Args:
444
+ scene_type: Identified scene type
445
+ detected_objects: List of detected objects
446
+ lighting_info: Optional lighting condition information
447
+ viewpoint: Detected viewpoint (aerial, eye_level, etc.)
448
+
449
+ Returns:
450
+ str: Detailed scene description
451
+ """
452
+ # Get scene-specific templates
453
+ scene_details = ""
454
+ scene_templates = self.templates.get("scene_detail_templates", {})
455
+
456
+ # Handle specific scene types
457
+ if scene_type in scene_templates:
458
+ # Select a template appropriate for the viewpoint if available
459
+ viewpoint_key = f"{scene_type}_{viewpoint}"
460
+
461
+ if viewpoint_key in scene_templates:
462
+ # We have a viewpoint-specific template
463
+ templates_list = scene_templates[viewpoint_key]
464
+ else:
465
+ # Fall back to general templates for this scene type
466
+ templates_list = scene_templates[scene_type]
467
+
468
+ # Select a random template from the list
469
+ if templates_list:
470
+ detail_template = random.choice(templates_list)
471
+
472
+ # Fill the template with object information
473
+ scene_details = self._fill_detail_template(
474
+ detail_template,
475
+ detected_objects,
476
+ scene_type
477
+ )
478
+ else:
479
+ # Use default templates if specific ones aren't available
480
+ if "default" in scene_templates:
481
+ detail_template = random.choice(scene_templates["default"])
482
+ scene_details = self._fill_detail_template(
483
+ detail_template,
484
+ detected_objects,
485
+ "default"
486
+ )
487
+ else:
488
+ # Fall back to basic description if no templates are available
489
+ scene_details = self._generate_basic_details(scene_type, detected_objects)
490
+
491
+ return scene_details
492
+
493
+ def _fill_detail_template(self, template: str, detected_objects: List[Dict], scene_type: str) -> str:
494
+ """
495
+ Fill a template with specific details based on detected objects.
496
+
497
+ Args:
498
+ template: Template string with placeholders
499
+ detected_objects: List of detected objects
500
+ scene_type: Identified scene type
501
+
502
+ Returns:
503
+ str: Filled template
504
+ """
505
+ # Find placeholders in the template using simple {placeholder} syntax
506
+ import re
507
+ placeholders = re.findall(r'\{([^}]+)\}', template)
508
+
509
+ filled_template = template
510
+
511
+ # Get object template fillers
512
+ fillers = self.templates.get("object_template_fillers", {})
513
+
514
+ # 為所有可能的變數設置默認值
515
+ default_replacements = {
516
+ # 室內相關
517
+ "furniture": "various furniture pieces",
518
+ "seating": "comfortable seating",
519
+ "electronics": "entertainment devices",
520
+ "bed_type": "a bed",
521
+ "bed_location": "room",
522
+ "bed_description": "sleeping arrangements",
523
+ "extras": "personal items",
524
+ "table_setup": "a dining table and chairs",
525
+ "table_description": "a dining surface",
526
+ "dining_items": "dining furniture and tableware",
527
+ "appliances": "kitchen appliances",
528
+ "kitchen_items": "cooking utensils and dishware",
529
+ "cooking_equipment": "cooking equipment",
530
+ "office_equipment": "work-related furniture and devices",
531
+ "desk_setup": "a desk and chair",
532
+ "computer_equipment": "electronic devices",
533
+
534
+ # 室外/城市相關
535
+ "traffic_description": "vehicles and pedestrians",
536
+ "people_and_vehicles": "people and various vehicles",
537
+ "street_elements": "urban infrastructure",
538
+ "park_features": "benches and greenery",
539
+ "outdoor_elements": "natural features",
540
+ "park_description": "outdoor amenities",
541
+ "store_elements": "merchandise displays",
542
+ "shopping_activity": "customers browse and shop",
543
+ "store_items": "products for sale",
544
+
545
+ # 高級餐廳相關
546
+ "design_elements": "elegant decor",
547
+ "lighting": "stylish lighting fixtures",
548
+
549
+ # 亞洲商業街相關
550
+ "storefront_features": "compact shops",
551
+ "pedestrian_flow": "people walking",
552
+ "asian_elements": "distinctive cultural elements",
553
+ "cultural_elements": "traditional design features",
554
+ "signage": "colorful signs",
555
+ "street_activities": "busy urban activity",
556
+
557
+ # 金融區相關
558
+ "buildings": "tall buildings",
559
+ "traffic_elements": "vehicles",
560
+ "skyscrapers": "high-rise buildings",
561
+ "road_features": "wide streets",
562
+ "architectural_elements": "modern architecture",
563
+ "city_landmarks": "prominent structures",
564
+
565
+ # 十字路口相關
566
+ "crossing_pattern": "marked pedestrian crossings",
567
+ "pedestrian_behavior": "careful walking",
568
+ "pedestrian_density": "groups of pedestrians",
569
+ "traffic_pattern": "regulated traffic flow",
570
+
571
+ # 交通樞紐相關
572
+ "transit_vehicles": "public transportation vehicles",
573
+ "passenger_activity": "commuter movement",
574
+ "transportation_modes": "various transit options",
575
+ "passenger_needs": "waiting areas",
576
+ "transit_infrastructure": "transit facilities",
577
+ "passenger_movement": "commuter flow",
578
+
579
+ # 購物區相關
580
+ "retail_elements": "shops and displays",
581
+ "store_types": "various retail establishments",
582
+ "walkway_features": "pedestrian pathways",
583
+ "commercial_signage": "store signs",
584
+ "consumer_behavior": "shopping activities",
585
+
586
+ # 空中視角相關
587
+ "commercial_layout": "organized retail areas",
588
+ "pedestrian_pattern": "people movement patterns",
589
+ "gathering_features": "public gathering spaces",
590
+ "movement_pattern": "crowd flow patterns",
591
+ "urban_elements": "city infrastructure",
592
+ "public_activity": "social interaction",
593
+
594
+ # 文化特定元素
595
+ "stall_elements": "vendor booths",
596
+ "lighting_features": "decorative lights",
597
+ "food_elements": "food offerings",
598
+ "vendor_stalls": "market stalls",
599
+ "nighttime_activity": "evening commerce",
600
+ "cultural_lighting": "traditional lighting",
601
+ "night_market_sounds": "lively market sounds",
602
+ "evening_crowd_behavior": "nighttime social activity",
603
+ "architectural_elements": "cultural buildings",
604
+ "religious_structures": "sacred buildings",
605
+ "decorative_features": "ornamental designs",
606
+ "cultural_practices": "traditional activities",
607
+ "temple_architecture": "religious structures",
608
+ "sensory_elements": "atmospheric elements",
609
+ "visitor_activities": "cultural experiences",
610
+ "ritual_activities": "ceremonial practices",
611
+ "cultural_symbols": "meaningful symbols",
612
+ "architectural_style": "historical buildings",
613
+ "historic_elements": "traditional architecture",
614
+ "urban_design": "city planning elements",
615
+ "social_behaviors": "public interactions",
616
+ "european_features": "European architectural details",
617
+ "tourist_activities": "visitor activities",
618
+ "local_customs": "regional practices",
619
+
620
+ # 時間特定元素
621
+ "lighting_effects": "artificial lighting",
622
+ "shadow_patterns": "light and shadow",
623
+ "urban_features": "city elements",
624
+ "illuminated_elements": "lit structures",
625
+ "evening_activities": "nighttime activities",
626
+ "light_sources": "lighting points",
627
+ "lit_areas": "illuminated spaces",
628
+ "shadowed_zones": "darker areas",
629
+ "illuminated_signage": "bright signs",
630
+ "colorful_lighting": "multicolored lights",
631
+ "neon_elements": "neon signs",
632
+ "night_crowd_behavior": "evening social patterns",
633
+ "light_displays": "lighting installations",
634
+ "building_features": "architectural elements",
635
+ "nightlife_activities": "evening entertainment",
636
+ "lighting_modifier": "bright",
637
+
638
+ # 混合環境元素
639
+ "transitional_elements": "connecting features",
640
+ "indoor_features": "interior elements",
641
+ "outdoor_setting": "exterior spaces",
642
+ "interior_amenities": "inside comforts",
643
+ "exterior_features": "outside elements",
644
+ "inside_elements": "interior design",
645
+ "outside_spaces": "outdoor areas",
646
+ "dual_environment_benefits": "combined settings",
647
+ "passenger_activities": "waiting behaviors",
648
+ "transportation_types": "transit vehicles",
649
+ "sheltered_elements": "covered areas",
650
+ "exposed_areas": "open sections",
651
+ "waiting_behaviors": "passenger activities",
652
+ "indoor_facilities": "inside services",
653
+ "platform_features": "transit platform elements",
654
+ "transit_routines": "transportation procedures",
655
+
656
+ # 專門場所元素
657
+ "seating_arrangement": "spectator seating",
658
+ "playing_surface": "athletic field",
659
+ "sporting_activities": "sports events",
660
+ "spectator_facilities": "viewer accommodations",
661
+ "competition_space": "sports arena",
662
+ "sports_events": "athletic competitions",
663
+ "viewing_areas": "audience sections",
664
+ "field_elements": "field markings and equipment",
665
+ "game_activities": "competitive play",
666
+ "construction_equipment": "building machinery",
667
+ "building_materials": "construction supplies",
668
+ "construction_activities": "building work",
669
+ "work_elements": "construction tools",
670
+ "structural_components": "building structures",
671
+ "site_equipment": "construction gear",
672
+ "raw_materials": "building supplies",
673
+ "construction_process": "building phases",
674
+ "medical_elements": "healthcare equipment",
675
+ "clinical_activities": "medical procedures",
676
+ "facility_design": "healthcare layout",
677
+ "healthcare_features": "medical facilities",
678
+ "patient_interactions": "care activities",
679
+ "equipment_types": "medical devices",
680
+ "care_procedures": "health services",
681
+ "treatment_spaces": "clinical areas",
682
+ "educational_furniture": "learning furniture",
683
+ "learning_activities": "educational practices",
684
+ "instructional_design": "teaching layout",
685
+ "classroom_elements": "school equipment",
686
+ "teaching_methods": "educational approaches",
687
+ "student_engagement": "learning participation",
688
+ "learning_spaces": "educational areas",
689
+ "educational_tools": "teaching resources",
690
+ "knowledge_transfer": "learning exchanges"
691
+ }
692
+
693
+ # For each placeholder, try to fill with appropriate content
694
+ for placeholder in placeholders:
695
+ if placeholder in fillers:
696
+ # Get random filler for this placeholder
697
+ options = fillers[placeholder]
698
+ if options:
699
+ # Select 1-3 items from the options list
700
+ num_items = min(len(options), random.randint(1, 3))
701
+ selected_items = random.sample(options, num_items)
702
+
703
+ # Create a formatted list
704
+ if len(selected_items) == 1:
705
+ replacement = selected_items[0]
706
+ elif len(selected_items) == 2:
707
+ replacement = f"{selected_items[0]} and {selected_items[1]}"
708
+ else:
709
+ replacement = ", ".join(selected_items[:-1]) + f", and {selected_items[-1]}"
710
+
711
+ # Replace the placeholder
712
+ filled_template = filled_template.replace(f"{{{placeholder}}}", replacement)
713
+ else:
714
+ # Try to fill with scene-specific logic
715
+ replacement = self._generate_placeholder_content(placeholder, detected_objects, scene_type)
716
+ if replacement:
717
+ filled_template = filled_template.replace(f"{{{placeholder}}}", replacement)
718
+ elif placeholder in default_replacements:
719
+ # Use default replacement if available
720
+ filled_template = filled_template.replace(f"{{{placeholder}}}", default_replacements[placeholder])
721
+ else:
722
+ # Last resort default
723
+ filled_template = filled_template.replace(f"{{{placeholder}}}", "various items")
724
+
725
+ return filled_template
726
+
727
+ def _generate_placeholder_content(self, placeholder: str, detected_objects: List[Dict], scene_type: str) -> str:
728
+ """
729
+ Generate content for a template placeholder based on scene-specific logic.
730
+
731
+ Args:
732
+ placeholder: Template placeholder
733
+ detected_objects: List of detected objects
734
+ scene_type: Identified scene type
735
+
736
+ Returns:
737
+ str: Content for the placeholder
738
+ """
739
+ # Handle different types of placeholders with custom logic
740
+ if placeholder == "furniture":
741
+ # Extract furniture items
742
+ furniture_ids = [56, 57, 58, 59, 60, 61] # Example furniture IDs
743
+ furniture_objects = [obj for obj in detected_objects if obj["class_id"] in furniture_ids]
744
+
745
+ if furniture_objects:
746
+ furniture_names = [obj["class_name"] for obj in furniture_objects[:3]]
747
+ return ", ".join(set(furniture_names))
748
+ return "various furniture items"
749
+
750
+ elif placeholder == "electronics":
751
+ # Extract electronic items
752
+ electronics_ids = [62, 63, 64, 65, 66, 67, 68, 69, 70] # Example electronics IDs
753
+ electronics_objects = [obj for obj in detected_objects if obj["class_id"] in electronics_ids]
754
+
755
+ if electronics_objects:
756
+ electronics_names = [obj["class_name"] for obj in electronics_objects[:3]]
757
+ return ", ".join(set(electronics_names))
758
+ return "electronic devices"
759
+
760
+ elif placeholder == "people_count":
761
+ # Count people
762
+ people_count = len([obj for obj in detected_objects if obj["class_id"] == 0])
763
+
764
+ if people_count == 0:
765
+ return "no people"
766
+ elif people_count == 1:
767
+ return "one person"
768
+ elif people_count < 5:
769
+ return f"{people_count} people"
770
+ else:
771
+ return "several people"
772
+
773
+ elif placeholder == "seating":
774
+ # Extract seating items
775
+ seating_ids = [56, 57] # chair, sofa
776
+ seating_objects = [obj for obj in detected_objects if obj["class_id"] in seating_ids]
777
+
778
+ if seating_objects:
779
+ seating_names = [obj["class_name"] for obj in seating_objects[:2]]
780
+ return ", ".join(set(seating_names))
781
+ return "seating arrangements"
782
+
783
+ # Default case - empty string
784
+ return ""
785
+
786
+ def _generate_basic_details(self, scene_type: str, detected_objects: List[Dict]) -> str:
787
+ """
788
+ Generate basic details when templates aren't available.
789
+
790
+ Args:
791
+ scene_type: Identified scene type
792
+ detected_objects: List of detected objects
793
+
794
+ Returns:
795
+ str: Basic scene details
796
+ """
797
+ # Handle specific scene types with custom logic
798
+ if scene_type == "living_room":
799
+ tv_objs = [obj for obj in detected_objects if obj["class_id"] == 62] # TV
800
+ sofa_objs = [obj for obj in detected_objects if obj["class_id"] == 57] # Sofa
801
+
802
+ if tv_objs and sofa_objs:
803
+ tv_region = tv_objs[0]["region"]
804
+ sofa_region = sofa_objs[0]["region"]
805
+
806
+ arrangement = f"The TV is in the {tv_region.replace('_', ' ')} of the image, "
807
+ arrangement += f"while the sofa is in the {sofa_region.replace('_', ' ')}. "
808
+
809
+ return f"{arrangement}This appears to be a space designed for relaxation and entertainment."
810
+
811
+ elif scene_type == "bedroom":
812
+ bed_objs = [obj for obj in detected_objects if obj["class_id"] == 59] # Bed
813
+
814
+ if bed_objs:
815
+ bed_region = bed_objs[0]["region"]
816
+ extra_items = []
817
+
818
+ for obj in detected_objects:
819
+ if obj["class_id"] == 74: # Clock
820
+ extra_items.append("clock")
821
+ elif obj["class_id"] == 73: # Book
822
+ extra_items.append("book")
823
+
824
+ extras = ""
825
+ if extra_items:
826
+ extras = f" There is also a {' and a '.join(extra_items)} visible."
827
+
828
+ return f"The bed is located in the {bed_region.replace('_', ' ')} of the image.{extras}"
829
+
830
+ elif scene_type in ["dining_area", "kitchen"]:
831
+ # Count food and dining-related items
832
+ food_items = []
833
+ for obj in detected_objects:
834
+ if obj["class_id"] in [39, 41, 42, 43, 44, 45]: # Kitchen items
835
+ food_items.append(obj["class_name"])
836
+
837
+ food_str = ""
838
+ if food_items:
839
+ unique_items = list(set(food_items))
840
+ if len(unique_items) <= 3:
841
+ food_str = f" with {', '.join(unique_items)}"
842
+ else:
843
+ food_str = f" with {', '.join(unique_items[:3])} and other items"
844
+
845
+ return f"{food_str}."
846
+
847
+ elif scene_type == "city_street":
848
+ # Count people and vehicles
849
+ people_count = len([obj for obj in detected_objects if obj["class_id"] == 0])
850
+ vehicle_count = len([obj for obj in detected_objects
851
+ if obj["class_id"] in [1, 2, 3, 5, 7]]) # Bicycle, car, motorbike, bus, truck
852
+
853
+ traffic_desc = ""
854
+ if people_count > 0 and vehicle_count > 0:
855
+ traffic_desc = f" with {people_count} {'people' if people_count > 1 else 'person'} and "
856
+ traffic_desc += f"{vehicle_count} {'vehicles' if vehicle_count > 1 else 'vehicle'}"
857
+ elif people_count > 0:
858
+ traffic_desc = f" with {people_count} {'people' if people_count > 1 else 'person'}"
859
+ elif vehicle_count > 0:
860
+ traffic_desc = f" with {vehicle_count} {'vehicles' if vehicle_count > 1 else 'vehicle'}"
861
+
862
+ return f"{traffic_desc}."
863
+
864
+ # Handle more specialized scenes
865
+ elif scene_type == "asian_commercial_street":
866
+ # Look for key urban elements
867
+ people_count = len([obj for obj in detected_objects if obj["class_id"] == 0])
868
+ vehicle_count = len([obj for obj in detected_objects if obj["class_id"] in [1, 2, 3]])
869
+
870
+ # Analyze pedestrian distribution
871
+ people_positions = []
872
+ for obj in detected_objects:
873
+ if obj["class_id"] == 0: # Person
874
+ people_positions.append(obj["normalized_center"])
875
+
876
+ # Check if people are distributed along a line (indicating a walking path)
877
+ structured_path = False
878
+ if len(people_positions) >= 3:
879
+ # Simplified check - see if y-coordinates are similar for multiple people
880
+ y_coords = [pos[1] for pos in people_positions]
881
+ y_mean = sum(y_coords) / len(y_coords)
882
+ y_variance = sum((y - y_mean)**2 for y in y_coords) / len(y_coords)
883
+ if y_variance < 0.05: # Low variance indicates linear arrangement
884
+ structured_path = True
885
+
886
+ street_desc = "A commercial street with "
887
+ if people_count > 0:
888
+ street_desc += f"{people_count} {'pedestrians' if people_count > 1 else 'pedestrian'}"
889
+ if vehicle_count > 0:
890
+ street_desc += f" and {vehicle_count} {'vehicles' if vehicle_count > 1 else 'vehicle'}"
891
+ elif vehicle_count > 0:
892
+ street_desc += f"{vehicle_count} {'vehicles' if vehicle_count > 1 else 'vehicle'}"
893
+ else:
894
+ street_desc += "various commercial elements"
895
+
896
+ if structured_path:
897
+ street_desc += ". The pedestrians appear to be following a defined walking path"
898
+
899
+ # Add cultural elements
900
+ street_desc += ". The signage and architectural elements suggest an Asian urban setting."
901
+
902
+ return street_desc
903
+
904
+ # Default general description
905
+ return "The scene contains various elements characteristic of this environment."
906
+
907
def _detect_viewpoint(self, detected_objects: List[Dict]) -> str:
    """
    Detect the camera viewpoint, with special emphasis on recognizing
    aerial (top-down) views such as crosswalk scenes.

    Heuristics, in priority order:
      1. Crosswalk pattern among pedestrians (cross-shaped or dual linear
         clusters) -> "aerial".
      2. Many pedestrians evenly spread across regions -> "aerial".
      3. Size-variance / region-ratio thresholds -> "aerial", "low_angle",
         or "elevated".
      4. Otherwise "eye_level".

    Args:
        detected_objects: List of detection dicts. Expected keys per item:
            "region", "class_id"; optionally "normalized_area",
            "normalized_size" (w, h), "normalized_center" (x, y).

    Returns:
        str: One of "aerial", "low_angle", "elevated", "eye_level".
    """
    if not detected_objects:
        return "eye_level"  # default

    # Tallies of objects appearing in top/bottom regions of the frame
    top_region_count = 0
    bottom_region_count = 0
    total_objects = len(detected_objects)

    # Normalized areas, used to measure size uniformity (aerial cue)
    sizes = []

    # Height/width ratios, used for low-angle detection
    height_width_ratios = []

    # Pedestrian centers and flag for crosswalk-pattern detection
    people_positions = []
    crosswalk_pattern_detected = False

    for obj in detected_objects:
        # Count objects whose region label places them top/bottom
        region = obj["region"]
        if "top" in region:
            top_region_count += 1
        elif "bottom" in region:
            bottom_region_count += 1

        # Collect normalized area when available
        if "normalized_area" in obj:
            sizes.append(obj["normalized_area"])

        # Collect height/width ratio when size is available
        if "normalized_size" in obj:
            width, height = obj["normalized_size"]
            if width > 0:
                height_width_ratios.append(height / width)

        # Collect pedestrian centers for pattern detection
        if obj["class_id"] == 0:  # person
            if "normalized_center" in obj:
                people_positions.append(obj["normalized_center"])

    # Dedicated logic for zebra-crossing intersections:
    # look for clear vertical + horizontal pedestrian distributions.
    people_objs = [obj for obj in detected_objects if obj["class_id"] == 0]  # persons

    if len(people_objs) >= 8:  # enough people to form a crossing pattern
        # Check for a crosswalk pattern among the collected positions
        if len(people_positions) >= 4:
            # Analyze clustering of positions, looking for linear layouts
            x_coords = [pos[0] for pos in people_positions]
            y_coords = [pos[1] for pos in people_positions]

            # Variance and range of x / y coordinates
            # NOTE(review): x_variance / y_variance are computed but unused here.
            x_variance = np.var(x_coords) if len(x_coords) > 1 else 0
            y_variance = np.var(y_coords) if len(y_coords) > 1 else 0

            x_range = max(x_coords) - min(x_coords)
            y_range = max(y_coords) - min(y_coords)

            # Try to detect a cross-shaped distribution: wide, similar
            # coverage in both x and y suggests an intersection.
            if x_range > 0.5 and y_range > 0.5 and 0.7 < (x_range / y_range) < 1.3:

                # Distance of each point from the crowd's center
                center_x = np.mean(x_coords)
                center_y = np.mean(y_coords)

                # Map points onto the cross axes (horizontal and vertical)
                x_axis_distance = [abs(x - center_x) for x in x_coords]
                y_axis_distance = [abs(y - center_y) for y in y_coords]

                # Points should lie close to one axis (horizontal or vertical);
                # count how many do.
                close_to_axis_count = 0
                for i in range(len(x_coords)):
                    if x_axis_distance[i] < 0.1 or y_axis_distance[i] < 0.1:
                        close_to_axis_count += 1

                # Enough axis-aligned points -> treat as an intersection
                if close_to_axis_count >= len(x_coords) * 0.6:
                    crosswalk_pattern_detected = True

            # If no cross shape was found, try linear cluster detection
            if not crosswalk_pattern_detected:
                # Cluster along x and y independently
                x_clusters = self._detect_linear_clusters(x_coords)
                y_clusters = self._detect_linear_clusters(y_coords)

                # Multiple clusters on both axes may indicate crossing walkways
                if len(x_clusters) >= 2 and len(y_clusters) >= 2:
                    crosswalk_pattern_detected = True

    # Crosswalk pattern takes priority over every other cue
    if crosswalk_pattern_detected:
        return "aerial"

    # Check how pedestrians are distributed across regions
    if len(people_objs) >= 10:
        people_region_counts = {}
        for obj in people_objs:
            region = obj["region"]
            if region not in people_region_counts:
                people_region_counts[region] = 0
            people_region_counts[region] += 1

        # Number of regions containing at least two pedestrians
        region_count = len([r for r, c in people_region_counts.items() if c >= 2])

        # Pedestrians spread over many regions may indicate an aerial view
        if region_count >= 4:
            # Examine how uniform the per-region pedestrian counts are
            region_counts = list(people_region_counts.values())
            region_counts_variance = np.var(region_counts) if len(region_counts) > 1 else 0
            region_counts_mean = np.mean(region_counts) if region_counts else 0

            # A fairly even spread (small variation coefficient) -> aerial
            if region_counts_mean > 0:
                variation_coefficient = region_counts_variance / region_counts_mean
                if variation_coefficient < 0.5:
                    return "aerial"

    # Ratio-based metrics
    top_ratio = top_region_count / total_objects if total_objects > 0 else 0
    bottom_ratio = bottom_region_count / total_objects if total_objects > 0 else 0

    # Normalized size variance across all objects
    size_variance = 0
    if sizes:
        mean_size = sum(sizes) / len(sizes)
        size_variance = sum((s - mean_size) ** 2 for s in sizes) / len(sizes)
        size_variance = size_variance / (mean_size ** 2)  # normalize

    # Average height/width ratio
    avg_height_width_ratio = sum(height_width_ratios) / len(height_width_ratios) if height_width_ratios else 1.0

    # Aerial: uniform sizes, objects spread with few at the bottom
    if (size_variance < self.viewpoint_params["aerial_size_variance_threshold"] and
        bottom_ratio < 0.3 and top_ratio > self.viewpoint_params["aerial_threshold"]):
        return "aerial"

    # Low angle: objects tend to be taller than wide, more objects at the top
    elif (avg_height_width_ratio > self.viewpoint_params["vertical_size_ratio_threshold"] and
          top_ratio > self.viewpoint_params["low_angle_threshold"]):
        return "low_angle"

    # Elevated: more objects at the bottom, few at the top
    elif (bottom_ratio > self.viewpoint_params["elevated_threshold"] and
          top_ratio < self.viewpoint_params["elevated_top_threshold"]):
        return "elevated"

    # Default: eye level
    return "eye_level"
+ def _detect_linear_clusters(self, coords, threshold=0.05):
1072
+ """
1073
+ 檢測坐標中的線性聚類
1074
+
1075
+ Args:
1076
+ coords: 一維坐標列表
1077
+ threshold: 聚類閾值
1078
+
1079
+ Returns:
1080
+ list: 聚類列表
1081
+ """
1082
+ if not coords:
1083
+ return []
1084
+
1085
+ # 排序坐標
1086
+ sorted_coords = sorted(coords)
1087
+
1088
+ clusters = []
1089
+ current_cluster = [sorted_coords[0]]
1090
+
1091
+ for i in range(1, len(sorted_coords)):
1092
+ # 如果當前坐標與前一個接近,添加到當前聚類
1093
+ if sorted_coords[i] - sorted_coords[i-1] < threshold:
1094
+ current_cluster.append(sorted_coords[i])
1095
+ else:
1096
+ # 否則開始新的聚類
1097
+ if len(current_cluster) >= 2: # 至少需要2個點形成聚類
1098
+ clusters.append(current_cluster)
1099
+ current_cluster = [sorted_coords[i]]
1100
+
1101
+ # 添加最後一個cluster
1102
+ if len(current_cluster) >= 2:
1103
+ clusters.append(current_cluster)
1104
+
1105
+ return clusters
1106
+
1107
+ def _detect_cultural_context(self, scene_type: str, detected_objects: List[Dict]) -> Optional[str]:
1108
+ """
1109
+ Detect the likely cultural context of the scene.
1110
+
1111
+ Args:
1112
+ scene_type: Identified scene type
1113
+ detected_objects: List of detected objects
1114
+
1115
+ Returns:
1116
+ Optional[str]: Detected cultural context (asian, european, etc.) or None
1117
+ """
1118
+ # Scene types with explicit cultural contexts
1119
+ cultural_scene_mapping = {
1120
+ "asian_commercial_street": "asian",
1121
+ "asian_night_market": "asian",
1122
+ "asian_temple_area": "asian",
1123
+ "european_plaza": "european"
1124
+ }
1125
+
1126
+ # Check if scene type directly indicates cultural context
1127
+ if scene_type in cultural_scene_mapping:
1128
+ return cultural_scene_mapping[scene_type]
1129
+
1130
+ # No specific cultural context detected
1131
+ return None
1132
+
1133
+ def _generate_cultural_elements(self, cultural_context: str) -> str:
1134
+ """
1135
+ Generate description of cultural elements for the detected context.
1136
+
1137
+ Args:
1138
+ cultural_context: Detected cultural context
1139
+
1140
+ Returns:
1141
+ str: Description of cultural elements
1142
+ """
1143
+ # Get template for this cultural context
1144
+ cultural_templates = self.templates.get("cultural_templates", {})
1145
+
1146
+ if cultural_context in cultural_templates:
1147
+ template = cultural_templates[cultural_context]
1148
+ elements = template.get("elements", [])
1149
+
1150
+ if elements:
1151
+ # Select 1-2 random elements
1152
+ num_elements = min(len(elements), random.randint(1, 2))
1153
+ selected_elements = random.sample(elements, num_elements)
1154
+
1155
+ # Format elements list
1156
+ elements_text = " and ".join(selected_elements) if num_elements == 2 else selected_elements[0]
1157
+
1158
+ # Fill template
1159
+ return template.get("description", "").format(elements=elements_text)
1160
+
1161
+ return ""
1162
+
1163
def _optimize_object_description(self, description: str) -> str:
    """
    Optimize an object description by de-duplicating repeated item mentions.

    Two fixes are applied:
      1. "a bed in the room" is shortened to "a bed".
      2. Comma lists following "with" (e.g. "with chair, chair, table") are
         rewritten with counts ("with 2 chairs and table").

    Args:
        description: Generated description text

    Returns:
        str: Description with duplicate listings collapsed
    """
    import re

    # Collapse the redundant bed phrasing
    if "bed in the room" in description:
        description = description.replace("a bed in the room", "a bed")

    # Find comma-separated item lists introduced by "with", ending at a
    # period or "and"
    object_lists = re.findall(r'with ([^\.]+?)(?:\.|\band\b)', description)

    for obj_list in object_lists:
        # Count occurrences of each item in the list
        items = re.findall(r'([a-zA-Z\s]+)(?:,|\band\b|$)', obj_list)
        item_counts = {}

        for item in items:
            item = item.strip()
            # Skip connective words and empty fragments
            if item and item not in ["and", "with"]:
                if item not in item_counts:
                    item_counts[item] = 0
                item_counts[item] += 1

        # Rebuild the list with counts for repeated items
        if item_counts:
            new_items = []
            for item, count in item_counts.items():
                if count > 1:
                    # naive pluralization by appending "s"
                    new_items.append(f"{count} {item}s")
                else:
                    new_items.append(item)

            # Natural-English join: "a", "a and b", "a, b, and c"
            if len(new_items) == 1:
                new_list = new_items[0]
            elif len(new_items) == 2:
                new_list = f"{new_items[0]} and {new_items[1]}"
            else:
                new_list = ", ".join(new_items[:-1]) + f", and {new_items[-1]}"

            # Swap the original list text for the de-duplicated version
            description = description.replace(obj_list, new_list)

    return description
def _describe_functional_zones(self, functional_zones: Dict) -> str:
    """
    Generate a description of the scene's functional zones, with special
    handling for pedestrian areas, people counting, and de-duplicated
    object listings.

    Args:
        functional_zones: Dict of identified functional zones; each value
            may carry "objects" (list of class names) and "description".

    Returns:
        str: Functional-zone description ("" when there is nothing to say)
    """
    if not functional_zones:
        return ""

    # Total number of people in the scene
    total_people_count = 0
    people_by_zone = {}

    # Count the people in each zone and accumulate the total
    for zone_name, zone_info in functional_zones.items():
        if "objects" in zone_info:
            zone_people_count = zone_info["objects"].count("person")
            people_by_zone[zone_name] = zone_people_count
            total_people_count += zone_people_count

    # Split zones into pedestrian-related zones and everything else
    pedestrian_zones = []
    other_zones = []

    for zone_name, zone_info in functional_zones.items():
        # Pedestrian-related zones are recognized by keywords in the name
        if any(keyword in zone_name.lower() for keyword in ["pedestrian", "crossing", "people"]):
            pedestrian_zones.append((zone_name, zone_info))
        else:
            other_zones.append((zone_name, zone_info))

    # Keep the most significant pedestrian zone and other zones
    main_pedestrian_zones = sorted(pedestrian_zones,
                                   key=lambda z: people_by_zone.get(z[0], 0),
                                   reverse=True)[:1]  # at most 1 main pedestrian zone

    top_other_zones = sorted(other_zones,
                             key=lambda z: len(z[1].get("objects", [])),
                             reverse=True)[:2]  # at most 2 other zones

    # Merge the selections
    top_zones = main_pedestrian_zones + top_other_zones

    if not top_zones:
        return ""

    # Build the summary sentence
    summary = ""
    max_mentioned_people = 0  # track the largest people count already mentioned

    # Mention the total headcount when it is significant and not yet stated
    if total_people_count > 5:
        summary = f"The scene contains a significant number of pedestrians ({total_people_count} people). "
        max_mentioned_people = total_people_count  # update the largest mentioned count

    # Rewrite each zone description so people counts stay consistent
    processed_zones = []

    for zone_name, zone_info in top_zones:
        zone_desc = zone_info.get("description", "a functional zone")
        zone_people_count = people_by_zone.get(zone_name, 0)

        # Does the description already mention a people count?
        contains_people_info = "with" in zone_desc and ("person" in zone_desc.lower() or "people" in zone_desc.lower())

        # Drop the smaller per-zone count when a larger total was mentioned
        if contains_people_info and zone_people_count < max_mentioned_people:
            parts = zone_desc.split("with")
            if len(parts) > 1:
                # Strip the people-count tail
                zone_desc = parts[0].strip() + " area"

        processed_zones.append((zone_name, {"description": zone_desc}))

    # Assemble the final description based on how many zones survived
    final_desc = ""

    if len(processed_zones) == 1:
        _, zone_info = processed_zones[0]
        zone_desc = zone_info["description"]
        final_desc = summary + f"The scene includes {zone_desc}."
    elif len(processed_zones) == 2:
        _, zone1_info = processed_zones[0]
        _, zone2_info = processed_zones[1]
        zone1_desc = zone1_info["description"]
        zone2_desc = zone2_info["description"]
        final_desc = summary + f"The scene is divided into two main areas: {zone1_desc} and {zone2_desc}."
    else:
        zones_desc = ["The scene contains multiple functional areas including"]
        zone_descriptions = [z[1]["description"] for z in processed_zones]

        # Natural-English join for three or more zone descriptions
        if len(zone_descriptions) == 3:
            formatted_desc = f"{zone_descriptions[0]}, {zone_descriptions[1]}, and {zone_descriptions[2]}"
        else:
            formatted_desc = ", ".join(zone_descriptions[:-1]) + f", and {zone_descriptions[-1]}"

        final_desc = summary + f"{zones_desc[0]} {formatted_desc}."

    # De-duplicate repeated item mentions before returning
    return self._optimize_object_description(final_desc)
image_processor.py CHANGED
@@ -11,64 +11,125 @@ from detection_model import DetectionModel
11
  from color_mapper import ColorMapper
12
  from visualization_helper import VisualizationHelper
13
  from evaluation_metrics import EvaluationMetrics
 
 
14
 
15
  class ImageProcessor:
16
  """
17
  Class for handling image processing and object detection operations
18
  Separates processing logic from UI components
19
  """
20
-
21
  def __init__(self):
22
  """Initialize the image processor with required components"""
23
  self.color_mapper = ColorMapper()
24
  self.model_instances = {}
25
-
26
- def get_model_instance(self, model_name: str, confidence: float = 0.25, iou: float = 0.35) -> DetectionModel:
 
27
  """
28
  Get or create a model instance based on model name
29
-
30
  Args:
31
  model_name: Name of the model to use
32
  confidence: Confidence threshold for detection
33
  iou: IoU threshold for non-maximum suppression
34
-
35
  Returns:
36
  DetectionModel instance
37
  """
38
  if model_name not in self.model_instances:
39
  print(f"Creating new model instance for {model_name}")
40
  self.model_instances[model_name] = DetectionModel(
41
- model_name=model_name,
42
- confidence=confidence,
43
  iou=iou
44
  )
45
  else:
46
  print(f"Using existing model instance for {model_name}")
47
  self.model_instances[model_name].confidence = confidence
48
-
49
  return self.model_instances[model_name]
50
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
  def process_image(self, image, model_name: str, confidence_threshold: float, filter_classes: Optional[List[int]] = None) -> Tuple[Any, str, Dict]:
52
  """
53
  Process an image for object detection
54
-
55
  Args:
56
  image: Input image (numpy array or PIL Image)
57
  model_name: Name of the model to use
58
  confidence_threshold: Confidence threshold for detection
59
  filter_classes: Optional list of classes to filter results
60
-
61
  Returns:
62
  Tuple of (result_image, result_text, stats_data)
63
  """
64
  # Get model instance
65
  model_instance = self.get_model_instance(model_name, confidence_threshold)
66
-
67
  # Initialize key variables
68
  result = None
69
  stats = {}
70
  temp_path = None
71
-
72
  try:
73
  # Processing input image
74
  if isinstance(image, np.ndarray):
@@ -82,44 +143,51 @@ class ImageProcessor:
82
  return None, "No image provided. Please upload an image.", {}
83
  else:
84
  pil_image = image
85
-
 
 
 
86
  # Store temp files
87
  temp_dir = tempfile.gettempdir() # Use system temp directory
88
  temp_filename = f"temp_{uuid.uuid4().hex}.jpg"
89
  temp_path = os.path.join(temp_dir, temp_filename)
90
  pil_image.save(temp_path)
91
-
92
  # Object detection
93
  result = model_instance.detect(temp_path)
94
-
95
  if result is None:
96
  return None, "Detection failed. Please try again with a different image.", {}
97
-
98
  # Calculate stats
99
  stats = EvaluationMetrics.calculate_basic_stats(result)
100
-
101
  # Add space calculation
102
  spatial_metrics = EvaluationMetrics.calculate_distance_metrics(result)
103
  stats["spatial_metrics"] = spatial_metrics
104
-
 
 
 
105
  # Apply filter if specified
106
  if filter_classes and len(filter_classes) > 0:
107
  # Get classes, boxes, confidence
108
  classes = result.boxes.cls.cpu().numpy().astype(int)
109
  confs = result.boxes.conf.cpu().numpy()
110
  boxes = result.boxes.xyxy.cpu().numpy()
111
-
112
  mask = np.zeros_like(classes, dtype=bool)
113
  for cls_id in filter_classes:
114
  mask = np.logical_or(mask, classes == cls_id)
115
-
116
  filtered_stats = {
117
  "total_objects": int(np.sum(mask)),
118
  "class_statistics": {},
119
  "average_confidence": float(np.mean(confs[mask])) if np.any(mask) else 0,
120
- "spatial_metrics": stats["spatial_metrics"]
 
121
  }
122
-
123
  # Update stats
124
  names = result.names
125
  for cls, conf in zip(classes[mask], confs[mask]):
@@ -129,59 +197,67 @@ class ImageProcessor:
129
  "count": 0,
130
  "average_confidence": 0
131
  }
132
-
133
  filtered_stats["class_statistics"][cls_name]["count"] += 1
134
  filtered_stats["class_statistics"][cls_name]["average_confidence"] = conf
135
-
136
  stats = filtered_stats
137
-
138
  viz_data = EvaluationMetrics.generate_visualization_data(
139
  result,
140
  self.color_mapper.get_all_colors()
141
  )
142
-
143
  result_image = VisualizationHelper.visualize_detection(
144
  temp_path, result, color_mapper=self.color_mapper, figsize=(12, 12), return_pil=True, filter_classes=filter_classes
145
  )
146
-
147
  result_text = EvaluationMetrics.format_detection_summary(viz_data)
148
-
 
 
 
 
 
 
 
149
  return result_image, result_text, stats
150
-
151
  except Exception as e:
152
  error_message = f"Error Occurs: {str(e)}"
153
  import traceback
154
  traceback.print_exc()
155
  print(error_message)
156
  return None, error_message, {}
157
-
158
  finally:
159
  if temp_path and os.path.exists(temp_path):
160
  try:
161
  os.remove(temp_path)
162
  except Exception as e:
163
  print(f"Cannot delete temp files {temp_path}: {str(e)}")
164
-
 
165
  def format_result_text(self, stats: Dict) -> str:
166
  """
167
  Format detection statistics into readable text with improved spacing
168
-
169
  Args:
170
  stats: Dictionary containing detection statistics
171
-
172
  Returns:
173
  Formatted text summary
174
  """
175
  if not stats or "total_objects" not in stats:
176
  return "No objects detected."
177
-
178
  # 減少不必要的空行
179
  lines = [
180
  f"Detected {stats['total_objects']} objects.",
181
  f"Average confidence: {stats.get('average_confidence', 0):.2f}",
182
  "Objects by class:"
183
  ]
184
-
185
  if "class_statistics" in stats and stats["class_statistics"]:
186
  # 按計數排序類別
187
  sorted_classes = sorted(
@@ -189,24 +265,24 @@ class ImageProcessor:
189
  key=lambda x: x[1]["count"],
190
  reverse=True
191
  )
192
-
193
  for cls_name, cls_stats in sorted_classes:
194
  count = cls_stats["count"]
195
  conf = cls_stats.get("average_confidence", 0)
196
-
197
  item_text = "item" if count == 1 else "items"
198
  lines.append(f"• {cls_name}: {count} {item_text} (avg conf: {conf:.2f})")
199
  else:
200
  lines.append("No class information available.")
201
-
202
  # 添加空間信息
203
  if "spatial_metrics" in stats and "spatial_distribution" in stats["spatial_metrics"]:
204
  lines.append("Object Distribution:")
205
-
206
  dist = stats["spatial_metrics"]["spatial_distribution"]
207
  x_mean = dist.get("x_mean", 0)
208
  y_mean = dist.get("y_mean", 0)
209
-
210
  # 描述物體的大致位置
211
  if x_mean < 0.33:
212
  h_pos = "on the left side"
@@ -214,37 +290,37 @@ class ImageProcessor:
214
  h_pos = "in the center"
215
  else:
216
  h_pos = "on the right side"
217
-
218
  if y_mean < 0.33:
219
  v_pos = "in the upper part"
220
  elif y_mean < 0.67:
221
  v_pos = "in the middle"
222
  else:
223
  v_pos = "in the lower part"
224
-
225
  lines.append(f"• Most objects appear {h_pos} {v_pos} of the image")
226
-
227
  return "\n".join(lines)
228
-
229
  def format_json_for_display(self, stats: Dict) -> Dict:
230
  """
231
  Format statistics JSON for better display
232
-
233
  Args:
234
  stats: Raw statistics dictionary
235
-
236
  Returns:
237
  Formatted statistics structure for display
238
  """
239
  # Create a cleaner copy of the stats for display
240
  display_stats = {}
241
-
242
  # Add summary section
243
  display_stats["summary"] = {
244
  "total_objects": stats.get("total_objects", 0),
245
  "average_confidence": round(stats.get("average_confidence", 0), 3)
246
  }
247
-
248
  # Add class statistics in a more organized way
249
  if "class_statistics" in stats and stats["class_statistics"]:
250
  # Sort classes by count (descending)
@@ -253,20 +329,20 @@ class ImageProcessor:
253
  key=lambda x: x[1].get("count", 0),
254
  reverse=True
255
  )
256
-
257
  class_stats = {}
258
  for cls_name, cls_data in sorted_classes:
259
  class_stats[cls_name] = {
260
  "count": cls_data.get("count", 0),
261
  "average_confidence": round(cls_data.get("average_confidence", 0), 3)
262
  }
263
-
264
  display_stats["detected_objects"] = class_stats
265
-
266
  # Simplify spatial metrics
267
  if "spatial_metrics" in stats:
268
  spatial = stats["spatial_metrics"]
269
-
270
  # Simplify spatial distribution
271
  if "spatial_distribution" in spatial:
272
  dist = spatial["spatial_distribution"]
@@ -278,7 +354,7 @@ class ImageProcessor:
278
  "y_std": round(dist.get("y_std", 0), 3)
279
  }
280
  }
281
-
282
  # Add simplified size information
283
  if "size_distribution" in spatial:
284
  size = spatial["size_distribution"]
@@ -287,30 +363,30 @@ class ImageProcessor:
287
  "min_area": round(size.get("min_area", 0), 3),
288
  "max_area": round(size.get("max_area", 0), 3)
289
  }
290
-
291
  return display_stats
292
-
293
  def prepare_visualization_data(self, stats: Dict, available_classes: Dict[int, str]) -> Dict:
294
  """
295
  Prepare data for visualization based on detection statistics
296
-
297
  Args:
298
  stats: Detection statistics
299
  available_classes: Dictionary of available class IDs and names
300
-
301
  Returns:
302
  Visualization data dictionary
303
  """
304
  if not stats or "class_statistics" not in stats or not stats["class_statistics"]:
305
  return {"error": "No detection data available"}
306
-
307
  # Prepare visualization data
308
  viz_data = {
309
  "total_objects": stats.get("total_objects", 0),
310
  "average_confidence": stats.get("average_confidence", 0),
311
  "class_data": []
312
  }
313
-
314
  # Class data
315
  for cls_name, cls_stats in stats.get("class_statistics", {}).items():
316
  # Search class ID
@@ -319,7 +395,7 @@ class ImageProcessor:
319
  if name == cls_name:
320
  class_id = id
321
  break
322
-
323
  cls_data = {
324
  "name": cls_name,
325
  "class_id": class_id,
@@ -327,10 +403,10 @@ class ImageProcessor:
327
  "average_confidence": cls_stats.get("average_confidence", 0),
328
  "color": self.color_mapper.get_color(class_id if class_id >= 0 else cls_name)
329
  }
330
-
331
  viz_data["class_data"].append(cls_data)
332
-
333
  # Descending order
334
  viz_data["class_data"].sort(key=lambda x: x["count"], reverse=True)
335
-
336
  return viz_data
 
11
  from color_mapper import ColorMapper
12
  from visualization_helper import VisualizationHelper
13
  from evaluation_metrics import EvaluationMetrics
14
+ from lighting_analyzer import LightingAnalyzer
15
+ from scene_analyzer import SceneAnalyzer
16
 
17
  class ImageProcessor:
18
  """
19
  Class for handling image processing and object detection operations
20
  Separates processing logic from UI components
21
  """
22
+
23
  def __init__(self):
24
  """Initialize the image processor with required components"""
25
  self.color_mapper = ColorMapper()
26
  self.model_instances = {}
27
+ self.lighting_analyzer = LightingAnalyzer()
28
+
29
+ def get_model_instance(self, model_name: str, confidence: float = 0.25, iou: float = 0.25) -> DetectionModel:
30
  """
31
  Get or create a model instance based on model name
32
+
33
  Args:
34
  model_name: Name of the model to use
35
  confidence: Confidence threshold for detection
36
  iou: IoU threshold for non-maximum suppression
37
+
38
  Returns:
39
  DetectionModel instance
40
  """
41
  if model_name not in self.model_instances:
42
  print(f"Creating new model instance for {model_name}")
43
  self.model_instances[model_name] = DetectionModel(
44
+ model_name=model_name,
45
+ confidence=confidence,
46
  iou=iou
47
  )
48
  else:
49
  print(f"Using existing model instance for {model_name}")
50
  self.model_instances[model_name].confidence = confidence
51
+
52
  return self.model_instances[model_name]
53
+
54
+ def analyze_scene(self, detection_result: Any, lighting_info: Optional[Dict] = None) -> Dict:
55
+ """
56
+ Perform scene analysis on detection results
57
+
58
+ Args:
59
+ detection_result: Object detection result from YOLOv8
60
+ lighting_info: Lighting condition analysis results (optional)
61
+
62
+ Returns:
63
+ Dictionary containing scene analysis results
64
+ """
65
+ try:
66
+ # Initialize scene analyzer if not already done
67
+ if not hasattr(self, 'scene_analyzer'):
68
+ self.scene_analyzer = SceneAnalyzer(class_names=detection_result.names)
69
+
70
+ # 確保類名正確更新
71
+ if self.scene_analyzer.class_names is None:
72
+ self.scene_analyzer.class_names = detection_result.names
73
+ self.scene_analyzer.spatial_analyzer.class_names = detection_result.names
74
+
75
+ # Perform scene analysis with lighting info
76
+ scene_analysis = self.scene_analyzer.analyze(
77
+ detection_result=detection_result,
78
+ lighting_info=lighting_info,
79
+ class_confidence_threshold=0.35,
80
+ scene_confidence_threshold=0.6
81
+ )
82
+
83
+ return scene_analysis
84
+ except Exception as e:
85
+ print(f"Error in scene analysis: {str(e)}")
86
+ import traceback
87
+ traceback.print_exc()
88
+ return {
89
+ "scene_type": "unknown",
90
+ "confidence": 0.0,
91
+ "description": f"Error during scene analysis: {str(e)}",
92
+ "objects_present": [],
93
+ "object_count": 0,
94
+ "regions": {},
95
+ "possible_activities": [],
96
+ "safety_concerns": [],
97
+ "lighting_conditions": lighting_info or {"time_of_day": "unknown", "confidence": 0.0}
98
+ }
99
+
100
+ def analyze_lighting_conditions(self, image):
101
+ """
102
+ 分析光照條件。
103
+
104
+ Args:
105
+ image: 輸入圖像
106
+
107
+ Returns:
108
+ Dict: 光照分析結果
109
+ """
110
+ return self.lighting_analyzer.analyze(image)
111
+
112
  def process_image(self, image, model_name: str, confidence_threshold: float, filter_classes: Optional[List[int]] = None) -> Tuple[Any, str, Dict]:
113
  """
114
  Process an image for object detection
115
+
116
  Args:
117
  image: Input image (numpy array or PIL Image)
118
  model_name: Name of the model to use
119
  confidence_threshold: Confidence threshold for detection
120
  filter_classes: Optional list of classes to filter results
121
+
122
  Returns:
123
  Tuple of (result_image, result_text, stats_data)
124
  """
125
  # Get model instance
126
  model_instance = self.get_model_instance(model_name, confidence_threshold)
127
+
128
  # Initialize key variables
129
  result = None
130
  stats = {}
131
  temp_path = None
132
+
133
  try:
134
  # Processing input image
135
  if isinstance(image, np.ndarray):
 
143
  return None, "No image provided. Please upload an image.", {}
144
  else:
145
  pil_image = image
146
+
147
+ # Analyze lighting conditions
148
+ lighting_info = self.analyze_lighting_conditions(pil_image)
149
+
150
  # Store temp files
151
  temp_dir = tempfile.gettempdir() # Use system temp directory
152
  temp_filename = f"temp_{uuid.uuid4().hex}.jpg"
153
  temp_path = os.path.join(temp_dir, temp_filename)
154
  pil_image.save(temp_path)
155
+
156
  # Object detection
157
  result = model_instance.detect(temp_path)
158
+
159
  if result is None:
160
  return None, "Detection failed. Please try again with a different image.", {}
161
+
162
  # Calculate stats
163
  stats = EvaluationMetrics.calculate_basic_stats(result)
164
+
165
  # Add space calculation
166
  spatial_metrics = EvaluationMetrics.calculate_distance_metrics(result)
167
  stats["spatial_metrics"] = spatial_metrics
168
+
169
+ # Add lighting information
170
+ stats["lighting_conditions"] = lighting_info
171
+
172
  # Apply filter if specified
173
  if filter_classes and len(filter_classes) > 0:
174
  # Get classes, boxes, confidence
175
  classes = result.boxes.cls.cpu().numpy().astype(int)
176
  confs = result.boxes.conf.cpu().numpy()
177
  boxes = result.boxes.xyxy.cpu().numpy()
178
+
179
  mask = np.zeros_like(classes, dtype=bool)
180
  for cls_id in filter_classes:
181
  mask = np.logical_or(mask, classes == cls_id)
182
+
183
  filtered_stats = {
184
  "total_objects": int(np.sum(mask)),
185
  "class_statistics": {},
186
  "average_confidence": float(np.mean(confs[mask])) if np.any(mask) else 0,
187
+ "spatial_metrics": stats["spatial_metrics"],
188
+ "lighting_conditions": lighting_info
189
  }
190
+
191
  # Update stats
192
  names = result.names
193
  for cls, conf in zip(classes[mask], confs[mask]):
 
197
  "count": 0,
198
  "average_confidence": 0
199
  }
200
+
201
  filtered_stats["class_statistics"][cls_name]["count"] += 1
202
  filtered_stats["class_statistics"][cls_name]["average_confidence"] = conf
203
+
204
  stats = filtered_stats
205
+
206
  viz_data = EvaluationMetrics.generate_visualization_data(
207
  result,
208
  self.color_mapper.get_all_colors()
209
  )
210
+
211
  result_image = VisualizationHelper.visualize_detection(
212
  temp_path, result, color_mapper=self.color_mapper, figsize=(12, 12), return_pil=True, filter_classes=filter_classes
213
  )
214
+
215
  result_text = EvaluationMetrics.format_detection_summary(viz_data)
216
+
217
+ if result is not None:
218
+ # Perform scene analysis with lighting info
219
+ scene_analysis = self.analyze_scene(result, lighting_info)
220
+
221
+ # Add scene analysis to stats
222
+ stats["scene_analysis"] = scene_analysis
223
+
224
  return result_image, result_text, stats
225
+
226
  except Exception as e:
227
  error_message = f"Error Occurs: {str(e)}"
228
  import traceback
229
  traceback.print_exc()
230
  print(error_message)
231
  return None, error_message, {}
232
+
233
  finally:
234
  if temp_path and os.path.exists(temp_path):
235
  try:
236
  os.remove(temp_path)
237
  except Exception as e:
238
  print(f"Cannot delete temp files {temp_path}: {str(e)}")
239
+
240
+
241
  def format_result_text(self, stats: Dict) -> str:
242
  """
243
  Format detection statistics into readable text with improved spacing
244
+
245
  Args:
246
  stats: Dictionary containing detection statistics
247
+
248
  Returns:
249
  Formatted text summary
250
  """
251
  if not stats or "total_objects" not in stats:
252
  return "No objects detected."
253
+
254
  # 減少不必要的空行
255
  lines = [
256
  f"Detected {stats['total_objects']} objects.",
257
  f"Average confidence: {stats.get('average_confidence', 0):.2f}",
258
  "Objects by class:"
259
  ]
260
+
261
  if "class_statistics" in stats and stats["class_statistics"]:
262
  # 按計數排序類別
263
  sorted_classes = sorted(
 
265
  key=lambda x: x[1]["count"],
266
  reverse=True
267
  )
268
+
269
  for cls_name, cls_stats in sorted_classes:
270
  count = cls_stats["count"]
271
  conf = cls_stats.get("average_confidence", 0)
272
+
273
  item_text = "item" if count == 1 else "items"
274
  lines.append(f"• {cls_name}: {count} {item_text} (avg conf: {conf:.2f})")
275
  else:
276
  lines.append("No class information available.")
277
+
278
  # 添加空間信息
279
  if "spatial_metrics" in stats and "spatial_distribution" in stats["spatial_metrics"]:
280
  lines.append("Object Distribution:")
281
+
282
  dist = stats["spatial_metrics"]["spatial_distribution"]
283
  x_mean = dist.get("x_mean", 0)
284
  y_mean = dist.get("y_mean", 0)
285
+
286
  # 描述物體的大致位置
287
  if x_mean < 0.33:
288
  h_pos = "on the left side"
 
290
  h_pos = "in the center"
291
  else:
292
  h_pos = "on the right side"
293
+
294
  if y_mean < 0.33:
295
  v_pos = "in the upper part"
296
  elif y_mean < 0.67:
297
  v_pos = "in the middle"
298
  else:
299
  v_pos = "in the lower part"
300
+
301
  lines.append(f"• Most objects appear {h_pos} {v_pos} of the image")
302
+
303
  return "\n".join(lines)
304
+
305
  def format_json_for_display(self, stats: Dict) -> Dict:
306
  """
307
  Format statistics JSON for better display
308
+
309
  Args:
310
  stats: Raw statistics dictionary
311
+
312
  Returns:
313
  Formatted statistics structure for display
314
  """
315
  # Create a cleaner copy of the stats for display
316
  display_stats = {}
317
+
318
  # Add summary section
319
  display_stats["summary"] = {
320
  "total_objects": stats.get("total_objects", 0),
321
  "average_confidence": round(stats.get("average_confidence", 0), 3)
322
  }
323
+
324
  # Add class statistics in a more organized way
325
  if "class_statistics" in stats and stats["class_statistics"]:
326
  # Sort classes by count (descending)
 
329
  key=lambda x: x[1].get("count", 0),
330
  reverse=True
331
  )
332
+
333
  class_stats = {}
334
  for cls_name, cls_data in sorted_classes:
335
  class_stats[cls_name] = {
336
  "count": cls_data.get("count", 0),
337
  "average_confidence": round(cls_data.get("average_confidence", 0), 3)
338
  }
339
+
340
  display_stats["detected_objects"] = class_stats
341
+
342
  # Simplify spatial metrics
343
  if "spatial_metrics" in stats:
344
  spatial = stats["spatial_metrics"]
345
+
346
  # Simplify spatial distribution
347
  if "spatial_distribution" in spatial:
348
  dist = spatial["spatial_distribution"]
 
354
  "y_std": round(dist.get("y_std", 0), 3)
355
  }
356
  }
357
+
358
  # Add simplified size information
359
  if "size_distribution" in spatial:
360
  size = spatial["size_distribution"]
 
363
  "min_area": round(size.get("min_area", 0), 3),
364
  "max_area": round(size.get("max_area", 0), 3)
365
  }
366
+
367
  return display_stats
368
+
369
  def prepare_visualization_data(self, stats: Dict, available_classes: Dict[int, str]) -> Dict:
370
  """
371
  Prepare data for visualization based on detection statistics
372
+
373
  Args:
374
  stats: Detection statistics
375
  available_classes: Dictionary of available class IDs and names
376
+
377
  Returns:
378
  Visualization data dictionary
379
  """
380
  if not stats or "class_statistics" not in stats or not stats["class_statistics"]:
381
  return {"error": "No detection data available"}
382
+
383
  # Prepare visualization data
384
  viz_data = {
385
  "total_objects": stats.get("total_objects", 0),
386
  "average_confidence": stats.get("average_confidence", 0),
387
  "class_data": []
388
  }
389
+
390
  # Class data
391
  for cls_name, cls_stats in stats.get("class_statistics", {}).items():
392
  # Search class ID
 
395
  if name == cls_name:
396
  class_id = id
397
  break
398
+
399
  cls_data = {
400
  "name": cls_name,
401
  "class_id": class_id,
 
403
  "average_confidence": cls_stats.get("average_confidence", 0),
404
  "color": self.color_mapper.get_color(class_id if class_id >= 0 else cls_name)
405
  }
406
+
407
  viz_data["class_data"].append(cls_data)
408
+
409
  # Descending order
410
  viz_data["class_data"].sort(key=lambda x: x["count"], reverse=True)
411
+
412
  return viz_data
lighting_analyzer.py ADDED
@@ -0,0 +1,811 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import cv2
3
+ from typing import Dict, Any, Optional
4
+
5
+ class LightingAnalyzer:
6
+ """
7
+ 分析圖像的光照條件,提供增強的室內or室外判斷和光照類型分類,並專注於光照分析。
8
+ """
9
+
10
+ def __init__(self, config: Optional[Dict[str, Any]] = None):
11
+ """
12
+ 初始化光照分析器。
13
+
14
+ Args:
15
+ config: 可選的配置字典,用於自定義分析參數
16
+ """
17
+ self.config = config or self._get_default_config()
18
+
19
+ def analyze(self, image):
20
+ """
21
+ 分析圖像的光照條件。
22
+
23
+ 主要分析入口點,計算基本特徵,判斷室內/室外,確定光照條件。
24
+
25
+ Args:
26
+ image: 輸入圖像 (numpy array 或 PIL Image)
27
+
28
+ Returns:
29
+ Dict: 包含光照分析結果的字典
30
+ """
31
+ try:
32
+ # 轉換圖像格式
33
+ if not isinstance(image, np.ndarray):
34
+ image_np = np.array(image)
35
+ else:
36
+ image_np = image.copy()
37
+
38
+ # 確保 RGB 格式
39
+ if image_np.shape[2] == 3 and isinstance(image_np, np.ndarray):
40
+ image_rgb = cv2.cvtColor(image_np, cv2.COLOR_BGR2RGB)
41
+ else:
42
+ image_rgb = image_np
43
+
44
+ # 計算基本特徵
45
+ features = self._compute_basic_features(image_rgb)
46
+
47
+ # 分析室內or室外
48
+ indoor_result = self._analyze_indoor_outdoor(features)
49
+ is_indoor = indoor_result["is_indoor"]
50
+ indoor_probability = indoor_result["indoor_probability"]
51
+
52
+ # 確定光照條件
53
+ lighting_conditions = self._determine_lighting_conditions(features, is_indoor)
54
+
55
+ # 整合結果
56
+ result = {
57
+ "time_of_day": lighting_conditions["time_of_day"],
58
+ "confidence": float(lighting_conditions["confidence"]),
59
+ "is_indoor": is_indoor,
60
+ "indoor_probability": float(indoor_probability),
61
+ "brightness": {
62
+ "average": float(features["avg_brightness"]),
63
+ "std_dev": float(features["brightness_std"]),
64
+ "dark_ratio": float(features["dark_pixel_ratio"])
65
+ },
66
+ "color_info": {
67
+ "blue_ratio": float(features["blue_ratio"]),
68
+ "yellow_orange_ratio": float(features["yellow_orange_ratio"]),
69
+ "gray_ratio": float(features["gray_ratio"]),
70
+ "avg_saturation": float(features["avg_saturation"]),
71
+ "sky_brightness": float(features["sky_brightness"]),
72
+ "color_atmosphere": features["color_atmosphere"],
73
+ "warm_ratio": float(features["warm_ratio"]),
74
+ "cool_ratio": float(features["cool_ratio"])
75
+ }
76
+ }
77
+
78
+ # 添加診斷信息
79
+ if self.config["include_diagnostics"]:
80
+ result["diagnostics"] = {
81
+ "feature_contributions": indoor_result.get("feature_contributions", {}),
82
+ "lighting_diagnostics": lighting_conditions.get("diagnostics", {})
83
+ }
84
+
85
+ return result
86
+
87
+ except Exception as e:
88
+ print(f"Error in lighting analysis: {str(e)}")
89
+ import traceback
90
+ traceback.print_exc()
91
+ return {
92
+ "time_of_day": "unknown",
93
+ "confidence": 0,
94
+ "error": str(e)
95
+ }
96
+
97
    def _compute_basic_features(self, image_rgb):
        """
        Compute the basic lighting features of an image (heavily optimized version).

        Works on a downscaled copy for the expensive gradient passes and on
        the full-resolution HSV channels for the colour statistics.

        Args:
            image_rgb: Image in RGB format (numpy array). Assumes the OpenCV
                8-bit HSV convention after conversion (hue range 0-179).

        Returns:
            Dict: computed feature values (brightness, colour, structure and
            indoor/outdoor cues)
        """
        # Image dimensions
        height, width = image_rgb.shape[:2]

        # Adaptive downscale factor based on image size (larger images get
        # downscaled more aggressively, capped at base_scale + 8)
        base_scale = 4
        scale_factor = base_scale + min(8, max(0, int((height * width) / (1000 * 1000))))

        # Downscaled copy to speed up processing
        # NOTE(review): small_rgb appears unused below — confirm before removing.
        small_rgb = cv2.resize(image_rgb, (width//scale_factor, height//scale_factor))

        # Convert all colour spaces once up front to avoid recomputation
        hsv_img = cv2.cvtColor(image_rgb, cv2.COLOR_RGB2HSV)
        gray_img = cv2.cvtColor(image_rgb, cv2.COLOR_RGB2GRAY)
        small_gray = cv2.resize(gray_img, (width//scale_factor, height//scale_factor))

        # Split HSV channels
        h_channel = hsv_img[:,:,0]
        s_channel = hsv_img[:,:,1]
        v_channel = hsv_img[:,:,2]

        # Basic brightness features
        avg_brightness = np.mean(v_channel)
        brightness_std = np.std(v_channel)
        dark_pixel_ratio = np.sum(v_channel < 50) / (height * width)

        # Colour features: fraction of yellow/orange hues (warm artificial light, sunsets)
        yellow_orange_mask = ((h_channel >= 15) & (h_channel <= 40))
        yellow_orange_ratio = np.sum(yellow_orange_mask) / (height * width)

        # Fraction of blue hues (sky, shade)
        blue_mask = ((h_channel >= 90) & (h_channel <= 130))
        blue_ratio = np.sum(blue_mask) / (height * width)

        # Specifically inspect the top quarter of the image for blue-sky traits
        upper_region_h = h_channel[:height//4, :]
        upper_region_s = s_channel[:height//4, :]
        upper_region_v = v_channel[:height//4, :]

        # Blue sky is usually a bright, saturated blue
        sky_blue_mask = ((upper_region_h >= 90) & (upper_region_h <= 130) &
                        (upper_region_s > 70) & (upper_region_v > 150))
        sky_blue_ratio = np.sum(sky_blue_mask) / max(1, upper_region_h.size)

        # Low-saturation bright pixels read as gray (overcast sky, concrete)
        gray_mask = (s_channel < 50) & (v_channel > 100)
        gray_ratio = np.sum(gray_mask) / (height * width)

        avg_saturation = np.mean(s_channel)

        # Sky brightness: mean value of the upper half of the frame
        upper_half = v_channel[:height//2, :]
        sky_brightness = np.mean(upper_half)

        # Hue analysis: warm vs cool colour ratios
        # NOTE(review): for 8-bit images OpenCV hue is 0-179, so the
        # h >= 300 term can never match and cool 180-270 is unreachable —
        # confirm whether 150/90-135 (OpenCV half-scale) was intended.
        warm_colors = ((h_channel >= 0) & (h_channel <= 60)) | (h_channel >= 300)
        warm_ratio = np.sum(warm_colors) / (height * width)

        cool_colors = (h_channel >= 180) & (h_channel <= 270)
        cool_ratio = np.sum(cool_colors) / (height * width)

        # Decide the overall colour atmosphere
        if warm_ratio > 0.4:
            color_atmosphere = "warm"
        elif cool_ratio > 0.4:
            color_atmosphere = "cool"
        else:
            color_atmosphere = "neutral"

        # Compute gradients only on the downscaled image — major speedup
        gx = cv2.Sobel(small_gray, cv2.CV_32F, 1, 0, ksize=3)
        gy = cv2.Sobel(small_gray, cv2.CV_32F, 0, 1, ksize=3)

        vertical_strength = np.mean(np.abs(gy))
        horizontal_strength = np.mean(np.abs(gx))
        gradient_ratio = vertical_strength / max(horizontal_strength, 1e-5)

        # -- Brightness uniformity --
        brightness_uniformity = 1 - min(1, brightness_std / max(avg_brightness, 1e-5))

        # -- Fast ceiling analysis --
        # Analyze the top region with a more aggressive subsampling rate
        top_scale = scale_factor * 2  # more aggressive downsampling
        top_region = v_channel[:height//4:top_scale, ::top_scale]
        top_region_std = np.std(top_region)
        ceiling_uniformity = 1.0 - min(1.0, top_region_std / max(np.mean(top_region), 1e-5))

        # Simpler detection of horizontal lines near the top of the frame
        top_gradients = np.abs(gy[:small_gray.shape[0]//4, :])
        horizontal_lines_strength = np.mean(top_gradients)
        # Normalize
        horizontal_line_ratio = min(1.0, horizontal_lines_strength / 40)

        # Minimal bright-spot detection on a sparsely sampled value channel
        sampled_v = v_channel[::scale_factor*2, ::scale_factor*2]
        light_threshold = min(220, avg_brightness + 2*brightness_std)
        is_bright = sampled_v > light_threshold
        bright_spot_count = np.sum(is_bright)

        # Simplified stand-in for circular light-source analysis
        circular_light_score = 0
        indoor_light_score = 0
        light_distribution_uniformity = 0.5

        # Only analyze light sources for a modest number of bright spots;
        # many spots are more likely outdoor light reflections
        if 1 < bright_spot_count < 20:
            # Simple statistics of the bright-spot distribution
            bright_y, bright_x = np.where(is_bright)
            if len(bright_y) > 1:
                # Check whether spots cluster together — a common indoor-lighting pattern
                mean_x = np.mean(bright_x)
                mean_y = np.mean(bright_y)
                dist_from_center = np.sqrt((bright_x - mean_x)**2 + (bright_y - mean_y)**2)

                # Concentrated spots suggest a light fixture
                if np.std(dist_from_center) < np.mean(dist_from_center):
                    circular_light_score = min(3, len(bright_y) // 2)
                    light_distribution_uniformity = 0.7

                # Spots in the upper half are typical of indoor ceiling lights
                if np.mean(bright_y) < sampled_v.shape[0] / 2:
                    indoor_light_score = 0.6
                else:
                    indoor_light_score = 0.3

        # Use edge-region gradients to estimate scene boundaries quickly
        # NOTE(review): edge_scale appears unused below — confirm before removing.
        edge_scale = scale_factor * 2

        # Sample only the border regions of the image for analysis
        left_edge = small_gray[:, :small_gray.shape[1]//6]
        right_edge = small_gray[:, 5*small_gray.shape[1]//6:]
        top_edge = small_gray[:small_gray.shape[0]//6, :]

        # Gradient strength for each border region
        left_gradient = np.mean(np.abs(cv2.Sobel(left_edge, cv2.CV_32F, 1, 0, ksize=3)))
        right_gradient = np.mean(np.abs(cv2.Sobel(right_edge, cv2.CV_32F, 1, 0, ksize=3)))
        top_gradient = np.mean(np.abs(cv2.Sobel(top_edge, cv2.CV_32F, 0, 1, ksize=3)))

        # Normalize
        left_edge_density = min(1.0, left_gradient / 50.0)
        right_edge_density = min(1.0, right_gradient / 50.0)
        top_edge_density = min(1.0, top_gradient / 50.0)

        # Enclosed environments usually show stronger gradients at the image borders
        boundary_edge_score = (left_edge_density + right_edge_density + top_edge_density) / 3

        # Simple estimate of overall edge density
        edges_density = min(1.0, (np.mean(np.abs(gx)) + np.mean(np.abs(gy))) / 100.0)

        street_line_score = 0

        # Check the lower half for strong vertical lines (street markings, poles)
        bottom_half = small_gray[small_gray.shape[0]//2:, :]
        bottom_vert_gradient = cv2.Sobel(bottom_half, cv2.CV_32F, 0, 1, ksize=3)
        strong_vert_lines = np.abs(bottom_vert_gradient) > 50
        if np.sum(strong_vert_lines) > (bottom_half.size * 0.05):  # more than 5% of pixels are strong vertical lines
            street_line_score = 0.7

        # Assemble all features
        features = {
            # Basic brightness and colour features
            "avg_brightness": avg_brightness,
            "brightness_std": brightness_std,
            "dark_pixel_ratio": dark_pixel_ratio,
            "yellow_orange_ratio": yellow_orange_ratio,
            "blue_ratio": blue_ratio,
            "sky_blue_ratio": sky_blue_ratio,
            "gray_ratio": gray_ratio,
            "avg_saturation": avg_saturation,
            "sky_brightness": sky_brightness,
            "color_atmosphere": color_atmosphere,
            "warm_ratio": warm_ratio,
            "cool_ratio": cool_ratio,

            # Structural features
            "gradient_ratio": gradient_ratio,
            "brightness_uniformity": brightness_uniformity,
            "bright_spot_count": bright_spot_count,
            "vertical_strength": vertical_strength,
            "horizontal_strength": horizontal_strength,

            # Indoor/outdoor decision features
            "ceiling_uniformity": ceiling_uniformity,
            "horizontal_line_ratio": horizontal_line_ratio,
            "indoor_light_score": indoor_light_score,
            "circular_light_count": circular_light_score,
            "light_distribution_uniformity": light_distribution_uniformity,
            "boundary_edge_score": boundary_edge_score,
            "top_region_std": top_region_std,
            "edges_density": edges_density,

            # New: outdoor-specific feature
            "street_line_score": street_line_score
        }

        return features
300
+
301
def _analyze_indoor_outdoor(self, features):
    """
    Classify the scene as indoor or outdoor by fusing multiple visual features.

    An additive evidence score is accumulated (positive evidence pushes toward
    "indoor", negative toward "outdoor") and squashed through a sigmoid to
    obtain a probability.

    Args:
        features: Feature dictionary produced by the feature-extraction step
            (brightness/color statistics plus structural cues such as
            ceiling uniformity, boundary edges and bright-spot counts).

    Returns:
        Dict with keys:
            is_indoor (bool): final decision (indoor probability > 0.5).
            indoor_probability (float): sigmoid of the evidence score.
            indoor_score (float): raw additive evidence score.
            feature_contributions (dict): per-feature score contributions.
            diagnostics (dict): extra diagnostic flags (e.g. street pattern).
    """
    # Per-feature weights from the analyzer configuration.
    weights = self.config["indoor_outdoor_weights"]

    # Start neutral; each feature below adds signed evidence.
    indoor_score = 0
    feature_contributions = {}
    diagnostics = {}

    # 1. Blue area (sky) — a large blue fraction usually indicates outdoor.
    if features.get("blue_ratio", 0) > 0.2:
        # Strong indoor cues (ceiling, closed boundary, indoor lights) soften
        # the outdoor penalty: the blue may be walls/screens rather than sky.
        if (features.get("ceiling_uniformity", 0) > 0.5 or
            features.get("boundary_edge_score", 0) > 0.3 or
            features.get("indoor_light_score", 0) > 0.2 or
            features.get("bright_spot_count", 0) > 0):
            blue_score = -weights["blue_ratio"] * features["blue_ratio"] * 8
        else:
            blue_score = -weights["blue_ratio"] * features["blue_ratio"] * 15
    else:
        blue_score = -weights["blue_ratio"] * features["blue_ratio"] * 15

    indoor_score += blue_score
    feature_contributions["blue_ratio"] = blue_score
    # NOTE: an earlier revision recomputed `blue_score` a second time further
    # down (a `blue_ratio > 0.7` branch) without ever adding it to the score;
    # that dead code has been removed — behavior is unchanged.

    # Viewpoint check — blue sky in the upper region with a brighter top half
    # suggests an outdoor scene looking up at buildings.
    if (features.get("sky_blue_ratio", 0) > 0.01 and
        features["sky_brightness"] > features["avg_brightness"] * 1.1):
        viewpoint_outdoor_score = -1.8  # strong outdoor indicator
        indoor_score += viewpoint_outdoor_score
        feature_contributions["outdoor_viewpoint"] = viewpoint_outdoor_score

    # 2. Brightness uniformity — indoor lighting is usually more even.
    uniformity_score = weights["brightness_uniformity"] * features["brightness_uniformity"]
    indoor_score += uniformity_score
    feature_contributions["brightness_uniformity"] = uniformity_score

    # 3. Ceiling features — a uniform top region is a strong indoor cue.
    ceiling_contribution = 0
    if "ceiling_uniformity" in features:
        ceiling_uniformity = features["ceiling_uniformity"]
        horizontal_line_ratio = features.get("horizontal_line_ratio", 0)

        if ceiling_uniformity > 0.5:
            ceiling_weight = 3
            ceiling_contribution = weights.get("ceiling_features", 1.5) * ceiling_weight
            if horizontal_line_ratio > 0.2:  # horizontal lines reinforce a ceiling
                ceiling_contribution *= 1.5
        elif ceiling_uniformity > 0.4:
            ceiling_contribution = weights.get("ceiling_features", 1.5) * 1.2

    indoor_score += ceiling_contribution
    feature_contributions["ceiling_features"] = ceiling_contribution

    # 4. Light-fixture detection — even one circular light source (e.g. a
    # pendant lamp) is a strong indoor signal.
    light_contribution = 0
    if "indoor_light_score" in features:
        indoor_light_score = features["indoor_light_score"]
        circular_light_count = features.get("circular_light_count", 0)

        if circular_light_count >= 1:
            light_contribution = weights.get("light_features", 1.2) * 2.0
        elif indoor_light_score > 0.3:
            light_contribution = weights.get("light_features", 1.2) * 1.0

    indoor_score += light_contribution
    feature_contributions["light_features"] = light_contribution

    # 5. Enclosure — strong gradients along the image borders suggest a
    # closed (indoor) environment.
    boundary_contribution = 0
    if "boundary_edge_score" in features:
        boundary_edge_score = features["boundary_edge_score"]
        if boundary_edge_score > 0.3:
            boundary_contribution = weights.get("boundary_features", 1.2) * 2
        elif boundary_edge_score > 0.2:
            boundary_contribution = weights.get("boundary_features", 1.2) * 1.2

    indoor_score += boundary_contribution
    feature_contributions["boundary_features"] = boundary_contribution

    # Commercial-street pattern: dense edges + many bright spots + dominant
    # vertical structure points outdoors.
    if (features.get("edges_density", 0) > 0.2 and
        features.get("bright_spot_count", 0) > 5 and
        features.get("vertical_strength", 0) > features.get("horizontal_strength", 0) * 1.5):
        street_feature_score = -weights.get("street_features", 1.2) * 1.5
        indoor_score += street_feature_score
        feature_contributions["street_features"] = street_feature_score

    # Dedicated detector for Asian commercial streets (signage, lanterns):
    # very dense edges, strongly vertical structure, uneven brightness.
    if (features.get("edges_density", 0) > 0.25 and
        features.get("vertical_strength", 0) > features.get("horizontal_strength", 0) * 1.8 and
        features.get("brightness_uniformity", 0) < 0.6):
        asian_street_score = -2.2  # very strong outdoor indicator
        indoor_score += asian_street_score
        feature_contributions["asian_commercial_street"] = asian_street_score

    # 6. Vertical/horizontal gradient ratio — only meaningful combined with
    # uniformity (uniform + vertical => walls; non-uniform => outdoor facades).
    gradient_contribution = 0
    if features["gradient_ratio"] > 2.0:
        combined_uniformity = (features["brightness_uniformity"] +
                               features.get("ceiling_uniformity", 0)) / 2
        if combined_uniformity > 0.5:
            gradient_contribution = weights["gradient_ratio"] * 0.7
        else:
            gradient_contribution = -weights["gradient_ratio"] * 0.3

    indoor_score += gradient_contribution
    feature_contributions["gradient_ratio"] = gradient_contribution

    # 7. Bright-spot (light source) analysis.
    bright_spot_contribution = 0
    bright_spot_count = features["bright_spot_count"]
    circular_light_count = features.get("circular_light_count", 0)

    if circular_light_count >= 1:  # even a single circular source => indoor
        bright_spot_contribution = weights["bright_spots"] * 1.5
    elif bright_spot_count < 5:  # few bright spots leans mildly indoor
        bright_spot_contribution = weights["bright_spots"] * 0.5
    elif bright_spot_count > 15:  # many bright spots more likely outdoor
        bright_spot_contribution = -weights["bright_spots"] * 0.4

    indoor_score += bright_spot_contribution
    feature_contributions["bright_spots"] = bright_spot_contribution

    # 8. Color-tone analysis — dim, warm yellow/orange light leans indoor.
    yellow_contribution = 0
    if features["avg_brightness"] < 150 and features["yellow_orange_ratio"] > 0.15:
        if features.get("indoor_light_score", 0) > 0.2:
            yellow_contribution = weights["color_tone"] * 0.8
        else:
            yellow_contribution = weights["color_tone"] * 0.5

    indoor_score += yellow_contribution
    feature_contributions["yellow_tone"] = yellow_contribution

    # 9. Top/bottom brightness contrast — a much brighter upper half suggests
    # sky, especially when paired with a meaningful blue fraction.
    sky_contribution = 0
    if features["sky_brightness"] > features["avg_brightness"] * 1.3:
        if features["blue_ratio"] > 0.15:
            sky_contribution = -weights["sky_brightness"] * 0.9
        else:
            sky_contribution = -weights["sky_brightness"] * 0.6

    indoor_score += sky_contribution
    feature_contributions["sky_brightness"] = sky_contribution

    # Dining-room heuristic: a centrally hung fixture implies a ceiling,
    # which implies indoor.
    dining_feature_contribution = 0
    if circular_light_count >= 1 and features.get("light_distribution_uniformity", 0) > 0.4:
        dining_feature_contribution = 1.5
        indoor_score += dining_feature_contribution
        feature_contributions["dining_features"] = dining_feature_contribution

    # 10. Enhanced blue-sky detection — even a small patch of bright, upper
    # blue area is strong outdoor evidence.
    if "sky_blue_ratio" in features:
        if features["sky_blue_ratio"] > 0.01 and features["sky_brightness"] > features.get("avg_brightness", 0) * 1.2:
            sky_outdoor_score = -2.5 * features["sky_blue_ratio"] * weights.get("blue_ratio", 1.2)
            indoor_score += sky_outdoor_score
            feature_contributions["sky_blue_detection"] = sky_outdoor_score

    # Asian-street indicator tally (0-5); three or more flips the balance.
    asian_street_indicators = 0

    # (1) strongly vertical structure
    vertical_ratio = features.get("vertical_strength", 0) / max(features.get("horizontal_strength", 1e-5), 1e-5)
    if vertical_ratio > 1.8:
        asian_street_indicators += 1

    # (2) dense edges plus road-marking features (worth two points)
    if features.get("edges_density", 0) > 0.25 and features.get("street_line_score", 0) > 0.2:
        asian_street_indicators += 2

    # (3) many bright spots with uneven brightness
    if features.get("bright_spot_count", 0) > 5 and features.get("brightness_uniformity", 0) < 0.6:
        asian_street_indicators += 1

    # (4) little visible sky (occluded by tall buildings) yet a bright top
    if features.get("blue_ratio", 0) < 0.1 and features.get("sky_brightness", 0) > features.get("avg_brightness", 0) * 1.1:
        asian_street_indicators += 1

    if asian_street_indicators >= 3:
        # Record the detected pattern and tilt strongly toward outdoor.
        feature_contributions["asian_street_pattern"] = -2.5
        indoor_score += -2.5

        # Discount indoor cues that street scenes commonly fake.
        if "boundary_features" in feature_contributions:
            adjusted_contribution = feature_contributions["boundary_features"] * 0.4
            indoor_score -= (feature_contributions["boundary_features"] - adjusted_contribution)
            feature_contributions["boundary_features"] = adjusted_contribution

        if "ceiling_features" in feature_contributions:
            adjusted_contribution = feature_contributions["ceiling_features"] * 0.3
            indoor_score -= (feature_contributions["ceiling_features"] - adjusted_contribution)
            feature_contributions["ceiling_features"] = adjusted_contribution

        diagnostics["asian_street_detected"] = True
        diagnostics["asian_street_indicators"] = asian_street_indicators

    # Home-environment indicator tally (weighted, bedroom/living-room cues).
    bedroom_indicators = 0

    # (1) right angles formed by windows and walls
    if features.get("brightness_uniformity", 0) > 0.6 and features.get("boundary_edge_score", 0) > 0.3:
        bedroom_indicators += 1.5

    # (2) ceiling plus at least one light source
    if features.get("ceiling_uniformity", 0) > 0.5 and features.get("bright_spot_count", 0) > 0:
        bedroom_indicators += 2.5

    # (3) low-saturation, evenly lit walls (bedrooms and living rooms)
    if features.get("brightness_uniformity", 0) > 0.6 and features.get("avg_saturation", 0) < 100:
        bedroom_indicators += 1.5

    # (4) window detection: border edges plus high brightness variance
    if features.get("boundary_edge_score", 0) > 0.25 and features.get("brightness_std", 0) > 40:
        bedroom_indicators += 1.5

    # Enough home cues raise the indoor score (strongly at >=3, mildly at >=2).
    if bedroom_indicators >= 3:
        home_env_score = 3
        indoor_score += home_env_score
        feature_contributions["home_environment_pattern"] = home_env_score
    elif bedroom_indicators >= 2:
        home_env_score = 2
        indoor_score += home_env_score
        feature_contributions["home_environment_pattern"] = home_env_score

    # Convert the raw score to a probability with a sigmoid (0.22 temperature).
    indoor_probability = 1 / (1 + np.exp(-indoor_score * 0.22))

    is_indoor = indoor_probability > 0.5

    return {
        "is_indoor": is_indoor,
        "indoor_probability": indoor_probability,
        "indoor_score": indoor_score,
        "feature_contributions": feature_contributions,
        "diagnostics": diagnostics
    }
575
+
576
def _determine_lighting_conditions(self, features, is_indoor):
    """
    Determine the lighting condition label from features and the
    indoor/outdoor decision.

    Args:
        features: Feature dictionary (brightness/color statistics and
            structural cues).
        is_indoor: Whether the scene was classified as indoor.

    Returns:
        Dict with keys:
            time_of_day (str): lighting-condition label (e.g. "day_clear",
                "indoor_bright", "neon_night").
            confidence (float): clamped to [0.5, 0.95].
            diagnostics (dict): human-readable reason / special-case notes.
    """
    # Defaults used when no rule fires.
    time_of_day = "unknown"
    confidence = 0.5
    diagnostics = {}

    avg_brightness = features["avg_brightness"]
    dark_pixel_ratio = features["dark_pixel_ratio"]
    yellow_orange_ratio = features["yellow_orange_ratio"]
    blue_ratio = features["blue_ratio"]
    gray_ratio = features["gray_ratio"]

    if is_indoor:
        # --- Residential natural-light indicator (0-4) ---
        natural_window_light = 0

        # Window-like cue: blue area with a brighter upper region.
        if (features.get("blue_ratio", 0) > 0.1 and
            features.get("sky_brightness", 0) > avg_brightness * 1.1):
            natural_window_light += 1

        # Soft, even light distribution.
        if (features.get("brightness_uniformity", 0) > 0.65 and
            features.get("brightness_std", 0) < 70):
            natural_window_light += 1

        # Warm color proportion.
        if features.get("warm_ratio", 0) > 0.2:
            natural_window_light += 1

        # Home-environment indicator.
        # NOTE(review): "home_environment_pattern" is written into the
        # feature-contribution map by the indoor/outdoor analyzer, not into
        # `features`; unless the caller merges it back, this lookup is always
        # 0 — confirm against the calling code.
        home_env_score = features.get("home_environment_pattern", 0)
        if home_env_score > 1.5:
            natural_window_light += 1

        # 1. Bright indoor scene, possibly with natural window light.
        if avg_brightness > 130:
            if natural_window_light >= 2 and home_env_score > 1.5:
                time_of_day = "indoor_residential_natural"  # home with daylight
                confidence = 0.8
                diagnostics["reason"] = "Bright residential space with natural window lighting"
            # Bright bluish window region.
            elif features.get("blue_ratio", 0) > 0.1 and features.get("sky_brightness", 0) > 150:
                time_of_day = "indoor_bright"
                confidence = 0.8
                diagnostics["reason"] = "Bright indoor scene with window light"
            else:
                time_of_day = "indoor_bright"
                confidence = 0.75
                diagnostics["reason"] = "High brightness in indoor environment"
        # 2. Moderately lit indoor scene.
        elif avg_brightness > 100:
            time_of_day = "indoor_moderate"
            confidence = 0.7
            diagnostics["reason"] = "Moderate brightness in indoor environment"
        # 3. Low-light indoor scene.
        else:
            time_of_day = "indoor_dim"
            confidence = 0.65 + dark_pixel_ratio / 3
            diagnostics["reason"] = "Low brightness in indoor environment"

        # --- Special indoor cases (may override the classification above) ---

        # 1. Designer residential indicator (0-4).
        designer_residential_score = 0
        # Decorative fixtures.
        if (features.get("circular_light_count", 0) > 0 or features.get("bright_spot_count", 0) > 2):
            designer_residential_score += 1
        # High-quality, uniform illumination.
        if features.get("brightness_uniformity", 0) > 0.7:
            designer_residential_score += 1
        # Warm tones.
        if features.get("warm_ratio", 0) > 0.3:
            designer_residential_score += 1
        # Home-environment cues.
        if home_env_score > 1.5:
            designer_residential_score += 1

        if designer_residential_score >= 3 and home_env_score > 1.5:
            time_of_day = "indoor_designer_residential"
            confidence = 0.85
            diagnostics["special_case"] = "Designer residential lighting with decorative elements"

        # 2. Restaurant/bar: dim, warm yellow-orange lighting. If the inner
        # warmth check fails, the earlier classification is kept unchanged.
        elif avg_brightness < 150 and yellow_orange_ratio > 0.2:
            if features["warm_ratio"] > 0.4:
                time_of_day = "indoor_restaurant"
                confidence = 0.65 + yellow_orange_ratio / 4
                diagnostics["special_case"] = "Warm, yellow-orange lighting suggests restaurant/bar setting"

        # 3. Commercial space: bright with multiple structured light sources.
        elif avg_brightness > 120 and features["bright_spot_count"] > 4:
            commercial_score = 0
            # Many bright spots.
            commercial_score += min(1.0, features["bright_spot_count"] * 0.05)
            # Not likely a residence.
            if features.get("home_environment_pattern", 0) < 1.5:
                commercial_score += 0.5
            # Structured, even lighting layout.
            if features.get("light_distribution_uniformity", 0) > 0.6:
                commercial_score += 0.5

            if commercial_score > 0.6 and designer_residential_score < 3:
                time_of_day = "indoor_commercial"
                confidence = 0.7 + commercial_score / 5
                diagnostics["special_case"] = "Multiple structured light sources suggest commercial lighting"
    else:
        # --- Outdoor classification ---
        if avg_brightness < 90:  # lowered brightness threshold for night
            # Car lights / street lamps present?
            has_lights = features["bright_spot_count"] > 3

            if has_lights:
                time_of_day = "night"
                confidence = 0.8 + dark_pixel_ratio / 5
                diagnostics["reason"] = "Low brightness with light sources detected"

                # Neon scene: warm-toned and many bright spots.
                if yellow_orange_ratio > 0.15 and features["bright_spot_count"] > 5:
                    time_of_day = "neon_night"
                    confidence = 0.75 + yellow_orange_ratio / 3
                    diagnostics["special_case"] = "Multiple colorful light sources suggest neon lighting"
            else:
                time_of_day = "night"
                confidence = 0.7 + dark_pixel_ratio / 3
                diagnostics["reason"] = "Low brightness outdoor scene"
        elif avg_brightness < 130 and yellow_orange_ratio > 0.2:
            time_of_day = "sunset/sunrise"
            confidence = 0.7 + yellow_orange_ratio / 3
            diagnostics["reason"] = "Moderate brightness with yellow-orange tones"
        elif avg_brightness > 150 and blue_ratio > 0.15:
            time_of_day = "day_clear"
            confidence = 0.7 + blue_ratio / 3
            diagnostics["reason"] = "High brightness with blue tones (likely sky)"
        elif avg_brightness > 130:
            time_of_day = "day_cloudy"
            confidence = 0.7 + gray_ratio / 3
            diagnostics["reason"] = "Good brightness with higher gray tones"
        else:
            # Fallback: decide by dominant tone.
            if yellow_orange_ratio > gray_ratio:
                time_of_day = "sunset/sunrise"
                confidence = 0.6 + yellow_orange_ratio / 3
                diagnostics["reason"] = "Yellow-orange tones dominant"
            else:
                time_of_day = "day_cloudy"
                confidence = 0.6 + gray_ratio / 3
                diagnostics["reason"] = "Gray tones dominant"

        # Special outdoor case: bright, very uniform light => stadium lighting.
        if avg_brightness > 120 and features["brightness_uniformity"] > 0.8:
            time_of_day = "stadium_lighting"
            confidence = 0.7
            diagnostics["special_case"] = "Uniform bright lighting suggests stadium/sports lighting"

        # Mixed lighting: indoor/outdoor transition area.
        if 100 < avg_brightness < 150 and 0.1 < blue_ratio < 0.2:
            if features["gradient_ratio"] > 1.5:
                time_of_day = "mixed_lighting"
                confidence = 0.65
                diagnostics["special_case"] = "Features suggest indoor-outdoor transition area"

    # Clamp confidence to a sane range.
    confidence = min(0.95, max(0.5, confidence))

    # NOTE: an earlier revision built local `lightingType` description dicts
    # for the two residential types here, but never used or returned them;
    # that dead code was removed — behavior is unchanged. Display strings for
    # lighting conditions belong in the LIGHTING_CONDITIONS data module.

    return {
        "time_of_day": time_of_day,
        "confidence": confidence,
        "diagnostics": diagnostics
    }
789
+
790
+
791
def _get_default_config(self):
    """
    Return the default configuration for the lighting analyzer.

    Returns:
        Dict with:
            indoor_outdoor_weights (dict): per-feature weights used by the
                indoor/outdoor evidence fusion.
            include_diagnostics (bool): whether analysis results carry the
                diagnostics payload.
    """
    # Evidence weights for the indoor/outdoor classifier.
    indoor_outdoor_weights = {
        "blue_ratio": 0.6,
        "brightness_uniformity": 1.2,
        "gradient_ratio": 0.7,
        "bright_spots": 0.8,
        "color_tone": 0.5,
        "sky_brightness": 0.9,
        "brightness_variation": 0.7,
        "ceiling_features": 1.5,
        "light_features": 1.1,
        "boundary_features": 2.8,
        "street_features": 2.0,
        "building_features": 1.6,
    }
    return {
        "indoor_outdoor_weights": indoor_outdoor_weights,
        "include_diagnostics": True,
    }
lighting_conditions.py ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
# Descriptions for specialty lighting conditions. These previously sat at the
# TOP level of LIGHTING_CONDITIONS (outside "time_descriptions"), so lookups
# of the form LIGHTING_CONDITIONS["time_descriptions"][time_of_day] could not
# find them even though the analyzer emits these labels. They are now
# registered under "time_descriptions" as well; the legacy top-level copies
# are preserved for backward compatibility.
_SPECIAL_TIME_DESCRIPTIONS = {
    "indoor_commercial": {
        "general": "The scene is captured inside a commercial setting with retail-optimized lighting.",
        "bright": "The space is brightly illuminated with commercial display lighting to highlight merchandise.",
        "medium": "The commercial interior has standard retail lighting that balances visibility and ambiance.",
        "dim": "The commercial space has subdued lighting creating an upscale or intimate shopping atmosphere."
    },
    "indoor_restaurant": {
        "general": "The scene is captured inside a restaurant with characteristic dining lighting.",
        "bright": "The restaurant is well-lit with clear illumination emphasizing food presentation.",
        "medium": "The dining space has moderate lighting striking a balance between functionality and ambiance.",
        "dim": "The restaurant features soft, low lighting creating an intimate dining atmosphere."
    },
    "neon_night": {
        "general": "The scene is captured at night with colorful neon lighting typical of entertainment districts.",
        "bright": "The night scene is illuminated by vibrant neon signs creating a lively, colorful atmosphere.",
        "medium": "The evening setting features moderate neon lighting creating a characteristic urban nightlife scene.",
        "dim": "The night area has subtle neon accents against the darkness, creating a moody urban atmosphere."
    },
    "stadium_lighting": {
        "general": "The scene is captured under powerful stadium lights designed for spectator events.",
        "bright": "The venue is intensely illuminated by stadium floodlights creating daylight-like conditions.",
        "medium": "The sports facility has standard event lighting providing clear visibility across the venue.",
        "dim": "The stadium has reduced illumination typical of pre-event or post-event conditions."
    },
    "mixed_lighting": {
        "general": "The scene features a mix of indoor and outdoor lighting creating transitional illumination.",
        "bright": "The space blends bright natural and artificial light sources across indoor-outdoor boundaries.",
        "medium": "The area combines moderate indoor lighting with outdoor illumination in a balanced way.",
        "dim": "The transition space features subtle lighting gradients between indoor and outdoor zones."
    }
}

# Lighting-condition display data: per-condition scene descriptions (keyed by
# brightness level "general"/"bright"/"medium"/"dim"), short template
# modifiers, and activity adjectives.
LIGHTING_CONDITIONS = {
    "time_descriptions": {
        "day_clear": {
            "general": "The scene is captured during clear daylight hours with bright natural lighting.",
            "bright": "The scene is brightly lit with strong, clear daylight.",
            "medium": "The scene is illuminated with moderate daylight under clear conditions.",
            "dim": "The scene is captured in soft daylight on a clear day."
        },
        "day_cloudy": {
            "general": "The scene is captured during daytime under overcast conditions.",
            "bright": "The scene has the diffused bright lighting of an overcast day.",
            "medium": "The scene has even, soft lighting typical of a cloudy day.",
            "dim": "The scene has the muted lighting of a heavily overcast day."
        },
        "sunset/sunrise": {
            "general": "The scene is captured during golden hour with warm lighting.",
            "bright": "The scene is illuminated with bright golden hour light with long shadows.",
            "medium": "The scene has the warm orange-yellow glow typical of sunset or sunrise.",
            "dim": "The scene has soft, warm lighting characteristic of early sunrise or late sunset."
        },
        "night": {
            "general": "The scene is captured at night with limited natural lighting.",
            "bright": "The scene is captured at night but well-lit with artificial lighting.",
            "medium": "The scene is captured at night with moderate artificial lighting.",
            "dim": "The scene is captured in low-light night conditions with minimal illumination."
        },
        "indoor_bright": {
            "general": "The scene is captured indoors with ample lighting.",
            "bright": "The indoor space is brightly lit, possibly with natural light from windows.",
            "medium": "The indoor space has good lighting conditions.",
            "dim": "The indoor space has adequate lighting."
        },
        "indoor_moderate": {
            "general": "The scene is captured indoors with moderate lighting.",
            "bright": "The indoor space has comfortable, moderate lighting.",
            "medium": "The indoor space has standard interior lighting.",
            "dim": "The indoor space has somewhat subdued lighting."
        },
        "indoor_dim": {
            "general": "The scene is captured indoors with dim or mood lighting.",
            "bright": "The indoor space has dim but sufficient lighting.",
            "medium": "The indoor space has low, atmospheric lighting.",
            "dim": "The indoor space has very dim, possibly mood-oriented lighting."
        },
        # NOTE(review): the three keys below do not match the corresponding
        # "template_modifiers" keys ("beach_lighting", "sports_venue_lighting",
        # "professional_kitchen_lighting") — confirm which spelling the
        # consuming code uses before unifying.
        "beach_daylight": {
            "general": "The scene is captured during daytime at a beach with bright natural sunlight.",
            "bright": "The beach scene is intensely illuminated by direct sunlight.",
            "medium": "The coastal area has even natural daylight.",
            "dim": "The beach has softer lighting, possibly from a partially cloudy sky."
        },
        "sports_arena": {
            "general": "The scene is captured in a sports venue with specialized arena lighting.",
            "bright": "The sports facility is brightly illuminated with powerful overhead lights.",
            "medium": "The venue has standard sports event lighting providing clear visibility.",
            "dim": "The sports area has reduced illumination, possibly before or after an event."
        },
        "kitchen_working": {
            "general": "The scene is captured in a professional kitchen with task-oriented lighting.",
            "bright": "The kitchen is intensely illuminated with clear, functional lighting.",
            "medium": "The culinary space has standard working lights focused on preparation areas.",
            "dim": "The kitchen has reduced lighting, possibly during off-peak hours."
        },
        # Residential types produced by the lighting analyzer.
        "indoor_residential_natural": {
            "general": "The scene is captured in a residential space with ample natural light from windows.",
            "bright": "The residential space is brightly lit with natural daylight streaming through windows.",
            "medium": "The home environment has good natural lighting providing a warm, inviting atmosphere.",
            "dim": "The living space has soft natural light filtering through windows or openings."
        },
        "indoor_designer_residential": {
            "general": "The scene is captured in a residential space with carefully designed lighting elements.",
            "bright": "The home features professionally designed lighting with decorative fixtures creating a bright atmosphere.",
            "medium": "The residential interior showcases curated lighting design balancing form and function.",
            "dim": "The living space has thoughtfully placed designer lighting creating an intimate ambiance."
        },
        "unknown": {
            "general": "The lighting conditions in this scene are not easily determined."
        },
        # Specialty conditions shared with the legacy top-level entries.
        **_SPECIAL_TIME_DESCRIPTIONS
    },
    "template_modifiers": {
        "day_clear": "brightly-lit",
        "day_cloudy": "softly-lit",
        "sunset/sunrise": "warmly-lit",
        "night": "night-time",
        "indoor_bright": "well-lit indoor",
        "indoor_moderate": "indoor",
        "indoor_dim": "dimly-lit indoor",
        "indoor_commercial": "retail-lit",
        "indoor_restaurant": "atmospherically-lit",
        "indoor_residential_natural": "naturally-lit residential",
        "indoor_designer_residential": "designer-lit residential",
        "neon_night": "neon-illuminated",
        "stadium_lighting": "flood-lit",
        "mixed_lighting": "transitionally-lit",
        "beach_lighting": "sun-drenched",
        "sports_venue_lighting": "arena-lit",
        "professional_kitchen_lighting": "kitchen-task lit",
        "unknown": ""
    },
    "activity_modifiers": {
        "day_clear": ["active", "lively", "busy"],
        "day_cloudy": ["calm", "relaxed", "casual"],
        "sunset/sunrise": ["peaceful", "transitional", "atmospheric"],
        "night": ["quiet", "subdued", "nocturnal"],
        "indoor_bright": ["focused", "productive", "engaged"],
        "indoor_moderate": ["comfortable", "social", "casual"],
        "indoor_dim": ["intimate", "relaxed", "private"],
        "indoor_commercial": ["shopping", "browsing", "consumer-oriented"],
        "indoor_restaurant": ["dining", "social", "culinary"],
        "neon_night": ["vibrant", "energetic", "night-life"],
        "stadium_lighting": ["event-focused", "spectator-oriented", "performance-based"],
        "mixed_lighting": ["transitional", "adaptable", "variable"],
        "unknown": []
    },
    # Legacy top-level copies of the specialty descriptions (kept for
    # backward compatibility; prefer "time_descriptions").
    **_SPECIAL_TIME_DESCRIPTIONS
}
object_categories.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
# Mapping from semantic category name to the detector class IDs it covers.
# Contiguous ID runs are expressed as ranges for readability.
# NOTE(review): the integers look like 0-based COCO-80 class indices
# (e.g. 56 = chair, 62 = tv, 39 = bottle, 1 = bicycle) — confirm against the
# detection model's label map before relying on specific IDs.
OBJECT_CATEGORIES = {
    "furniture": list(range(56, 62)),       # IDs 56-61
    "electronics": list(range(62, 71)),     # IDs 62-70
    "kitchen_items": list(range(39, 46)),   # IDs 39-45
    "food": list(range(46, 56)),            # IDs 46-55
    "vehicles": list(range(1, 9)),          # IDs 1-8
    "personal_items": [24, 25, 26, 27, 28, 73, 78, 79],  # non-contiguous IDs
}
object_template_fillers.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ OBJECT_TEMPLATE_FILLERS = {
3
+ "furniture": ["designer chairs", "wooden dining table", "stylish seating", "upholstered armchairs", "elegant dining furniture"],
4
+ "design_elements": ["art pieces", "decorative wreaths", "statement lighting", "seasonal decorations", "sophisticated decor"],
5
+ "lighting": ["pendant lights", "decorative fixtures", "geometric lighting", "modern chandeliers", "ambient illumination"],
6
+ "table_setup": ["elegantly set table", "tabletop decorations", "seasonal centerpieces", "formal place settings", "floral arrangements"],
7
+ "seating": ["upholstered chairs", "accent armchairs", "mixed seating styles", "designer dining chairs", "comfortable dining seats"],
8
+ "table_description": ["solid wood table", "designer dining table", "expansive dining surface", "artisanal table", "statement dining table"],
9
+
10
+ "storefront_features": ["multi-story shops", "illuminated signs", "merchandise displays", "compact storefronts", "vertical retail spaces"],
11
+ "pedestrian_flow": ["people walking", "shoppers", "pedestrians", "locals and tourists", "urban foot traffic"],
12
+ "asian_elements": ["Asian language signage", "decorative lanterns", "local storefronts", "character-based text", "regional design elements"],
13
+ "cultural_elements": ["red lanterns", "local typography", "distinctive architecture", "cultural symbols", "traditional decorations"],
14
+ "signage": ["bright store signs", "multilingual text", "vertical signboards", "neon displays", "electronic advertisements"],
15
+ "street_activities": ["shopping", "commuting", "socializing", "vendor transactions", "urban navigation"],
16
+
17
+ "buildings": ["high-rise office buildings", "corporate towers", "skyscrapers", "financial institutions", "commercial headquarters"],
18
+ "traffic_elements": ["vehicle lights", "trams/street cars", "lane markers", "traffic signals", "urban transit"],
19
+ "skyscrapers": ["glass and steel buildings", "tall structures", "modern architecture", "office towers", "urban high-rises"],
20
+ "road_features": ["wide avenues", "tram tracks", "traffic lanes", "median dividers", "urban throughways"],
21
+ "architectural_elements": ["contemporary buildings", "urban design", "varied architectural styles", "corporate architecture", "city planning features"],
22
+ "city_landmarks": ["distant bridge", "skyline features", "iconic structures", "urban monuments", "signature buildings"],
23
+
24
+ "crossing_pattern": ["zebra crosswalks", "pedestrian walkways", "crosswalk markings", "intersection design", "safety stripes"],
25
+ "pedestrian_density": ["groups of people", "commuters", "diverse pedestrians", "urban crowds", "varying foot traffic"],
26
+ "pedestrian_behavior": ["walking in different directions", "crossing together", "waiting for signals", "navigating intersections", "following traffic rules"],
27
+ "traffic_pattern": ["four-way intersection", "crossroad", "junction", "multi-directional traffic", "regulated crossing"],
28
+ "pedestrian_flow": ["people crossing", "directional movement", "coordinated crossing", "timed pedestrian traffic", "intersection navigation"],
29
+
30
+ "transit_vehicles": ["buses", "trams", "trains", "taxis", "shuttles"],
31
+ "passenger_activity": ["boarding", "waiting", "exiting vehicles", "checking schedules", "navigating stations"],
32
+ "transportation_modes": ["public transit", "private vehicles", "ride services", "light rail", "bus systems"],
33
+ "passenger_needs": ["waiting areas", "information displays", "ticketing services", "transit connections", "seating"],
34
+ "transit_infrastructure": ["stations", "platforms", "boarding areas", "transit lanes", "signaling systems"],
35
+ "passenger_movement": ["transfers", "entrances and exits", "queueing", "platform access", "terminal navigation"],
36
+
37
+ "retail_elements": ["storefronts", "display windows", "shopping bags", "merchandise", "retail signage"],
38
+ "shopping_activity": ["browsing", "carrying purchases", "window shopping", "social shopping", "consumer activities"],
39
+ "store_types": ["boutiques", "brand stores", "local shops", "chain retailers", "specialty stores"],
40
+ "walkway_features": ["pedestrian paths", "shopping promenades", "retail corridors", "commercial walkways", "shopping streets"],
41
+ "commercial_signage": ["brand logos", "sale announcements", "store names", "advertising displays", "digital signage"],
42
+ "consumer_behavior": ["shopping in groups", "individual browsing", "carrying bags", "examining products", "moving between stores"],
43
+
44
+ "beach_equipment": ["beach umbrellas", "surfboards", "beach towels", "sun protection", "recreational equipment"],
45
+ "water_activities": ["water sports", "surfing", "beach recreation", "sun bathing", "coastal leisure"],
46
+ "sports_equipment": ["game balls", "professional equipment", "athletic gear", "sports apparatus", "competition items"],
47
+ "competitive_activities": ["team sports", "athletic contests", "competitive games", "sporting events", "professional matches"],
48
+ "kitchen_equipment": ["professional appliances", "cooking stations", "preparation surfaces", "culinary tools", "industrial equipment"],
49
+ "food_preparation": ["meal production", "culinary operations", "food service preparation", "commercial cooking", "kitchen workflow"],
50
+
51
+ "crossing_pattern": ["grid-like pedestrian crossings", "multi-directional crosswalks", "cross-shaped intersection design", "perpendicular crossing lanes", "zebra-striped crosswalks viewed from above"],
52
+ "pedestrian_pattern": ["scattered distribution of people", "organized flow of pedestrians", "clustered gatherings", "radial movement patterns", "linear procession of individuals"],
53
+ "commercial_layout": ["parallel shopping streets", "interconnected shopping blocks", "radial marketplace design", "grid-like retail arrangement", "meandering commercial pathways"],
54
+ "movement_pattern": ["circular crowd motion", "directional pedestrian flow", "scattered individual movement", "converging foot traffic", "diverging pedestrian patterns"],
55
+
56
+ "stall_elements": ["food vendors with steaming woks", "trinket sellers with colorful displays", "lantern-lit stalls", "bamboo-framed shops", "canvas-covered market stands"],
57
+ "asian_elements": ["hanging red lanterns", "character-based signage", "ornate temple decorations", "traditional paper decorations", "stylized gateway arches"],
58
+ "cultural_lighting": ["paper lantern illumination", "neon character signs", "strung festival lights", "hanging light chains", "colorful shop front lighting"],
59
+ "architectural_elements": ["tiered pagoda roofs", "ornate dragon sculptures", "stone guardian statues", "intricately carved railings", "traditional wooden beams"],
60
+ "cultural_symbols": ["dharma wheels", "lotus motifs", "yin-yang symbols", "zodiac animal representations", "traditional calligraphy"],
61
+ "architectural_style": ["Baroque facades", "Gothic spires", "Renaissance colonnades", "Neoclassical pediments", "Medieval archways"],
62
+ "european_features": ["cobblestone paving", "ornate fountains", "bronze statuary", "wrought iron lampposts", "cafe terraces"],
63
+
64
+ "lighting_effects": ["streetlamp pools of light", "neon sign glow", "illuminated window squares", "headlight streams", "traffic signal flashes"],
65
+ "illuminated_elements": ["lit storefront windows", "glowing traffic signals", "illuminated advertising", "headlight-lit streets", "backlit silhouettes"],
66
+ "neon_elements": ["colorful shop signs", "animated light displays", "illuminated brand logos", "glowing storefront outlines", "digital advertising screens"],
67
+ "illuminated_signage": ["bright LED displays", "glowing brand names", "projected light advertisements", "illuminated menu boards", "digital information screens"],
68
+ "colorful_lighting": ["multi-colored neon", "warm ambient illumination", "cool blue accent lights", "festive string lighting", "dynamic color-changing displays"],
69
+
70
+ "transitional_elements": ["retractable glass walls", "indoor-outdoor bar counters", "terraced seating areas", "threshold planters", "partial canopy coverage"],
71
+ "indoor_features": ["climate-controlled spaces", "soft seating arrangements", "interior decor accents", "mood lighting fixtures", "sound-dampened areas"],
72
+ "outdoor_setting": ["sidewalk tables", "patio seating", "garden furniture", "open-air counters", "courtyard arrangements"],
73
+ "seating_arrangement": ["tiered spectator stands", "premium viewing boxes", "courtside seating", "general admission benches", "stadium chair rows"],
74
+ "playing_surface": ["marked court boundaries", "manicured field turf", "running tracks", "competition equipment", "sports field markers"],
75
+ "construction_equipment": ["tower cranes", "excavators", "cement mixers", "scaffolding structures", "construction barriers"],
76
+ "medical_elements": ["examination furniture", "monitoring equipment", "sanitation stations", "privacy screens", "medical supply carts"],
77
+ "educational_furniture": ["student desks", "lecture podiums", "laboratory benches", "learning stations", "collaborative workspace tables"]
78
+ }
requirements.txt CHANGED
@@ -6,3 +6,4 @@ pillow>=9.4.0
6
  numpy>=1.23.5
7
  matplotlib>=3.7.0
8
  gradio>=3.32.0
 
 
6
  numpy>=1.23.5
7
  matplotlib>=3.7.0
8
  gradio>=3.32.0
9
+ git+https://github.com/openai/CLIP.git
room_02.jpg ADDED

Git LFS Details

  • SHA256: 1171134f1f68356aaa0639c029e1d9f2072452178b3ae714f269f969fb4e587e
  • Pointer size: 132 Bytes
  • Size of remote file: 2.36 MB
safety_templates.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ SAFETY_TEMPLATES = {
2
+ "general": "Pay attention to {safety_element}.",
3
+ "warning": "Be cautious of {hazard} in this environment.",
4
+ "notice": "Note the presence of {element_of_interest}."
5
+ }
scene_analyzer.py ADDED
@@ -0,0 +1,408 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import numpy as np
3
+ from typing import Dict, List, Tuple, Any, Optional
4
+
5
+ from spatial_analyzer import SpatialAnalyzer
6
+ from scene_description import SceneDescriptor
7
+ from enhance_scene_describer import EnhancedSceneDescriber
8
+ from clip_analyzer import CLIPAnalyzer
9
+ from scene_type import SCENE_TYPES
10
+ from object_categories import OBJECT_CATEGORIES
11
+
12
+ class SceneAnalyzer:
13
+ """
14
+ Core class for scene analysis and understanding based on object detection results.
15
+ Analyzes detected objects, their relationships, and infers the scene type.
16
+ """
17
+ def __init__(self, class_names: Dict[int, str] = None):
18
+ """
19
+ Initialize the scene analyzer with optional class name mappings.
20
+
21
+ Args:
22
+ class_names: Dictionary mapping class IDs to class names (optional)
23
+ """
24
+ self.class_names = class_names
25
+
26
+ # 加載場景類型和物體類別
27
+ self.SCENE_TYPES = SCENE_TYPES
28
+ self.OBJECT_CATEGORIES = OBJECT_CATEGORIES
29
+
30
+ # 初始化其他組件,將數據傳遞給 SceneDescriptor
31
+ self.spatial_analyzer = SpatialAnalyzer(class_names=class_names, object_categories=self.OBJECT_CATEGORIES)
32
+ self.descriptor = SceneDescriptor(scene_types=self.SCENE_TYPES, object_categories=self.OBJECT_CATEGORIES)
33
+ self.scene_describer = EnhancedSceneDescriber(scene_types=self.SCENE_TYPES)
34
+
35
+ # 初始化 CLIP 分析器(新增)
36
+ try:
37
+ self.clip_analyzer = CLIPAnalyzer()
38
+ self.use_clip = True
39
+ except Exception as e:
40
+ print(f"Warning: Could not initialize CLIP analyzer: {e}")
41
+ print("Scene analysis will proceed without CLIP. Install CLIP with 'pip install clip' for enhanced scene understanding.")
42
+ self.use_clip = False
43
+
44
+ def generate_scene_description(self,
45
+ scene_type,
46
+ detected_objects,
47
+ confidence,
48
+ lighting_info=None,
49
+ functional_zones=None):
50
+ """
51
+ 生成場景描述。
52
+
53
+ Args:
54
+ scene_type: 識別的場景類型
55
+ detected_objects: 檢測到的物體列表
56
+ confidence: 場景分類置信度
57
+ lighting_info: 照明條件信息(可選)
58
+ functional_zones: 功能區域信息(可選)
59
+
60
+ Returns:
61
+ str: 生成的場景描述
62
+ """
63
+ return self.scene_describer.generate_description(
64
+ scene_type,
65
+ detected_objects,
66
+ confidence,
67
+ lighting_info,
68
+ functional_zones
69
+ )
70
+
71
+ def _generate_scene_description(self, scene_type, detected_objects, confidence, lighting_info=None):
72
+ """
73
+ Use new implement
74
+ """
75
+ # 獲取功能區域信息(如果需要的話)
76
+ functional_zones = self.spatial_analyzer._identify_functional_zones(detected_objects, scene_type)
77
+
78
+ # 使用增強的場景描述生成器
79
+ return self.generate_scene_description(
80
+ scene_type,
81
+ detected_objects,
82
+ confidence,
83
+ lighting_info,
84
+ functional_zones
85
+ )
86
+
87
+ def _define_image_regions(self):
88
+ """Define regions of the image for spatial analysis (3x3 grid)"""
89
+ self.regions = {
90
+ "top_left": (0, 0, 1/3, 1/3),
91
+ "top_center": (1/3, 0, 2/3, 1/3),
92
+ "top_right": (2/3, 0, 1, 1/3),
93
+ "middle_left": (0, 1/3, 1/3, 2/3),
94
+ "middle_center": (1/3, 1/3, 2/3, 2/3),
95
+ "middle_right": (2/3, 1/3, 1, 2/3),
96
+ "bottom_left": (0, 2/3, 1/3, 1),
97
+ "bottom_center": (1/3, 2/3, 2/3, 1),
98
+ "bottom_right": (2/3, 2/3, 1, 1)
99
+ }
100
+
101
+
102
+ def analyze(self, detection_result: Any, lighting_info: Optional[Dict] = None, class_confidence_threshold: float = 0.35, scene_confidence_threshold: float = 0.6) -> Dict:
103
+ """
104
+ Analyze detection results to determine scene type and provide understanding.
105
+
106
+ Args:
107
+ detection_result: Detection result from YOLOv8
108
+ lighting_info: Optional lighting condition analysis results
109
+ class_confidence_threshold: Minimum confidence to consider an object
110
+ scene_confidence_threshold: Minimum confidence to determine a scene
111
+
112
+ Returns:
113
+ Dictionary with scene analysis results
114
+ """
115
+ # If no result or no detections, return empty analysis
116
+ if detection_result is None or len(detection_result.boxes) == 0:
117
+ return {
118
+ "scene_type": "unknown",
119
+ "confidence": 0,
120
+ "description": "No objects detected in the image.",
121
+ "objects_present": [],
122
+ "object_count": 0,
123
+ "regions": {},
124
+ "possible_activities": [],
125
+ "safety_concerns": [],
126
+ "lighting_conditions": lighting_info or {"time_of_day": "unknown", "confidence": 0.0}
127
+ }
128
+
129
+ # Get class names from detection result if not already set
130
+ if self.class_names is None:
131
+ self.class_names = detection_result.names
132
+ # Also update class names in spatial analyzer
133
+ self.spatial_analyzer.class_names = self.class_names
134
+
135
+ # Extract detected objects with confidence above threshold
136
+ detected_objects = self.spatial_analyzer._extract_detected_objects(
137
+ detection_result,
138
+ confidence_threshold=class_confidence_threshold
139
+ )
140
+
141
+ # No objects above confidence threshold
142
+ if not detected_objects:
143
+ return {
144
+ "scene_type": "unknown",
145
+ "confidence": 0.0,
146
+ "description": "No objects with sufficient confidence detected.",
147
+ "objects_present": [],
148
+ "object_count": 0,
149
+ "regions": {},
150
+ "possible_activities": [],
151
+ "safety_concerns": [],
152
+ "lighting_conditions": lighting_info or {"time_of_day": "unknown", "confidence": 0.0}
153
+ }
154
+
155
+ # Analyze object distribution in regions
156
+ region_analysis = self.spatial_analyzer._analyze_regions(detected_objects)
157
+
158
+ # Compute scene type scores based on object detection
159
+ yolo_scene_scores = self._compute_scene_scores(detected_objects)
160
+
161
+ # 使用 CLIP 分析圖像
162
+ clip_scene_scores = {}
163
+ clip_analysis = None
164
+ if self.use_clip:
165
+ try:
166
+ # 獲取原始圖像
167
+ original_image = detection_result.orig_img
168
+
169
+ # Use CLIP analyze image
170
+ clip_analysis = self.clip_analyzer.analyze_image(original_image)
171
+
172
+ # get CLIP's score
173
+ clip_scene_scores = clip_analysis.get("scene_scores", {})
174
+
175
+ if "asian_commercial_street" in clip_scene_scores and clip_scene_scores["asian_commercial_street"] > 0.2:
176
+ # 使用對比提示進一步區分室內/室外
177
+ comparative_results = self.clip_analyzer.calculate_similarity(
178
+ original_image,
179
+ self.clip_analyzer.comparative_prompts["indoor_vs_outdoor"]
180
+ )
181
+
182
+ # 分析對比結果
183
+ indoor_score = sum(s for p, s in comparative_results.items() if "indoor" in p or "enclosed" in p)
184
+ outdoor_score = sum(s for p, s in comparative_results.items() if "outdoor" in p or "open-air" in p)
185
+
186
+ # 如果 CLIP 認為這是室外場景,且光照分析認為是室內
187
+ if outdoor_score > indoor_score and lighting_info and lighting_info.get("is_indoor", False):
188
+ # 修正光照分析結果
189
+ print(f"CLIP indicates outdoor commercial street (score: {outdoor_score:.2f} vs {indoor_score:.2f}), adjusting lighting analysis")
190
+ lighting_info["is_indoor"] = False
191
+ lighting_info["indoor_probability"] = 0.3
192
+ # 把CLIP 分析結果加到光照診斷
193
+ if "diagnostics" not in lighting_info:
194
+ lighting_info["diagnostics"] = {}
195
+ lighting_info["diagnostics"]["clip_override"] = {
196
+ "reason": "CLIP detected outdoor commercial street",
197
+ "outdoor_score": float(outdoor_score),
198
+ "indoor_score": float(indoor_score)
199
+ }
200
+
201
+ # 如果 CLIP 檢測到了光照條件但沒有提供 lighting_info
202
+ if not lighting_info and "lighting_condition" in clip_analysis:
203
+ lighting_type, lighting_conf = clip_analysis["lighting_condition"]
204
+ lighting_info = {
205
+ "time_of_day": lighting_type,
206
+ "confidence": lighting_conf
207
+ }
208
+ except Exception as e:
209
+ print(f"Error in CLIP analysis: {e}")
210
+
211
+ # 融合 YOLO 和 CLIP 的場景分數
212
+ scene_scores = self._fuse_scene_scores(yolo_scene_scores, clip_scene_scores)
213
+
214
+ # Determine best matching scene type
215
+ best_scene, scene_confidence = self._determine_scene_type(scene_scores)
216
+
217
+ # Generate possible activities based on scene
218
+ activities = self.descriptor._infer_possible_activities(best_scene, detected_objects)
219
+
220
+ # Identify potential safety concerns
221
+ safety_concerns = self.descriptor._identify_safety_concerns(detected_objects, best_scene)
222
+
223
+ # Calculate functional zones
224
+ functional_zones = self.spatial_analyzer._identify_functional_zones(detected_objects, best_scene)
225
+
226
+ # Generate scene description
227
+ scene_description = self.generate_scene_description(
228
+ best_scene,
229
+ detected_objects,
230
+ scene_confidence,
231
+ lighting_info=lighting_info,
232
+ functional_zones=functional_zones
233
+ )
234
+
235
+ # Return comprehensive analysis
236
+ result = {
237
+ "scene_type": best_scene if scene_confidence >= scene_confidence_threshold else "unknown",
238
+ "scene_name": self.SCENE_TYPES.get(best_scene, {}).get("name", "Unknown")
239
+ if scene_confidence >= scene_confidence_threshold else "Unknown Scene",
240
+ "confidence": scene_confidence,
241
+ "description": scene_description,
242
+ "objects_present": [
243
+ {"class_id": obj["class_id"],
244
+ "class_name": obj["class_name"],
245
+ "confidence": obj["confidence"]}
246
+ for obj in detected_objects
247
+ ],
248
+ "object_count": len(detected_objects),
249
+ "regions": region_analysis,
250
+ "possible_activities": activities,
251
+ "safety_concerns": safety_concerns,
252
+ "functional_zones": functional_zones,
253
+ "alternative_scenes": self.descriptor._get_alternative_scenes(scene_scores, scene_confidence_threshold, top_k=2),
254
+ "lighting_conditions": lighting_info or {"time_of_day": "unknown", "confidence": 0}
255
+ }
256
+
257
+ # 添加 CLIP 特定的結果(新增)
258
+ if clip_analysis and "error" not in clip_analysis:
259
+ result["clip_analysis"] = {
260
+ "top_scene": clip_analysis.get("top_scene", ("unknown", 0.0)),
261
+ "cultural_analysis": clip_analysis.get("cultural_analysis", {})
262
+ }
263
+
264
+ return result
265
+
266
+ def _compute_scene_scores(self, detected_objects: List[Dict]) -> Dict[str, float]:
267
+ """
268
+ Compute confidence scores for each scene type based on detected objects.
269
+
270
+ Args:
271
+ detected_objects: List of detected objects
272
+
273
+ Returns:
274
+ Dictionary mapping scene types to confidence scores
275
+ """
276
+ scene_scores = {}
277
+ detected_class_ids = [obj["class_id"] for obj in detected_objects]
278
+ detected_classes_set = set(detected_class_ids)
279
+
280
+ # Count occurrence of each class
281
+ class_counts = {}
282
+ for obj in detected_objects:
283
+ class_id = obj["class_id"]
284
+ if class_id not in class_counts:
285
+ class_counts[class_id] = 0
286
+ class_counts[class_id] += 1
287
+
288
+ # Evaluate each scene type
289
+ for scene_type, scene_def in self.SCENE_TYPES.items():
290
+ # Count required objects present
291
+ required_objects = set(scene_def["required_objects"])
292
+ required_present = required_objects.intersection(detected_classes_set)
293
+
294
+ # Count optional objects present
295
+ optional_objects = set(scene_def["optional_objects"])
296
+ optional_present = optional_objects.intersection(detected_classes_set)
297
+
298
+ # Skip if minimum required objects aren't present
299
+ if len(required_present) < scene_def["minimum_required"]:
300
+ scene_scores[scene_type] = 0
301
+ continue
302
+
303
+ # Base score from required objects
304
+ required_ratio = len(required_present) / max(1, len(required_objects))
305
+ required_score = required_ratio * 0.7 # 70% of score from required objects
306
+
307
+ # Additional score from optional objects
308
+ optional_ratio = len(optional_present) / max(1, len(optional_objects))
309
+ optional_score = optional_ratio * 0.3 # 30% of score from optional objects
310
+
311
+ # Bonus for having multiple instances of key objects
312
+ multiple_bonus = 0.0
313
+ for class_id in required_present:
314
+ if class_counts.get(class_id, 0) > 1:
315
+ multiple_bonus += 0.05 # 5% bonus per additional key object type
316
+
317
+ # Cap the bonus at 15%
318
+ multiple_bonus = min(0.15, multiple_bonus)
319
+
320
+ # Calculate final score
321
+ final_score = required_score + optional_score + multiple_bonus
322
+
323
+ if "priority" in scene_def:
324
+ final_score *= scene_def["priority"]
325
+
326
+ # Normalize to 0-1 range
327
+ scene_scores[scene_type] = min(1.0, final_score)
328
+
329
+ return scene_scores
330
+
331
+ def _determine_scene_type(self, scene_scores: Dict[str, float]) -> Tuple[str, float]:
332
+ """
333
+ Determine the most likely scene type based on scores.
334
+
335
+ Args:
336
+ scene_scores: Dictionary mapping scene types to confidence scores
337
+
338
+ Returns:
339
+ Tuple of (best_scene_type, confidence)
340
+ """
341
+ if not scene_scores:
342
+ return "unknown", 0
343
+
344
+ # Find scene with highest score
345
+ best_scene = max(scene_scores, key=scene_scores.get)
346
+ best_score = scene_scores[best_scene]
347
+
348
+ return best_scene, best_score
349
+
350
+
351
+ def _fuse_scene_scores(self, yolo_scene_scores: Dict[str, float], clip_scene_scores: Dict[str, float]) -> Dict[str, float]:
352
+ """
353
+ 融合基於 YOLO 物體檢測和 CLIP 分析的場景分數。
354
+
355
+ Args:
356
+ yolo_scene_scores: 基於 YOLO 物體檢測的場景分數
357
+ clip_scene_scores: 基於 CLIP 分析的場景分數
358
+
359
+ Returns:
360
+ Dict: 融合後的場景分數
361
+ """
362
+ # 如果沒有 CLIP 分數,直接返回 YOLO 分數
363
+ if not clip_scene_scores:
364
+ return yolo_scene_scores
365
+
366
+ # 如果沒有 YOLO 分數,直接返回 CLIP 分數
367
+ if not yolo_scene_scores:
368
+ return clip_scene_scores
369
+
370
+ # 融合分數
371
+ fused_scores = {}
372
+
373
+ # 獲取所有場景類型
374
+ all_scene_types = set(list(yolo_scene_scores.keys()) + list(clip_scene_scores.keys()))
375
+
376
+ for scene_type in all_scene_types:
377
+ # 獲取兩個模型的分數
378
+ yolo_score = yolo_scene_scores.get(scene_type, 0.0)
379
+ clip_score = clip_scene_scores.get(scene_type, 0.0)
380
+
381
+ # 設置基本權重
382
+ yolo_weight = 0.7 # YOLO 提供更詳細的物體資訊
383
+ clip_weight = 0.3 # CLIP 提供更好的整體場景理解
384
+
385
+ # 對特定類型場景調整權重
386
+ # 文化特定場景或具有特殊布局的場景,CLIP 可能有優勢
387
+ if any(keyword in scene_type for keyword in ["asian", "cultural", "aerial"]):
388
+ yolo_weight = 0.3
389
+ clip_weight = 0.7
390
+
391
+ # 對室內家居場景,物體檢測通常更準確
392
+ elif any(keyword in scene_type for keyword in ["room", "kitchen", "office", "bedroom"]):
393
+ yolo_weight = 0.8
394
+ clip_weight = 0.2
395
+ elif scene_type == "beach_water_recreation":
396
+ yolo_weight = 0.8 # 衝浪板等特定物品的檢測非常重要
397
+ clip_weight = 0.2
398
+ elif scene_type == "sports_venue":
399
+ yolo_weight = 0.7
400
+ clip_weight = 0.3
401
+ elif scene_type == "professional_kitchen":
402
+ yolo_weight = 0.8 # 廚房用具的檢測非常重要
403
+ clip_weight = 0.2
404
+
405
+ # 計算加權分數
406
+ fused_scores[scene_type] = (yolo_score * yolo_weight) + (clip_score * clip_weight)
407
+
408
+ return fused_scores
scene_description.py ADDED
@@ -0,0 +1,401 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ from typing import Dict, List, Tuple, Any, Optional
4
+
5
+ from scene_type import SCENE_TYPES
6
+ from scene_detail_templates import SCENE_DETAIL_TEMPLATES
7
+ from object_template_fillers import OBJECT_TEMPLATE_FILLERS
8
+ from activity_templates import ACTIVITY_TEMPLATES
9
+ from safety_templates import SAFETY_TEMPLATES
10
+ from confifence_templates import CONFIDENCE_TEMPLATES
11
+
12
+ class SceneDescriptor:
13
+ """
14
+ Generates natural language descriptions of scenes.
15
+ Handles scene descriptions, activity inference, and safety concerns identification.
16
+ """
17
+
18
+ def __init__(self, scene_types=None, object_categories=None):
19
+ """
20
+ Initialize the scene descriptor
21
+
22
+ Args:
23
+ scene_types: Dictionary of scene type definitions
24
+ """
25
+ self.scene_types = scene_types or {}
26
+ self.SCENE_TYPES = scene_types or {}
27
+
28
+ if object_categories:
29
+ self.OBJECT_CATEGORIES = object_categories
30
+ else:
31
+ # 從 JSON 加載或使用默認值
32
+ self.OBJECT_CATEGORIES = self._load_json_data("object_categories") or {
33
+ "furniture": [56, 57, 58, 59, 60, 61],
34
+ "electronics": [62, 63, 64, 65, 66, 67, 68, 69, 70],
35
+ "kitchen_items": [39, 40, 41, 42, 43, 44, 45],
36
+ "food": [46, 47, 48, 49, 50, 51, 52, 53, 54, 55],
37
+ "vehicles": [1, 2, 3, 4, 5, 6, 7, 8],
38
+ "personal_items": [24, 25, 26, 27, 28, 73, 78, 79]
39
+ }
40
+
41
+ # 加載所有模板數據
42
+ self._load_templates()
43
+
44
+ def _load_templates(self):
45
+ """Load all template data from script or fallback to imported defaults"""
46
+ self.confidence_templates = CONFIDENCE_TEMPLATES
47
+ self.scene_detail_templates = SCENE_DETAIL_TEMPLATES
48
+ self.object_template_fillers = OBJECT_TEMPLATE_FILLERS
49
+ self.safety_templates = SAFETY_TEMPLATES
50
+ self.activity_templates = ACTIVITY_TEMPLATES
51
+
52
+
53
+ def _initialize_fallback_templates(self):
54
+ """Initialize fallback templates when no external data is available"""
55
+ # 只在無法從文件或導入加載時使用
56
+ self.confidence_templates = {
57
+ "high": "{description} {details}",
58
+ "medium": "This appears to be {description} {details}",
59
+ "low": "This might be {description}, but the confidence is low. {details}"
60
+ }
61
+
62
+ # 僅提供最基本的模板作為後備
63
+ self.scene_detail_templates = {
64
+ "default": ["A space with various objects."]
65
+ }
66
+
67
+ self.object_template_fillers = {
68
+ "default": ["various items"]
69
+ }
70
+
71
+ self.safety_templates = {
72
+ "general": "Pay attention to {safety_element}."
73
+ }
74
+
75
+ self.activity_templates = {
76
+ "default": ["General activity"]
77
+ }
78
+
79
+ def _get_alternative_scenes(self, scene_scores: Dict[str, float],
80
+ threshold: float, top_k: int = 2) -> List[Dict]:
81
+ """
82
+ Get alternative scene interpretations with their scores.
83
+
84
+ Args:
85
+ scene_scores: Dictionary of scene type scores
86
+ threshold: Minimum confidence threshold
87
+ top_k: Number of alternatives to return
88
+
89
+ Returns:
90
+ List of dictionaries with alternative scenes
91
+ """
92
+ # Sort scenes by score in descending order
93
+ sorted_scenes = sorted(scene_scores.items(), key=lambda x: x[1], reverse=True)
94
+
95
+ # Skip the first one (best match) and take the next top_k
96
+ alternatives = []
97
+ for scene_type, score in sorted_scenes[1:1+top_k]:
98
+ if score >= threshold:
99
+ alternatives.append({
100
+ "type": scene_type,
101
+ "name": self.SCENE_TYPES.get(scene_type, {}).get("name", "Unknown"),
102
+ "confidence": score
103
+ })
104
+
105
+ return alternatives
106
+
107
+
108
+ def _infer_possible_activities(self, scene_type: str, detected_objects: List[Dict]) -> List[str]:
109
+ """
110
+ Infer possible activities based on scene type and detected objects.
111
+
112
+ Args:
113
+ scene_type: Identified scene type
114
+ detected_objects: List of detected objects
115
+
116
+ Returns:
117
+ List of possible activities
118
+ """
119
+ activities = []
120
+
121
+ if scene_type.startswith("aerial_view_"):
122
+ if scene_type == "aerial_view_intersection":
123
+ # 使用預定義的十字路口活動
124
+ activities.extend(self.activity_templates.get("aerial_view_intersection", []))
125
+
126
+ # 添加與行人和車輛相關的特定活動
127
+ pedestrians = [obj for obj in detected_objects if obj["class_id"] == 0]
128
+ vehicles = [obj for obj in detected_objects if obj["class_id"] in [2, 5, 7]] # Car, bus, truck
129
+
130
+ if pedestrians and vehicles:
131
+ activities.append("Waiting for an opportunity to cross the street")
132
+ activities.append("Obeying traffic signals")
133
+
134
+ elif scene_type == "aerial_view_commercial_area":
135
+ activities.extend(self.activity_templates.get("aerial_view_commercial_area", []))
136
+
137
+ elif scene_type == "aerial_view_plaza":
138
+ activities.extend(self.activity_templates.get("aerial_view_plaza", []))
139
+
140
+ else:
141
+ # 處理其他未明確定義的空中視角場景
142
+ aerial_activities = [
143
+ "Street crossing",
144
+ "Waiting for signals",
145
+ "Following traffic rules",
146
+ "Pedestrian movement"
147
+ ]
148
+ activities.extend(aerial_activities)
149
+
150
+ if scene_type in self.activity_templates:
151
+ activities.extend(self.activity_templates[scene_type])
152
+ elif "default" in self.activity_templates:
153
+ activities.extend(self.activity_templates["default"])
154
+
155
+ detected_class_ids = [obj["class_id"] for obj in detected_objects]
156
+
157
+ # Add activities based on specific object combinations
158
+ if 62 in detected_class_ids and 57 in detected_class_ids: # TV and sofa
159
+ activities.append("Watching shows or movies")
160
+
161
+ if 63 in detected_class_ids: # laptop
162
+ activities.append("Using a computer/laptop")
163
+
164
+ if 67 in detected_class_ids: # cell phone
165
+ activities.append("Using a mobile phone")
166
+
167
+ if 73 in detected_class_ids: # book
168
+ activities.append("Reading")
169
+
170
+ if any(food_id in detected_class_ids for food_id in [46, 47, 48, 49, 50, 51, 52, 53, 54, 55]):
171
+ activities.append("Eating or preparing food")
172
+
173
+ # Person-specific activities
174
+ if 0 in detected_class_ids: # Person
175
+ if any(vehicle in detected_class_ids for vehicle in [1, 2, 3, 5, 7]): # Vehicles
176
+ activities.append("Commuting or traveling")
177
+
178
+ if 16 in detected_class_ids: # Dog
179
+ activities.append("Walking a dog")
180
+
181
+ if 24 in detected_class_ids or 26 in detected_class_ids: # Backpack or handbag
182
+ activities.append("Carrying personal items")
183
+
184
+ # Remove duplicates
185
+ return list(set(activities))
186
+
187
+ def _identify_safety_concerns(self, detected_objects: List[Dict], scene_type: str) -> List[str]:
188
+ """
189
+ Identify potential safety concerns based on objects and scene type.
190
+
191
+ Args:
192
+ detected_objects: List of detected objects
193
+ scene_type: Identified scene type
194
+
195
+ Returns:
196
+ List of potential safety concerns
197
+ """
198
+ concerns = []
199
+ detected_class_ids = [obj["class_id"] for obj in detected_objects]
200
+
201
+ # ORIGINAL SAFETY CONCERNS LOGIC
202
+
203
+ # General safety concerns
204
+ if 42 in detected_class_ids or 43 in detected_class_ids: # Fork or knife
205
+ concerns.append("Sharp utensils present")
206
+
207
+ if 76 in detected_class_ids: # Scissors
208
+ concerns.append("Cutting tools present")
209
+
210
+ # Traffic-related concerns
211
+ if scene_type in ["city_street", "parking_lot"]:
212
+ if 0 in detected_class_ids: # Person
213
+ if any(vehicle in detected_class_ids for vehicle in [2, 3, 5, 7, 8]): # Vehicles
214
+ concerns.append("Pedestrians near vehicles")
215
+
216
+ if 9 in detected_class_ids: # Traffic light
217
+ concerns.append("Monitor traffic signals")
218
+
219
+ # Identify crowded scenes
220
+ person_count = detected_class_ids.count(0)
221
+ if person_count > 5:
222
+ concerns.append(f"Crowded area with multiple people ({person_count})")
223
+
224
+ # Scene-specific concerns
225
+ if scene_type == "kitchen":
226
+ if 68 in detected_class_ids or 69 in detected_class_ids: # Microwave or oven
227
+ concerns.append("Hot cooking equipment")
228
+
229
+ # Potentially unstable objects
230
+ for obj in detected_objects:
231
+ if obj["class_id"] in [39, 40, 41, 45]: # Bottle, wine glass, cup, bowl
232
+ if obj["region"] in ["top_left", "top_center", "top_right"] and obj["normalized_area"] > 0.05:
233
+ concerns.append(f"Elevated {obj['class_name']} might be unstable")
234
+
235
+ # NEW SAFETY CONCERNS LOGIC FOR ADDITIONAL SCENE TYPES
236
+
237
+ # Upscale dining safety concerns
238
+ if scene_type == "upscale_dining":
239
+ # Check for fragile items
240
+ if 40 in detected_class_ids: # Wine glass
241
+ concerns.append("Fragile glassware present")
242
+
243
+ # Check for lit candles (can't directly detect but can infer from context)
244
+ # Look for small bright spots that might be candles
245
+ if any(obj["class_id"] == 41 for obj in detected_objects): # Cup (which might include candle holders)
246
+ # We can't reliably detect candles, but if the scene appears to be formal dining,
247
+ # we can suggest this as a possibility
248
+ concerns.append("Possible lit candles or decorative items requiring care")
249
+
250
+ # Check for overcrowded table
251
+ table_objs = [obj for obj in detected_objects if obj["class_id"] == 60] # Dining table
252
+ if table_objs:
253
+ table_region = table_objs[0]["region"]
254
+ items_on_table = 0
255
+
256
+ for obj in detected_objects:
257
+ if obj["class_id"] in [39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55]:
258
+ if obj["region"] == table_region:
259
+ items_on_table += 1
260
+
261
+ if items_on_table > 8:
262
+ concerns.append("Dining table has multiple items which should be handled with care")
263
+
264
+ # Asian commercial street safety concerns
265
+ elif scene_type == "asian_commercial_street":
266
+ # Check for crowded walkways
267
+ if 0 in detected_class_ids: # Person
268
+ person_count = detected_class_ids.count(0)
269
+ if person_count > 3:
270
+ # Calculate person density (simplified)
271
+ person_positions = []
272
+ for obj in detected_objects:
273
+ if obj["class_id"] == 0:
274
+ person_positions.append(obj["normalized_center"])
275
+
276
+ if len(person_positions) >= 2:
277
+ # Calculate average distance between people
278
+ total_distance = 0
279
+ count = 0
280
+ for i in range(len(person_positions)):
281
+ for j in range(i+1, len(person_positions)):
282
+ p1 = person_positions[i]
283
+ p2 = person_positions[j]
284
+ distance = ((p2[0] - p1[0])**2 + (p2[1] - p1[1])**2)**0.5
285
+ total_distance += distance
286
+ count += 1
287
+
288
+ if count > 0:
289
+ avg_distance = total_distance / count
290
+ if avg_distance < 0.1: # Close proximity
291
+ concerns.append("Crowded walkway with limited personal space")
292
+
293
+ # Check for motorcycles/bicycles near pedestrians
294
+ if (1 in detected_class_ids or 3 in detected_class_ids) and 0 in detected_class_ids: # Bicycle/motorcycle and person
295
+ concerns.append("Two-wheeled vehicles in pedestrian areas")
296
+
297
+ # Check for potential trip hazards
298
+ # We can't directly detect this, but can infer from context
299
+ if scene_type == "asian_commercial_street" and "bottom" in " ".join([obj["region"] for obj in detected_objects if obj["class_id"] == 0]):
300
+ # If people are in bottom regions, they might be walking on uneven surfaces
301
+ concerns.append("Potential uneven walking surfaces in commercial area")
302
+
303
+ # Financial district safety concerns
304
+ elif scene_type == "financial_district":
305
+ # Check for heavy traffic conditions
306
+ vehicle_count = sum(1 for obj_id in detected_class_ids if obj_id in [2, 5, 7]) # Car, bus, truck
307
+ if vehicle_count > 5:
308
+ concerns.append("Heavy vehicle traffic in urban area")
309
+
310
+ # Check for pedestrians crossing busy streets
311
+ if 0 in detected_class_ids: # Person
312
+ person_count = detected_class_ids.count(0)
313
+ vehicle_nearby = any(vehicle in detected_class_ids for vehicle in [2, 3, 5, 7])
314
+
315
+ if person_count > 0 and vehicle_nearby:
316
+ concerns.append("Pedestrians navigating busy urban traffic")
317
+
318
+ # Check for traffic signals
319
+ if 9 in detected_class_ids: # Traffic light
320
+ concerns.append("Observe traffic signals when navigating this area")
321
+ else:
322
+ # If no traffic lights detected but it's a busy area, it's worth noting
323
+ if vehicle_count > 3:
324
+ concerns.append("Busy traffic area potentially without visible traffic signals in view")
325
+
326
+ # Time of day considerations
327
+ # We don't have direct time data, but can infer from vehicle lights
328
+ vehicle_objs = [obj for obj in detected_objects if obj["class_id"] in [2, 5, 7]]
329
+ if vehicle_objs and any("lighting_conditions" in obj for obj in detected_objects):
330
+ # If vehicles are present and it might be evening/night
331
+ concerns.append("Reduced visibility conditions during evening commute")
332
+
333
+ # Urban intersection safety concerns
334
+ elif scene_type == "urban_intersection":
335
+ # Check for pedestrians in crosswalks
336
+ pedestrian_objs = [obj for obj in detected_objects if obj["class_id"] == 0]
337
+ vehicle_objs = [obj for obj in detected_objects if obj["class_id"] in [2, 3, 5, 7]]
338
+
339
+ if pedestrian_objs:
340
+ # Calculate distribution of pedestrians to see if they're crossing
341
+ pedestrian_positions = [obj["normalized_center"] for obj in pedestrian_objs]
342
+
343
+ # Simplified check for pedestrians in crossing pattern
344
+ if len(pedestrian_positions) >= 3:
345
+ # Check if pedestrians are distributed across different regions
346
+ pedestrian_regions = set(obj["region"] for obj in pedestrian_objs)
347
+ if len(pedestrian_regions) >= 2:
348
+ concerns.append("Multiple pedestrians crossing the intersection")
349
+
350
+ # Check for traffic signal observation
351
+ if 9 in detected_class_ids: # Traffic light
352
+ concerns.append("Observe traffic signals when crossing")
353
+
354
+ # Check for busy intersection
355
+ if len(vehicle_objs) > 3:
356
+ concerns.append("Busy intersection with multiple vehicles")
357
+
358
+ # Check for pedestrians potentially jay-walking
359
+ if pedestrian_objs and not 9 in detected_class_ids: # People but no traffic lights
360
+ concerns.append("Pedestrians should use designated crosswalks")
361
+
362
+ # Visibility concerns based on lighting
363
+ # This would be better with actual lighting data
364
+ pedestrian_count = len(pedestrian_objs)
365
+ if pedestrian_count > 5:
366
+ concerns.append("High pedestrian density at crossing points")
367
+
368
+ # Transit hub safety concerns
369
+ elif scene_type == "transit_hub":
370
+ # These would be for transit areas like train stations or bus terminals
371
+ if 0 in detected_class_ids: # Person
372
+ person_count = detected_class_ids.count(0)
373
+ if person_count > 8:
374
+ concerns.append("Crowded transit area requiring careful navigation")
375
+
376
+ # Check for luggage/bags that could be trip hazards
377
+ if 24 in detected_class_ids or 28 in detected_class_ids: # Backpack or suitcase
378
+ concerns.append("Luggage and personal items may create obstacles")
379
+
380
+ # Public transportation vehicles
381
+ if any(vehicle in detected_class_ids for vehicle in [5, 6, 7]): # Bus, train, truck
382
+ concerns.append("Stay clear of arriving and departing transit vehicles")
383
+
384
+ # Shopping district safety concerns
385
+ elif scene_type == "shopping_district":
386
+ # Check for crowded shopping areas
387
+ if 0 in detected_class_ids: # Person
388
+ person_count = detected_class_ids.count(0)
389
+ if person_count > 5:
390
+ concerns.append("Crowded shopping area with multiple people")
391
+
392
+ # Check for shopping bags and personal items
393
+ if 24 in detected_class_ids or 26 in detected_class_ids: # Backpack or handbag
394
+ concerns.append("Mind personal belongings in busy retail environment")
395
+
396
+ # Check for store entrances/exits which might have automatic doors
397
+ # We can't directly detect this, but can infer from context
398
+ if scene_type == "shopping_district" and 0 in detected_class_ids:
399
+ concerns.append("Be aware of store entrances and exits with potential automatic doors")
400
+
401
+ return concerns
scene_detail_templates.py ADDED
@@ -0,0 +1,203 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
# Per-scene description templates. Each scene-type key maps to exactly three
# alternative sentence templates; the placeholders in braces (e.g. {furniture})
# are filled in later via str.format with scene-specific fragments.
SCENE_DETAIL_TEMPLATES = {
    # -- Indoor home / work scenes --
    "living_room": [
        "The space is arranged for relaxation with {furniture}.",
        "There is {electronics} for entertainment.",
        "The room has a seating area with {seating}.",
    ],
    "bedroom": [
        "The room contains {bed_type} in the {bed_location}.",
        "This sleeping area has {bed_description}.",
        "A personal space with {bed_type} and {extras}.",
    ],
    "dining_area": [
        "A space set up for meals with {table_setup}.",
        "The dining area contains {table_description}.",
        "A place for eating with {dining_items}.",
    ],
    "kitchen": [
        "A food preparation area with {appliances}.",
        "The kitchen contains {kitchen_items}.",
        "A cooking space equipped with {cooking_equipment}.",
    ],
    "office_workspace": [
        "A work environment with {office_equipment}.",
        "A space designed for productivity with {desk_setup}.",
        "A workspace containing {computer_equipment}.",
    ],
    # -- Outdoor / urban scenes --
    "city_street": [
        "An urban thoroughfare with {traffic_description}.",
        "A street scene with {people_and_vehicles}.",
        "A city path with {street_elements}.",
    ],
    "park_area": [
        "An outdoor recreational space with {park_features}.",
        "A leisure area featuring {outdoor_elements}.",
        "A public outdoor space with {park_description}.",
    ],
    "retail_store": [
        "A shopping environment with {store_elements}.",
        "A commercial space where {shopping_activity}.",
        "A retail area containing {store_items}.",
    ],
    "upscale_dining": [
        "The space features {furniture} with {design_elements} for an elegant dining experience.",
        "This sophisticated dining area includes {lighting} illuminating {table_setup}.",
        "A stylish dining environment with {seating} arranged around {table_description}.",
    ],
    "asian_commercial_street": [
        "A vibrant street lined with {storefront_features} and filled with {pedestrian_flow}.",
        "This urban commercial area displays {asian_elements} with {cultural_elements}.",
        "A lively shopping street characterized by {signage} and busy with {street_activities}.",
    ],
    "financial_district": [
        "A canyon of {buildings} with {traffic_elements} moving through the urban landscape.",
        "This business district features {skyscrapers} along {road_features}.",
        "A downtown corridor with {architectural_elements} framing views of {city_landmarks}.",
    ],
    "urban_intersection": [
        "A busy crossroad with {crossing_pattern} where {pedestrian_behavior} is observed.",
        "This urban junction features {pedestrian_density} navigating the {traffic_pattern}.",
        "A well-marked intersection designed for {pedestrian_flow} across multiple directions.",
    ],
    "transit_hub": [
        "A transportation nexus where {transit_vehicles} arrive and depart amid {passenger_activity}.",
        "This transit center accommodates {transportation_modes} with facilities for {passenger_needs}.",
        "A busy transport hub featuring {transit_infrastructure} and areas for {passenger_movement}.",
    ],
    "shopping_district": [
        "A commercial zone filled with {retail_elements} and {shopping_activity}.",
        "This shopping area features {store_types} along {walkway_features}.",
        "A retail district characterized by {commercial_signage} and {consumer_behavior}.",
    ],
    # -- Transit scenes --
    "bus_stop": [
        "Passengers waiting at a roadside stop served by {transit_vehicles}.",
        "A designated bus stop with shelters and {passenger_activity}.",
        "Commuters boarding or alighting from {transit_vehicles} at the curb.",
    ],
    "bus_station": [
        "Multiple buses parked in a terminal where {passenger_activity}.",
        "A busy station hub featuring {transit_vehicles} and traveler luggage.",
        "A transit center with waiting areas and various {transportation_modes}.",
    ],
    # -- Recreation scenes (these use fixed text with no placeholders) --
    "zoo": [
        "Enclosures showcasing elephants, zebras, and giraffes with visitors observing.",
        "A wildlife exhibit area where families watch animal displays.",
        "A recreational space featuring large animal exhibits and strolling guests.",
    ],
    "harbor": [
        "Boats docked along the waterfront with nearby vehicular traffic.",
        "A maritime area where vessels anchor beside roads busy with cars and motorcycles.",
        "A coastal dock featuring moored boats and passing traffic elements.",
    ],
    "playground": [
        "An open play area equipped with balls and recreational structures.",
        "People engaging in games and sports in a communal space.",
        "A leisure area featuring playground equipment and active participants.",
    ],
    "sports_field": [
        "An athletic field marked for various ball games and matches.",
        "Players using equipment like bats, gloves, and rackets on a grassy pitch.",
        "A designated sports area with goalposts or markings for competitive play.",
    ],
    # -- Street-level commercial scenes --
    "narrow_commercial_alley": [
        "A tight alley lined with {storefront_features} and light vehicles.",
        "Pedestrians navigate a confined lane flanked by shops and {street_activities}.",
        "An urban passage featuring {storefront_features} with {people_and_vehicles}.",
    ],
    "daytime_shopping_street": [
        "A bustling street during daytime with {storefront_features} and {pedestrian_flow}.",
        "Shoppers and vehicles move along a retail strip marked by {signage}.",
        "An open commercial avenue filled with {people_and_vehicles} amid shops.",
    ],
    "urban_pedestrian_crossing": [
        "A marked crosswalk with {crossing_pattern} under {lighting_modifier} sky.",
        "Pedestrians use designated crossing with {traffic_pattern} at the intersection.",
        "People waiting at a signal-controlled crossing next to {street_elements}.",
    ],
    # -- Aerial viewpoints --
    "aerial_view_intersection": [
        "The crossing pattern shows {crossing_pattern} with {pedestrian_flow} across multiple directions.",
        "From above, this intersection reveals {traffic_pattern} with {pedestrian_density} navigating through defined paths.",
        "This bird's-eye view shows {street_elements} converging at a junction where {pedestrian_behavior} is visible.",
    ],
    "aerial_view_commercial_area": [
        "From above, this commercial zone shows {storefront_features} with {pedestrian_flow} moving between establishments.",
        "This overhead view reveals {shopping_activity} amid {walkway_features} connecting different businesses.",
        "The aerial perspective captures {retail_elements} organized along {commercial_layout} with visible customer activity.",
    ],
    "aerial_view_plaza": [
        "This overhead view of the plaza shows {pedestrian_pattern} across an open public space.",
        "From above, the plaza reveals {gathering_features} where people congregate in {movement_pattern}.",
        "The aerial perspective captures {urban_elements} arranged around a central area where {public_activity} occurs.",
    ],
    # -- Cultural / regional scenes --
    "asian_night_market": [
        "This bustling night market features {stall_elements} illuminated by {lighting_features} with crowds enjoying {food_elements}.",
        "Rows of {vendor_stalls} line this vibrant market where {nighttime_activity} continues under {cultural_lighting}.",
        "The market atmosphere is created by {asian_elements} and {night_market_sounds} amid {evening_crowd_behavior}.",
    ],
    "asian_temple_area": [
        "This sacred space features {architectural_elements} displaying {cultural_symbols} with visitors engaging in {ritual_activities}.",
        "The temple area contains {religious_structures} adorned with {decorative_features} where people practice {cultural_practices}.",
        "Traditional {temple_architecture} creates a spiritual atmosphere enhanced by {sensory_elements} and {visitor_activities}.",
    ],
    "european_plaza": [
        "This historic plaza is framed by {architectural_style} surrounding an open space where {public_activities} take place.",
        "The European square features {historic_elements} and {urban_design} creating a space for {social_behaviors}.",
        "Classical {european_features} define this public space where {tourist_activities} blend with {local_customs}.",
    ],
    # -- Night scenes --
    "nighttime_street": [
        "The night transforms this street with {lighting_effects} casting {shadow_patterns} across {urban_features}.",
        "After dark, this urban corridor is defined by {illuminated_elements} with {evening_activities} visible in the artificial light.",
        "The nocturnal street scene captures {light_sources} creating contrast between {lit_areas} and {shadowed_zones}.",
    ],
    "nighttime_commercial_district": [
        "After sunset, this commercial area comes alive with {illuminated_signage} and {evening_activities} under {colorful_lighting}.",
        "The district's nighttime character is defined by {neon_elements} highlighting {storefront_features} amid {night_crowd_behavior}.",
        "Evening transforms this zone through {light_displays} that accentuate {building_features} and frame {nightlife_activities}.",
    ],
    # -- Mixed indoor/outdoor and specialized venues --
    "indoor_outdoor_cafe": [
        "This cafe blends indoor comfort with outdoor atmosphere through {transitional_elements} connecting {indoor_features} with {outdoor_setting}.",
        "Customers enjoy both {interior_amenities} and {exterior_features} in this space that bridges indoor comfort and outdoor ambiance.",
        "The cafe design creates flow between {inside_elements} and {outside_spaces} allowing patrons to experience {dual_environment_benefits}.",
    ],
    "transit_station_platform": [
        "This transit platform combines covered areas with open sections where {passenger_activities} occur while awaiting {transportation_types}.",
        "The station design balances {sheltered_elements} with {exposed_areas} for passengers engaged in {waiting_behaviors}.",
        "Commuters navigate between {indoor_facilities} and {platform_features} while {transit_routines} unfold around arriving vehicles.",
    ],
    "sports_stadium": [
        "This athletic venue features {seating_arrangement} surrounding {playing_surface} where {sporting_activities} take place.",
        "The stadium design incorporates {spectator_facilities} overlooking {competition_space} designed for {sports_events}.",
        "Fans occupy {viewing_areas} arranged to maximize visibility of {field_elements} where athletes engage in {game_activities}.",
    ],
    "construction_site": [
        "This development area shows {construction_equipment} amid {building_materials} where workers conduct {construction_activities}.",
        "The construction process is visible through {work_elements} positioned around {structural_components} in various stages of completion.",
        "Workers utilize {site_equipment} to transform {raw_materials} following {construction_process} stages.",
    ],
    "medical_facility": [
        "This healthcare environment features {medical_elements} arranged to support {clinical_activities} in a {facility_design}.",
        "The medical space incorporates {healthcare_features} where {patient_interactions} occur in a controlled environment.",
        "Professional medical staff utilize {equipment_types} while conducting {care_procedures} in specialized {treatment_spaces}.",
    ],
    "educational_setting": [
        "This learning environment contains {educational_furniture} arranged to facilitate {learning_activities} through {instructional_design}.",
        "The educational space features {classroom_elements} organized for {teaching_methods} and {student_engagement}.",
        "Students and educators interact within {learning_spaces} equipped with {educational_tools} supporting {knowledge_transfer}.",
    ],
    "beach_water_recreation": [
        "A coastal recreation area with {beach_equipment} and people enjoying {water_activities}.",
        "This shoreline space features {beach_equipment} where visitors engage in {water_activities}.",
        "An outdoor water recreation zone with {beach_equipment} set up for {water_activities}.",
    ],
    "sports_venue": [
        "A professional sports facility with {sports_equipment} arranged for {competitive_activities}.",
        "This athletics venue features {sports_equipment} with spaces designated for {competitive_activities}.",
        "A specialized sports arena containing {sports_equipment} designed for {competitive_activities}.",
    ],
    "professional_kitchen": [
        "A commercial cooking space with {kitchen_equipment} organized for {food_preparation}.",
        "This professional culinary area contains {kitchen_equipment} arranged in stations for {food_preparation}.",
        "An industrial kitchen featuring {kitchen_equipment} designed for efficient {food_preparation}.",
    ],
}
scene_type.py ADDED
@@ -0,0 +1,394 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ SCENE_TYPES = {
3
+ "living_room": {
4
+ "name": "Living Room",
5
+ "required_objects": [57, 62], # couch, tv
6
+ "optional_objects": [56, 60, 73, 75], # chair, dining table, book, vase
7
+ "minimum_required": 2,
8
+ "description": "A living room area with furniture for relaxation and entertainment"
9
+ },
10
+ "bedroom": {
11
+ "name": "Bedroom",
12
+ "required_objects": [59], # bed
13
+ "optional_objects": [56, 60, 73, 74, 75], # chair, dining table, book, clock, vase
14
+ "minimum_required": 1,
15
+ "description": "A bedroom with sleeping furniture"
16
+ },
17
+ "dining_area": {
18
+ "name": "Dining Area",
19
+ "required_objects": [60], # dining table
20
+ "optional_objects": [56, 39, 41, 42, 43, 44, 45], # chair, bottle, cup, fork, knife, spoon, bowl
21
+ "minimum_required": 1,
22
+ "description": "A dining area for meals"
23
+ },
24
+ "kitchen": {
25
+ "name": "Kitchen",
26
+ "required_objects": [72, 68, 69, 71], # refrigerator, microwave, oven, sink
27
+ "optional_objects": [39, 41, 42, 43, 44, 45], # bottle, cup, fork, knife, spoon, bowl
28
+ "minimum_required": 1,
29
+ "description": "A kitchen area for food preparation"
30
+ },
31
+ "office_workspace": {
32
+ "name": "Office Workspace",
33
+ "required_objects": [56, 63, 66, 64, 73], # chair, laptop, keyboard, mouse, book
34
+ "optional_objects": [60, 74, 75, 67], # dining table, clock, vase, cell phone
35
+ "minimum_required": 2,
36
+ "description": "A workspace with computer equipment for office work"
37
+ },
38
+ "meeting_room": {
39
+ "name": "Meeting Room",
40
+ "required_objects": [56, 60], # chair, dining table
41
+ "optional_objects": [63, 62, 67], # laptop, tv, cell phone
42
+ "minimum_required": 2,
43
+ "description": "A room set up for meetings with multiple seating"
44
+ },
45
+ "city_street": {
46
+ "name": "City Street",
47
+ "required_objects": [0, 1, 2, 3, 5, 7, 9], # person, bicycle, car, motorcycle, bus, truck, traffic light
48
+ "optional_objects": [10, 11, 12, 24, 25, 26, 28], # fire hydrant, stop sign, parking meter, backpack, umbrella, handbag, suitcase
49
+ "minimum_required": 2,
50
+ "description": "A city street with traffic and pedestrians"
51
+ },
52
+ "parking_lot": {
53
+ "name": "Parking Lot",
54
+ "required_objects": [2, 3, 5, 7], # car, motorcycle, bus, truck
55
+ "optional_objects": [0, 11, 12], # person, stop sign, parking meter
56
+ "minimum_required": 3,
57
+ "description": "A parking area with multiple vehicles"
58
+ },
59
+ "park_area": {
60
+ "name": "Park or Recreation Area",
61
+ "required_objects": [0, 13], # person, bench
62
+ "optional_objects": [1, 14, 16, 25, 33], # bicycle, bird, dog, umbrella, kite
63
+ "minimum_required": 2,
64
+ "description": "An outdoor recreational area for leisure activities"
65
+ },
66
+ "retail_store": {
67
+ "name": "Retail Store",
68
+ "required_objects": [0, 24, 26, 28], # person, backpack, handbag, suitcase
69
+ "optional_objects": [39, 45, 67], # bottle, bowl, cell phone
70
+ "minimum_required": 2,
71
+ "description": "A retail environment with shoppers and merchandise"
72
+ },
73
+ "supermarket": {
74
+ "name": "Supermarket",
75
+ "required_objects": [0, 24, 39, 46, 47, 49], # person, backpack, bottle, banana, apple, orange
76
+ "optional_objects": [26, 37, 45, 48, 51, 52, 53, 54, 55], # handbag, surfboard, bowl, sandwich, carrot, hot dog, pizza, donut, cake
77
+ "minimum_required": 3,
78
+ "description": "A supermarket with food items and shoppers"
79
+ },
80
+ "classroom": {
81
+ "name": "Classroom",
82
+ "required_objects": [56, 60, 73], # chair, dining table, book
83
+ "optional_objects": [63, 66, 67], # laptop, keyboard, cell phone
84
+ "minimum_required": 2,
85
+ "description": "A classroom environment set up for educational activities"
86
+ },
87
+ "conference_room": {
88
+ "name": "Conference Room",
89
+ "required_objects": [56, 60, 63], # chair, dining table, laptop
90
+ "optional_objects": [62, 67, 73], # tv, cell phone, book
91
+ "minimum_required": 2,
92
+ "description": "A conference room designed for meetings and presentations"
93
+ },
94
+ "cafe": {
95
+ "name": "Cafe",
96
+ "required_objects": [56, 60, 41], # chair, dining table, cup
97
+ "optional_objects": [39, 40, 63, 67, 73], # bottle, wine glass, laptop, cell phone, book
98
+ "minimum_required": 2,
99
+ "description": "A cafe setting with seating and beverages"
100
+ },
101
+ "library": {
102
+ "name": "Library",
103
+ "required_objects": [56, 60, 73], # chair, dining table, book
104
+ "optional_objects": [63, 67, 75], # laptop, cell phone, vase
105
+ "minimum_required": 2,
106
+ "description": "A library with books and reading areas"
107
+ },
108
+ "gym": {
109
+ "name": "Gym",
110
+ "required_objects": [0, 32], # person, sports ball
111
+ "optional_objects": [24, 25, 28, 38], # backpack, umbrella, suitcase, tennis racket
112
+ "minimum_required": 1,
113
+ "description": "A gym or fitness area for physical activities"
114
+ },
115
+ "beach": {
116
+ "name": "Beach",
117
+ "required_objects": [0, 25, 29, 33, 37], # person, umbrella, frisbee, kite, surfboard
118
+ "optional_objects": [1, 24, 26, 38], # bicycle, backpack, handbag, tennis racket
119
+ "minimum_required": 2,
120
+ "description": "A beach area with people and recreational items"
121
+ },
122
+ "restaurant": {
123
+ "name": "Restaurant",
124
+ "required_objects": [56, 60, 41, 42, 43, 44, 45], # chair, dining table, cup, fork, knife, spoon, bowl
125
+ "optional_objects": [39, 40, 48, 49, 50, 51, 52, 53, 54, 55], # bottle, wine glass, sandwich, orange, broccoli, carrot, hot dog, pizza, donut, cake
126
+ "minimum_required": 3,
127
+ "description": "A restaurant setting for dining with tables and eating utensils"
128
+ },
129
+ "train_station": {
130
+ "name": "Train Station",
131
+ "required_objects": [0, 6], # person, train
132
+ "optional_objects": [1, 2, 24, 28, 67], # bicycle, car, backpack, suitcase, cell phone
133
+ "minimum_required": 1,
134
+ "description": "A train station with train and passengers"
135
+ },
136
+ "airport": {
137
+ "name": "Airport",
138
+ "required_objects": [0, 4, 28], # person, airplane, suitcase
139
+ "optional_objects": [24, 25, 26, 67], # backpack, umbrella, handbag, cell phone
140
+ "minimum_required": 2,
141
+ "description": "An airport with planes and travelers carrying luggage"
142
+ },
143
+ "upscale_dining": {
144
+ "name": "Upscale Dining Area",
145
+ "required_objects": [56, 60, 40, 41], # chair, dining table, wine glass, cup
146
+ "optional_objects": [39, 42, 43, 44, 45, 62, 75], # bottle, fork, knife, spoon, bowl, tv, vase
147
+ "minimum_required": 2,
148
+ "description": "An elegantly designed dining space with refined furniture and decorative elements"
149
+ },
150
+ "asian_commercial_street": {
151
+ "name": "Asian Commercial Street",
152
+ "required_objects": [0, 67], # person, cell phone
153
+ "optional_objects": [1, 2, 3, 24, 25, 26, 28], # bicycle, car, motorcycle, backpack, umbrella, handbag, suitcase
154
+ "minimum_required": 1,
155
+ "description": "A bustling commercial street with shops, signage, and pedestrians in an Asian urban setting"
156
+ },
157
+ "financial_district": {
158
+ "name": "Financial District",
159
+ "required_objects": [2, 5, 7, 9], # car, bus, truck, traffic light
160
+ "optional_objects": [0, 1, 3, 8], # person, bicycle, motorcycle, boat
161
+ "minimum_required": 2,
162
+ "description": "A major thoroughfare in a business district with high-rise buildings and traffic"
163
+ },
164
+ "urban_intersection": {
165
+ "name": "Urban Intersection",
166
+ "required_objects": [0, 9], # person, traffic light
167
+ "optional_objects": [1, 2, 3, 5, 7], # bicycle, car, motorcycle, bus, truck
168
+ "minimum_required": 1,
169
+ "description": "A busy urban crossroad with pedestrian crossings and multiple traffic flows"
170
+ },
171
+ "transit_hub": {
172
+ "name": "Transit Hub",
173
+ "required_objects": [0, 5, 6, 7], # person, bus, train, truck
174
+ "optional_objects": [1, 2, 3, 9, 24, 28], # bicycle, car, motorcycle, traffic light, backpack, suitcase
175
+ "minimum_required": 2,
176
+ "description": "A transportation center where multiple modes of transit converge"
177
+ },
178
+ "shopping_district": {
179
+ "name": "Shopping District",
180
+ "required_objects": [0, 24, 26], # person, backpack, handbag
181
+ "optional_objects": [1, 2, 3, 25, 27, 28, 39, 67], # bicycle, car, motorcycle, umbrella, tie, suitcase, bottle, cell phone
182
+ "minimum_required": 2,
183
+ "description": "A retail-focused area with shops, pedestrians, and commercial activity"
184
+ },
185
+ "bus_stop": {
186
+ "name": "Bus Stop",
187
+ "required_objects": [0, 5], # person, bus
188
+ "optional_objects": [1, 2, 7, 24], # bicycle, car, truck, backpack
189
+ "minimum_required": 2,
190
+ "description": "A roadside bus stop with waiting passengers and buses"
191
+ },
192
+ "bus_station": {
193
+ "name": "Bus Station",
194
+ "required_objects": [0, 5, 7], # person, bus, truck
195
+ "optional_objects": [24, 28, 67], # backpack, suitcase, cell phone
196
+ "minimum_required": 2,
197
+ "description": "A bus terminal with multiple buses and travelers"
198
+ },
199
+ "zoo": {
200
+ "name": "Zoo",
201
+ "required_objects": [20, 22, 23], # elephant, zebra, giraffe
202
+ "optional_objects": [0, 14, 16], # person, bird, dog
203
+ "minimum_required": 2,
204
+ "description": "A zoo environment featuring large animal exhibits and visitors"
205
+ },
206
+ "harbor": {
207
+ "name": "Harbor",
208
+ "required_objects": [8], # boat
209
+ "optional_objects": [0, 2, 3, 39], # person, car, motorcycle, bottle
210
+ "minimum_required": 1,
211
+ "description": "A harbor area with boats docked and surrounding traffic"
212
+ },
213
+ "playground": {
214
+ "name": "Playground",
215
+ "required_objects": [0, 32], # person, sports ball
216
+ "optional_objects": [33, 24, 1], # kite, backpack, bicycle
217
+ "minimum_required": 1,
218
+ "description": "An outdoor playground with people playing sports and games"
219
+ },
220
+ "sports_field": {
221
+ "name": "Sports Field",
222
+ "required_objects": [32], # sports ball
223
+ "optional_objects": [38, 34, 35], # tennis racket, baseball bat, baseball glove
224
+ "minimum_required": 1,
225
+ "description": "A sports field set up for various ball games"
226
+ },
227
+ "narrow_commercial_alley": {
228
+ "name": "Narrow Commercial Alley",
229
+ "required_objects": [0, 3], # person, motorcycle
230
+ "optional_objects": [2, 7, 24, 26], # car, truck, backpack, handbag
231
+ "minimum_required": 2,
232
+ "description": "A tight urban alley lined with shops, with pedestrians and light vehicles"
233
+ },
234
+ "daytime_shopping_street": {
235
+ "name": "Daytime Shopping Street",
236
+ "required_objects": [0, 2], # person, car
237
+ "optional_objects": [1, 3, 24, 26], # bicycle, motorcycle, backpack, handbag
238
+ "minimum_required": 2,
239
+ "description": "A busy pedestrian street during daytime, featuring shops, vehicles, and shoppers"
240
+ },
241
+ "urban_pedestrian_crossing": {
242
+ "name": "Urban Pedestrian Crossing",
243
+ "required_objects": [0, 9], # person, traffic light
244
+ "optional_objects": [2, 3, 5], # car, motorcycle, bus
245
+ "minimum_required": 1,
246
+ "description": "A city street crossing with pedestrians and traffic signals"
247
+ },
248
+ "aerial_view_intersection": {
249
+ "name": "Aerial View Intersection",
250
+ "required_objects": [0, 9], # person, traffic light
251
+ "optional_objects": [1, 2, 3, 5, 7], # bicycle, car, motorcycle, bus, truck
252
+ "minimum_required": 1,
253
+ "description": "An intersection viewed from above, showing crossing patterns and pedestrian movement"
254
+ },
255
+ "aerial_view_commercial_area": {
256
+ "name": "Aerial View Commercial Area",
257
+ "required_objects": [0, 2], # person, car
258
+ "optional_objects": [1, 3, 5, 7, 24, 26], # bicycle, motorcycle, bus, truck, backpack, handbag
259
+ "minimum_required": 2,
260
+ "description": "A commercial or shopping area viewed from above showing pedestrians and urban layout"
261
+ },
262
+ "aerial_view_plaza": {
263
+ "name": "Aerial View Plaza",
264
+ "required_objects": [0], # person
265
+ "optional_objects": [1, 2, 24, 25, 26], # bicycle, car, backpack, umbrella, handbag
266
+ "minimum_required": 1,
267
+ "description": "An urban plaza or public square viewed from above with pedestrian activity"
268
+ },
269
+
270
+ # specific cultural item
271
+ "asian_night_market": {
272
+ "name": "Asian Night Market",
273
+ "required_objects": [0, 67], # person, cell phone
274
+ "optional_objects": [1, 3, 24, 26, 39, 41], # bicycle, motorcycle, backpack, handbag, bottle, cup
275
+ "minimum_required": 1,
276
+ "description": "A vibrant night market scene typical in Asian cities with food stalls and crowds"
277
+ },
278
+ "asian_temple_area": {
279
+ "name": "Asian Temple Area",
280
+ "required_objects": [0], # person
281
+ "optional_objects": [24, 25, 26, 67, 75], # backpack, umbrella, handbag, cell phone, vase
282
+ "minimum_required": 1,
283
+ "description": "A traditional Asian temple complex with visitors and cultural elements"
284
+ },
285
+ "european_plaza": {
286
+ "name": "European Plaza",
287
+ "required_objects": [0], # person
288
+ "optional_objects": [1, 2, 4, 9, 24, 26, 67], # bicycle, car, airplane, traffic light, backpack, handbag, cell phone
289
+ "minimum_required": 1,
290
+ "description": "A European-style city plaza with historic architecture and pedestrian activity"
291
+ },
292
+
293
+ # specific time item
294
+ "nighttime_street": {
295
+ "name": "Nighttime Street",
296
+ "required_objects": [0, 9], # person, traffic light
297
+ "optional_objects": [1, 2, 3, 5, 7, 67], # bicycle, car, motorcycle, bus, truck, cell phone
298
+ "minimum_required": 1,
299
+ "description": "An urban street at night with artificial lighting and nighttime activity"
300
+ },
301
+ "nighttime_commercial_district": {
302
+ "name": "Nighttime Commercial District",
303
+ "required_objects": [0, 67], # person, cell phone
304
+ "optional_objects": [1, 2, 3, 24, 26], # bicycle, car, motorcycle, backpack, handbag
305
+ "minimum_required": 1,
306
+ "description": "A commercial district illuminated at night with neon signs and evening activity"
307
+ },
308
+
309
+ # mixed-environment scene types
310
+ "indoor_outdoor_cafe": {
311
+ "name": "Indoor-Outdoor Cafe",
312
+ "required_objects": [56, 60, 41], # chair, dining table, cup
313
+ "optional_objects": [39, 40, 63, 67, 73], # bottle, wine glass, laptop, cell phone, book
314
+ "minimum_required": 2,
315
+ "description": "A cafe setting with both indoor elements and outdoor patio or sidewalk seating"
316
+ },
317
+ "transit_station_platform": {
318
+ "name": "Transit Station Platform",
319
+ "required_objects": [0], # person
320
+ "optional_objects": [5, 6, 7, 24, 28, 67], # bus, train, truck, backpack, suitcase, cell phone
321
+ "minimum_required": 1,
322
+ "description": "A transit platform with waiting passengers and arriving/departing vehicles"
323
+ },
324
+ "sports_stadium": {
325
+ "name": "Sports Stadium",
326
+ "required_objects": [0, 32], # person, sports ball
327
+ "optional_objects": [24, 38, 39, 41, 67], # backpack, tennis racket, bottle, cup, cell phone
328
+ "minimum_required": 1,
329
+ "description": "A sports stadium or arena with spectators and athletic activities"
330
+ },
331
+ "construction_site": {
332
+ "name": "Construction Site",
333
+ "required_objects": [0, 7], # person, truck
334
+ "optional_objects": [2, 3, 11, 76, 77, 78], # car, motorcycle, fire hydrant, scissors, teddy bear, hair drier
335
+ "minimum_required": 1,
336
+ "description": "A construction site with workers, equipment, and building materials"
337
+ },
338
+ "medical_facility": {
339
+ "name": "Medical Facility",
340
+ "required_objects": [0, 56, 60], # person, chair, dining table
341
+ "optional_objects": [63, 64, 66, 67, 73], # laptop, mouse, keyboard, cell phone, book
342
+ "minimum_required": 2,
343
+ "description": "A medical facility such as hospital, clinic or doctor's office with medical staff and patients"
344
+ },
345
+ "educational_setting": {
346
+ "name": "Educational Setting",
347
+ "required_objects": [0, 56, 60, 73], # person, chair, dining table, book
348
+ "optional_objects": [63, 64, 66, 67, 74], # laptop, mouse, keyboard, cell phone, clock
349
+ "minimum_required": 2,
350
+ "description": "An educational environment such as classroom, lecture hall or study area"
351
+ },
352
+ "aerial_view_intersection": {
353
+ "name": "Aerial View Intersection",
354
+ "required_objects": [0, 9], # person, traffic light
355
+ "optional_objects": [1, 2, 3, 5, 7], # bicycle, car, motorcycle, bus, truck
356
+ "minimum_required": 1,
357
+ "description": "An intersection viewed from above, showing crossing patterns and pedestrian movement",
358
+ "viewpoint_indicator": "aerial", # view side
359
+ "key_features": ["crosswalk_pattern", "pedestrian_flow", "intersection_layout"], # key feature
360
+ "detection_priority": 10 # priority
361
+ },
362
+ "perpendicular_crosswalk_intersection": {
363
+ "name": "Perpendicular Crosswalk Intersection",
364
+ "required_objects": [0], # person
365
+ "optional_objects": [1, 2, 3, 5, 7, 9], # bicycle, car, motorcycle, bus, truck, traffic light
366
+ "minimum_required": 1,
367
+ "description": "An intersection with perpendicular crosswalks where pedestrians cross in multiple directions",
368
+ "viewpoint_indicator": "aerial",
369
+ "key_features": ["perpendicular_crosswalks", "pedestrian_crossing", "multi_directional_movement"],
370
+ "pattern_detection": True, # specific pattern
371
+ "detection_priority": 15 #
372
+ },
373
+ "beach_water_recreation": {
374
+ "name": "Beach/Water Recreation Area",
375
+ "required_objects": [0, 37], # person, surfboard
376
+ "optional_objects": [25, 33, 1, 8, 29, 24, 26, 39, 41], # umbrella, kite, bicycle, boat, frisbee, backpack, handbag, bottle, cup
377
+ "minimum_required": 2,
378
+ "description": "A beach or water recreation area with water sports equipment and beach accessories"
379
+ },
380
+ "sports_venue": {
381
+ "name": "Sports Venue",
382
+ "required_objects": [0, 32], # person, sports ball
383
+ "optional_objects": [34, 35, 38, 25, 24, 26, 39, 41], # baseball bat, baseball glove, tennis racket, umbrella, backpack, handbag, bottle, cup
384
+ "minimum_required": 2,
385
+ "description": "A professional sports venue with specialized sports equipment and spectator areas"
386
+ },
387
+ "professional_kitchen": {
388
+ "name": "Professional Kitchen",
389
+ "required_objects": [43, 44, 45], # knife, spoon, bowl
390
+ "optional_objects": [42, 39, 41, 68, 69, 71, 72, 0], # fork, bottle, cup, microwave, oven, sink, refrigerator, person
391
+ "minimum_required": 3,
392
+ "description": "A commercial kitchen with professional cooking equipment and food preparation areas"
393
+ },
394
+ }
spatial_analyzer.py ADDED
@@ -0,0 +1,1444 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import os
3
+ import numpy as np
4
+ from typing import Dict, List, Tuple, Any, Optional
5
+
6
+ from scene_type import SCENE_TYPES
7
+ from enhance_descriptor import EnhancedSceneDescriber
8
+
9
+ class SpatialAnalyzer:
10
+ """
11
+ Analyzes spatial relationships between objects in an image.
12
+ Handles region assignment, object positioning, and functional zone identification.
13
+ """
14
+
15
+ def __init__(self, class_names: Dict[int, str] = None, object_categories=None):
16
+ """Initialize the spatial analyzer with image regions"""
17
+ # Define regions of the image (3x3 grid)
18
+ self.regions = {
19
+ "top_left": (0, 0, 1/3, 1/3),
20
+ "top_center": (1/3, 0, 2/3, 1/3),
21
+ "top_right": (2/3, 0, 1, 1/3),
22
+ "middle_left": (0, 1/3, 1/3, 2/3),
23
+ "middle_center": (1/3, 1/3, 2/3, 2/3),
24
+ "middle_right": (2/3, 1/3, 1, 2/3),
25
+ "bottom_left": (0, 2/3, 1/3, 1),
26
+ "bottom_center": (1/3, 2/3, 2/3, 1),
27
+ "bottom_right": (2/3, 2/3, 1, 1)
28
+ }
29
+
30
+ self.class_names = class_names
31
+ self.OBJECT_CATEGORIES = object_categories or {}
32
+ self.enhance_descriptor = EnhancedSceneDescriber(scene_types=SCENE_TYPES)
33
+
34
+ # Distances thresholds for proximity analysis (normalized)
35
+ self.proximity_threshold = 0.2
36
+
37
+
38
+ def _determine_region(self, x: float, y: float) -> str:
39
+ """
40
+ Determine which region a point falls into.
41
+
42
+ Args:
43
+ x: Normalized x-coordinate (0-1)
44
+ y: Normalized y-coordinate (0-1)
45
+
46
+ Returns:
47
+ Region name
48
+ """
49
+ for region_name, (x1, y1, x2, y2) in self.regions.items():
50
+ if x1 <= x < x2 and y1 <= y < y2:
51
+ return region_name
52
+
53
+ return "unknown"
54
+
55
+ def _analyze_regions(self, detected_objects: List[Dict]) -> Dict:
56
+ """
57
+ Analyze object distribution across image regions.
58
+
59
+ Args:
60
+ detected_objects: List of detected objects with position information
61
+
62
+ Returns:
63
+ Dictionary with region analysis
64
+ """
65
+ # Count objects in each region
66
+ region_counts = {region: 0 for region in self.regions.keys()}
67
+ region_objects = {region: [] for region in self.regions.keys()}
68
+
69
+ for obj in detected_objects:
70
+ region = obj["region"]
71
+ if region in region_counts:
72
+ region_counts[region] += 1
73
+ region_objects[region].append({
74
+ "class_id": obj["class_id"],
75
+ "class_name": obj["class_name"]
76
+ })
77
+
78
+ # Determine main focus regions (top 1-2 regions by object count)
79
+ sorted_regions = sorted(region_counts.items(), key=lambda x: x[1], reverse=True)
80
+ main_regions = [region for region, count in sorted_regions if count > 0][:2]
81
+
82
+ return {
83
+ "counts": region_counts,
84
+ "main_focus": main_regions,
85
+ "objects_by_region": region_objects
86
+ }
87
+
88
+ def _extract_detected_objects(self, detection_result: Any, confidence_threshold: float = 0.25) -> List[Dict]:
89
+ """
90
+ Extract detected objects from detection result with position information.
91
+
92
+ Args:
93
+ detection_result: Detection result from YOLOv8
94
+ confidence_threshold: Minimum confidence threshold
95
+
96
+ Returns:
97
+ List of dictionaries with detected object information
98
+ """
99
+ boxes = detection_result.boxes.xyxy.cpu().numpy()
100
+ classes = detection_result.boxes.cls.cpu().numpy().astype(int)
101
+ confidences = detection_result.boxes.conf.cpu().numpy()
102
+
103
+ # Image dimensions
104
+ img_height, img_width = detection_result.orig_shape[:2]
105
+
106
+ detected_objects = []
107
+ for box, class_id, confidence in zip(boxes, classes, confidences):
108
+ # Skip objects with confidence below threshold
109
+ if confidence < confidence_threshold:
110
+ continue
111
+
112
+ x1, y1, x2, y2 = box
113
+ width = x2 - x1
114
+ height = y2 - y1
115
+
116
+ # Center point
117
+ center_x = (x1 + x2) / 2
118
+ center_y = (y1 + y2) / 2
119
+
120
+ # Normalized positions (0-1)
121
+ norm_x = center_x / img_width
122
+ norm_y = center_y / img_height
123
+ norm_width = width / img_width
124
+ norm_height = height / img_height
125
+
126
+ # Area calculation
127
+ area = width * height
128
+ norm_area = area / (img_width * img_height)
129
+
130
+ # Region determination
131
+ object_region = self._determine_region(norm_x, norm_y)
132
+
133
+ detected_objects.append({
134
+ "class_id": int(class_id),
135
+ "class_name": self.class_names[int(class_id)],
136
+ "confidence": float(confidence),
137
+ "box": [float(x1), float(y1), float(x2), float(y2)],
138
+ "center": [float(center_x), float(center_y)],
139
+ "normalized_center": [float(norm_x), float(norm_y)],
140
+ "size": [float(width), float(height)],
141
+ "normalized_size": [float(norm_width), float(norm_height)],
142
+ "area": float(area),
143
+ "normalized_area": float(norm_area),
144
+ "region": object_region
145
+ })
146
+
147
+ return detected_objects
148
+
149
+
150
+ def _detect_scene_viewpoint(self, detected_objects: List[Dict]) -> Dict:
151
+ """
152
+ 檢測場景視角並識別特殊場景模式。
153
+
154
+ Args:
155
+ detected_objects: 檢測到的物體列表
156
+
157
+ Returns:
158
+ Dict: 包含視角和場景模式信息的字典
159
+ """
160
+ if not detected_objects:
161
+ return {"viewpoint": "eye_level", "patterns": []}
162
+
163
+ # 從物體位置中提取信息
164
+ patterns = []
165
+
166
+ # 檢測行人位置模式
167
+ pedestrian_objs = [obj for obj in detected_objects if obj["class_id"] == 0]
168
+
169
+ # 檢查是否有足夠的行人來識別模式
170
+ if len(pedestrian_objs) >= 4:
171
+ pedestrian_positions = [obj["normalized_center"] for obj in pedestrian_objs]
172
+
173
+ # 檢測十字交叉模式
174
+ if self._detect_cross_pattern(pedestrian_positions):
175
+ patterns.append("crosswalk_intersection")
176
+
177
+ # 檢測多方向行人流
178
+ directions = self._analyze_movement_directions(pedestrian_positions)
179
+ if len(directions) >= 2:
180
+ patterns.append("multi_directional_movement")
181
+
182
+ # 檢查物體的大小一致性 - 在空中俯視圖中,物體大小通常更一致
183
+ if len(detected_objects) >= 5:
184
+ sizes = [obj.get("normalized_area", 0) for obj in detected_objects]
185
+ size_variance = np.var(sizes) / (np.mean(sizes) ** 2) # 標準化變異數,不會受到平均值影響
186
+
187
+ if size_variance < 0.3: # 低變異表示大小一致
188
+ patterns.append("consistent_object_size")
189
+
190
+ # 基本視角檢測
191
+ viewpoint = self.enhance_descriptor._detect_viewpoint(detected_objects)
192
+
193
+ # 根據檢測到的模式增強視角判斷
194
+ if "crosswalk_intersection" in patterns and viewpoint != "aerial":
195
+ # 如果檢測到斑馬線交叉但視角判斷不是空中視角,優先採用模式判斷
196
+ viewpoint = "aerial"
197
+
198
+ return {
199
+ "viewpoint": viewpoint,
200
+ "patterns": patterns
201
+ }
202
+
203
+ def _detect_cross_pattern(self, positions):
204
+ """
205
+ 檢測位置中的十字交叉模式
206
+
207
+ Args:
208
+ positions: 位置列表 [[x1, y1], [x2, y2], ...]
209
+
210
+ Returns:
211
+ bool: 是否檢測到十字交叉模式
212
+ """
213
+ if len(positions) < 8: # 需要足夠多的點
214
+ return False
215
+
216
+ # 提取 x 和 y 坐標
217
+ x_coords = [pos[0] for pos in positions]
218
+ y_coords = [pos[1] for pos in positions]
219
+
220
+ # 檢測 x 和 y 方向的聚類
221
+ x_clusters = []
222
+ y_clusters = []
223
+
224
+ # 簡化的聚類分析
225
+ x_mean = np.mean(x_coords)
226
+ y_mean = np.mean(y_coords)
227
+
228
+ # 計算在中心線附近的點
229
+ near_x_center = sum(1 for x in x_coords if abs(x - x_mean) < 0.1)
230
+ near_y_center = sum(1 for y in y_coords if abs(y - y_mean) < 0.1)
231
+
232
+ # 如果有足夠的點在中心線附近,可能是十字交叉
233
+ return near_x_center >= 3 and near_y_center >= 3
234
+
235
+ def _analyze_movement_directions(self, positions):
236
+ """
237
+ 分析位置中的移動方向
238
+
239
+ Args:
240
+ positions: 位置列表 [[x1, y1], [x2, y2], ...]
241
+
242
+ Returns:
243
+ list: 檢測到的主要方向
244
+ """
245
+ if len(positions) < 6:
246
+ return []
247
+
248
+ # extract x 和 y 坐標
249
+ x_coords = [pos[0] for pos in positions]
250
+ y_coords = [pos[1] for pos in positions]
251
+
252
+ directions = []
253
+
254
+ # horizontal move (left --> right)
255
+ x_std = np.std(x_coords)
256
+ x_range = max(x_coords) - min(x_coords)
257
+
258
+ # vertical move(up --> down)
259
+ y_std = np.std(y_coords)
260
+ y_range = max(y_coords) - min(y_coords)
261
+
262
+ # 足夠大的範圍表示該方向有運動
263
+ if x_range > 0.4:
264
+ directions.append("horizontal")
265
+ if y_range > 0.4:
266
+ directions.append("vertical")
267
+
268
+ return directions
269
+
270
+ def _identify_functional_zones(self, detected_objects: List[Dict], scene_type: str) -> Dict:
271
+ """
272
+ Identify functional zones within the scene with improved detection for different viewpoints
273
+ and cultural contexts.
274
+
275
+ Args:
276
+ detected_objects: List of detected objects
277
+ scene_type: Identified scene type
278
+
279
+ Returns:
280
+ Dictionary of functional zones with their descriptions
281
+ """
282
+ # Group objects by category and region
283
+ category_regions = {}
284
+
285
+ for obj in detected_objects:
286
+ # Find object category
287
+ category = "other"
288
+ for cat_name, cat_ids in self.OBJECT_CATEGORIES.items():
289
+ if obj["class_id"] in cat_ids:
290
+ category = cat_name
291
+ break
292
+
293
+ # Add to category-region mapping
294
+ if category not in category_regions:
295
+ category_regions[category] = {}
296
+
297
+ region = obj["region"]
298
+ if region not in category_regions[category]:
299
+ category_regions[category][region] = []
300
+
301
+ category_regions[category][region].append(obj)
302
+
303
+ # Identify zones based on object groupings
304
+ zones = {}
305
+
306
+ # Detect viewpoint to adjust zone identification strategy
307
+ viewpoint = self._detect_scene_viewpoint(detected_objects)
308
+
309
+ # Choose appropriate zone identification strategy based on scene type and viewpoint
310
+ if scene_type in ["living_room", "bedroom", "dining_area", "kitchen", "office_workspace", "meeting_room"]:
311
+ # Indoor scenes
312
+ zones.update(self._identify_indoor_zones(category_regions, detected_objects, scene_type))
313
+ elif scene_type in ["city_street", "parking_lot", "park_area"]:
314
+ # Outdoor general scenes
315
+ zones.update(self._identify_outdoor_general_zones(category_regions, detected_objects, scene_type))
316
+ elif "aerial" in scene_type or viewpoint == "aerial":
317
+ # Aerial viewpoint scenes
318
+ zones.update(self._identify_aerial_view_zones(category_regions, detected_objects, scene_type))
319
+ elif "asian" in scene_type:
320
+ # Asian cultural context scenes
321
+ zones.update(self._identify_asian_cultural_zones(category_regions, detected_objects, scene_type))
322
+ elif scene_type == "urban_intersection":
323
+ # Specific urban intersection logic
324
+ zones.update(self._identify_intersection_zones(category_regions, detected_objects, viewpoint))
325
+ elif scene_type == "financial_district":
326
+ # Financial district specific logic
327
+ zones.update(self._identify_financial_district_zones(category_regions, detected_objects))
328
+ elif scene_type == "upscale_dining":
329
+ # Upscale dining specific logic
330
+ zones.update(self._identify_upscale_dining_zones(category_regions, detected_objects))
331
+ else:
332
+ # Default zone identification for other scene types
333
+ zones.update(self._identify_default_zones(category_regions, detected_objects))
334
+
335
+ # If no zones were identified, try the default approach
336
+ if not zones:
337
+ zones.update(self._identify_default_zones(category_regions, detected_objects))
338
+
339
+ return zones
340
+
341
    def _identify_indoor_zones(self, category_regions: Dict, detected_objects: List[Dict], scene_type: str) -> Dict:
        """
        Identify functional zones for indoor scenes.

        Builds up to six zone entries (social, entertainment, dining, workspace,
        sleeping, kitchen appliance), each anchored on the grid region that
        holds the most supporting objects.

        Args:
            category_regions: Objects grouped by category and region
            detected_objects: List of detected objects
            scene_type: Specific indoor scene type

        Returns:
            Dict: Indoor functional zones keyed by zone name, each with
            "region", "objects" and a human-readable "description".
        """
        zones = {}

        # Seating/social zone: anchored on the region with the most furniture.
        if "furniture" in category_regions:
            furniture_regions = category_regions["furniture"]
            main_furniture_region = max(furniture_regions.items(),
                                    key=lambda x: len(x[1]),
                                    default=(None, []))

            # Require at least two furniture pieces before calling it social.
            if main_furniture_region[0] is not None and len(main_furniture_region[1]) >= 2:
                zone_objects = [obj["class_name"] for obj in main_furniture_region[1]]
                zones["social_zone"] = {
                    "region": main_furniture_region[0],
                    "objects": zone_objects,
                    "description": f"Social or seating area with {', '.join(zone_objects)}"
                }

        # Entertainment zone: any electronics, wherever they appear.
        if "electronics" in category_regions:
            electronics_items = []
            for region_objects in category_regions["electronics"].values():
                electronics_items.extend([obj["class_name"] for obj in region_objects])

            if electronics_items:
                zones["entertainment_zone"] = {
                    # _find_main_region (defined elsewhere in this class) picks
                    # the region holding the most electronics.
                    "region": self._find_main_region(category_regions.get("electronics", {})),
                    "objects": electronics_items,
                    "description": f"Entertainment or media area with {', '.join(electronics_items)}"
                }

        # Dining/food zone: merge kitchen items and food, keyed by region.
        food_zone_categories = ["kitchen_items", "food"]
        food_items = []
        food_regions = {}

        for category in food_zone_categories:
            if category in category_regions:
                for region, objects in category_regions[category].items():
                    if region not in food_regions:
                        food_regions[region] = []
                    food_regions[region].extend(objects)
                    food_items.extend([obj["class_name"] for obj in objects])

        if food_items:
            main_food_region = max(food_regions.items(),
                                key=lambda x: len(x[1]),
                                default=(None, []))

            if main_food_region[0] is not None:
                # De-duplicate item names; only the first three are shown in
                # the description (set order is arbitrary).
                zones["dining_zone"] = {
                    "region": main_food_region[0],
                    "objects": list(set(food_items)),
                    "description": f"Dining or food preparation area with {', '.join(list(set(food_items))[:3])}"
                }

        # Work/study zone - enhanced to detect even when scene_type is not explicitly office
        work_items = []
        work_regions = {}

        for obj in detected_objects:
            if obj["class_id"] in [56, 60, 63, 64, 66, 73]:  # chair, table, laptop, mouse, keyboard, book
                region = obj["region"]
                if region not in work_regions:
                    work_regions[region] = []
                work_regions[region].append(obj)
                work_items.append(obj["class_name"])

        # Check for laptop and table/chair combinations that suggest a workspace
        has_laptop = any(obj["class_id"] == 63 for obj in detected_objects)
        has_keyboard = any(obj["class_id"] == 66 for obj in detected_objects)
        has_table = any(obj["class_id"] == 60 for obj in detected_objects)
        has_chair = any(obj["class_id"] == 56 for obj in detected_objects)

        # If we have electronics with furniture in the same region, likely a workspace
        # NOTE(review): this flag only checks co-occurrence anywhere in the
        # scene, not actually "in the same region" — confirm intent.
        workspace_detected = (has_laptop or has_keyboard) and (has_table or has_chair)

        if (workspace_detected or scene_type in ["office_workspace", "meeting_room"]) and work_items:
            main_work_region = max(work_regions.items(),
                                key=lambda x: len(x[1]),
                                default=(None, []))

            if main_work_region[0] is not None:
                zones["workspace_zone"] = {
                    "region": main_work_region[0],
                    "objects": list(set(work_items)),
                    "description": f"Work or study area with {', '.join(list(set(work_items))[:3])}"
                }

        # Bedroom-specific zones: a single bed (class 59) defines the sleeping area.
        if scene_type == "bedroom":
            bed_objects = [obj for obj in detected_objects if obj["class_id"] == 59]  # Bed
            if bed_objects:
                # Only the first detected bed anchors the zone.
                bed_region = bed_objects[0]["region"]
                zones["sleeping_zone"] = {
                    "region": bed_region,
                    "objects": ["bed"],
                    "description": "Sleeping area with bed"
                }

        # Kitchen-specific zones
        if scene_type == "kitchen":
            # Look for appliances (refrigerator, oven, microwave, sink)
            appliance_ids = [68, 69, 71, 72]  # microwave, oven, sink, refrigerator
            appliance_objects = [obj for obj in detected_objects if obj["class_id"] in appliance_ids]

            if appliance_objects:
                # Group appliances by region, then anchor on the densest region.
                appliance_regions = {}
                for obj in appliance_objects:
                    region = obj["region"]
                    if region not in appliance_regions:
                        appliance_regions[region] = []
                    appliance_regions[region].append(obj)

                if appliance_regions:
                    main_appliance_region = max(appliance_regions.items(),
                                            key=lambda x: len(x[1]),
                                            default=(None, []))

                    if main_appliance_region[0] is not None:
                        appliance_names = [obj["class_name"] for obj in main_appliance_region[1]]
                        zones["kitchen_appliance_zone"] = {
                            "region": main_appliance_region[0],
                            "objects": appliance_names,
                            "description": f"Kitchen appliance area with {', '.join(appliance_names)}"
                        }

        return zones
480
+
481
    def _identify_intersection_zones(self, category_regions: Dict, detected_objects: List[Dict], viewpoint: str) -> Dict:
        """
        Identify functional zones for urban intersections with enhanced spatial awareness.

        Delegates pedestrian-crossing and vehicle-traffic analysis to helpers,
        then adds one traffic-control zone per grid region containing signals.

        Args:
            category_regions: Objects grouped by category and region
            detected_objects: List of detected objects
            viewpoint: Detected viewpoint (currently unused in this body)

        Returns:
            Dict: Refined intersection functional zones
        """
        zones = {}

        # Get pedestrians, vehicles and traffic signals by COCO class id.
        pedestrian_objs = [obj for obj in detected_objects if obj["class_id"] == 0]
        vehicle_objs = [obj for obj in detected_objects if obj["class_id"] in [1, 2, 3, 5, 7]]  # bicycle, car, motorcycle, bus, truck
        traffic_light_objs = [obj for obj in detected_objects if obj["class_id"] == 9]

        # Create distribution maps for better spatial understanding
        # (helper defined elsewhere in this class).
        regions_distribution = self._create_distribution_map(detected_objects)

        # Analyze pedestrian crossing patterns
        crossing_zones = self._analyze_crossing_patterns(pedestrian_objs, traffic_light_objs, regions_distribution)
        zones.update(crossing_zones)

        # Analyze vehicle traffic zones with directional awareness
        traffic_zones = self._analyze_traffic_zones(vehicle_objs, regions_distribution)
        zones.update(traffic_zones)

        # Identify traffic control zones based on signal placement
        if traffic_light_objs:
            # Group traffic lights by region for better organization
            signal_regions = {}
            for obj in traffic_light_objs:
                region = obj["region"]
                if region not in signal_regions:
                    signal_regions[region] = []
                signal_regions[region].append(obj)

            # Create traffic control zones for each region with signals
            for idx, (region, signals) in enumerate(signal_regions.items()):
                # Check if this region has a directional name (e.g. "north").
                direction = self._get_directional_description(region)

                zones[f"traffic_control_zone_{idx+1}"] = {
                    "region": region,
                    "objects": ["traffic light"] * len(signals),
                    "description": f"Traffic control area with {len(signals)} traffic signals" +
                                  (f" in {direction} area" if direction else "")
                }

        return zones
534
+
535
def _analyze_crossing_patterns(self, pedestrians: List[Dict], traffic_lights: List[Dict],
                            region_distribution: Dict) -> Dict:
    """
    Analyze pedestrian clustering to identify likely crosswalk zones.

    Args:
        pedestrians: Detected pedestrian objects (each a dict with "region").
        traffic_lights: Detected traffic light objects.
        region_distribution: Distribution of objects by region (unused here,
            kept for interface compatibility).

    Returns:
        Dict: Mapping of zone name -> zone info for up to two crossing zones.
    """
    zones = {}
    if not pedestrians:
        return zones

    # Bucket pedestrians by the grid region they occupy
    by_region = {}
    for ped in pedestrians:
        by_region.setdefault(ped["region"], []).append(ped)

    # The busiest regions are the likeliest crossing areas
    ranked = sorted(by_region.items(), key=lambda item: len(item[1]), reverse=True)

    # Only the two most populated regions become crossing zones
    for idx, (region, group) in enumerate(ranked[:2]):
        # A traffic light in the same region strongly suggests a crosswalk
        signal_nearby = any(light["region"] == region for light in traffic_lights)

        direction = self._get_directional_description(region)

        # Assemble the human-readable description piece by piece
        description = f"Pedestrian crossing area with {len(group)} "
        description += "person" if len(group) == 1 else "people"
        if direction:
            description += f" in {direction} direction"
        if signal_nearby:
            description += " near traffic signals"

        zones[f"crossing_zone_{idx+1}"] = {
            "region": region,
            "objects": ["pedestrian"] * len(group),
            "description": description
        }

    return zones
def _analyze_traffic_zones(self, vehicles: List[Dict], region_distribution: Dict) -> Dict:
    """
    Analyze vehicle distribution to identify traffic zones with directional awareness.

    Args:
        vehicles: Detected vehicle objects (each a dict with "region"/"class_name").
        region_distribution: Distribution of objects by region (unused here,
            kept for interface compatibility).

    Returns:
        Dict: Main (and optionally secondary) vehicle traffic zones.
    """
    zones = {}
    if not vehicles:
        return zones

    # Bucket vehicles by the grid region they occupy
    grouped = {}
    for vehicle in vehicles:
        grouped.setdefault(vehicle["region"], []).append(vehicle)

    # Primary zone: the region holding the most vehicles
    primary_region, primary_group = max(grouped.items(),
                                        key=lambda item: len(item[1]),
                                        default=(None, []))
    if primary_region is not None:
        type_names = [v["class_name"] for v in primary_group]
        distinct = list(set(type_names))
        direction = self._get_directional_description(primary_region)
        suffix = f" in {direction} area" if direction else ""
        zones["vehicle_zone"] = {
            "region": primary_region,
            "objects": type_names,
            "description": f"Vehicle traffic area with {', '.join(distinct[:3])}" + suffix
        }

    # Secondary zone: second-busiest region, when vehicles span several regions
    if len(grouped) > 1:
        ranked = sorted(grouped.items(), key=lambda item: len(item[1]), reverse=True)
        if len(ranked) > 1:
            second_region, second_group = ranked[1]
            direction = self._get_directional_description(second_region)
            type_names = [v["class_name"] for v in second_group]
            distinct = list(set(type_names))
            suffix = f" in {direction} direction" if direction else ""
            zones["secondary_vehicle_zone"] = {
                "region": second_region,
                "objects": type_names,
                "description": f"Secondary traffic area with {', '.join(distinct[:2])}" + suffix
            }

    return zones
+ def _get_directional_description(self, region: str) -> str:
654
+ """
655
+ Convert region name to a directional description.
656
+
657
+ Args:
658
+ region: Region name from the grid
659
+
660
+ Returns:
661
+ str: Directional description
662
+ """
663
+ if "top" in region and "left" in region:
664
+ return "northwest"
665
+ elif "top" in region and "right" in region:
666
+ return "northeast"
667
+ elif "bottom" in region and "left" in region:
668
+ return "southwest"
669
+ elif "bottom" in region and "right" in region:
670
+ return "southeast"
671
+ elif "top" in region:
672
+ return "north"
673
+ elif "bottom" in region:
674
+ return "south"
675
+ elif "left" in region:
676
+ return "west"
677
+ elif "right" in region:
678
+ return "east"
679
+ else:
680
+ return "central"
681
+
682
+ def _create_distribution_map(self, detected_objects: List[Dict]) -> Dict:
683
+ """
684
+ Create a distribution map of objects across regions for spatial analysis.
685
+
686
+ Args:
687
+ detected_objects: List of detected objects
688
+
689
+ Returns:
690
+ Dict: Distribution map of objects by region and class
691
+ """
692
+ distribution = {}
693
+
694
+ # Initialize all regions
695
+ for region in self.regions.keys():
696
+ distribution[region] = {
697
+ "total": 0,
698
+ "objects": {},
699
+ "density": 0
700
+ }
701
+
702
+ # Populate the distribution
703
+ for obj in detected_objects:
704
+ region = obj["region"]
705
+ class_id = obj["class_id"]
706
+ class_name = obj["class_name"]
707
+
708
+ distribution[region]["total"] += 1
709
+
710
+ if class_id not in distribution[region]["objects"]:
711
+ distribution[region]["objects"][class_id] = {
712
+ "name": class_name,
713
+ "count": 0,
714
+ "positions": []
715
+ }
716
+
717
+ distribution[region]["objects"][class_id]["count"] += 1
718
+
719
+ # Store position for spatial relationship analysis
720
+ if "normalized_center" in obj:
721
+ distribution[region]["objects"][class_id]["positions"].append(obj["normalized_center"])
722
+
723
+ # Calculate object density for each region
724
+ for region, data in distribution.items():
725
+ # Assuming all regions are equal size in the grid
726
+ data["density"] = data["total"] / 1
727
+
728
+ return distribution
729
+
730
+ def _identify_asian_cultural_zones(self, category_regions: Dict, detected_objects: List[Dict], scene_type: str) -> Dict:
731
+ """
732
+ Identify functional zones for scenes with Asian cultural context.
733
+
734
+ Args:
735
+ category_regions: Objects grouped by category and region
736
+ detected_objects: List of detected objects
737
+ scene_type: Specific scene type
738
+
739
+ Returns:
740
+ Dict: Asian cultural functional zones
741
+ """
742
+ zones = {}
743
+
744
+ # Identify storefront zone
745
+ storefront_items = []
746
+ storefront_regions = {}
747
+
748
+ # Since storefronts aren't directly detectable, infer from context
749
+ # For example, look for regions with signs, people, and smaller objects
750
+ sign_regions = set()
751
+ for obj in detected_objects:
752
+ if obj["class_id"] == 0: # Person
753
+ region = obj["region"]
754
+ if region not in storefront_regions:
755
+ storefront_regions[region] = []
756
+ storefront_regions[region].append(obj)
757
+
758
+ # Add regions with people as potential storefront areas
759
+ sign_regions.add(region)
760
+
761
+ # Use the areas with most people as storefront zones
762
+ if storefront_regions:
763
+ main_storefront_regions = sorted(storefront_regions.items(),
764
+ key=lambda x: len(x[1]),
765
+ reverse=True)[:2] # Top 2 regions
766
+
767
+ for idx, (region, objs) in enumerate(main_storefront_regions):
768
+ zones[f"commercial_zone_{idx+1}"] = {
769
+ "region": region,
770
+ "objects": [obj["class_name"] for obj in objs],
771
+ "description": f"Asian commercial storefront with pedestrian activity"
772
+ }
773
+
774
+ # Identify pedestrian pathway - enhanced to better detect linear pathways
775
+ pathway_items = []
776
+ pathway_regions = {}
777
+
778
+ # Extract people for pathway analysis
779
+ people_objs = [obj for obj in detected_objects if obj["class_id"] == 0]
780
+
781
+ # Analyze if people form a line (typical of shopping streets)
782
+ people_positions = [obj["normalized_center"] for obj in people_objs]
783
+
784
+ structured_path = False
785
+ if len(people_positions) >= 3:
786
+ # Check if people are arranged along a similar y-coordinate (horizontal path)
787
+ y_coords = [pos[1] for pos in people_positions]
788
+ y_mean = sum(y_coords) / len(y_coords)
789
+ y_variance = sum((y - y_mean)**2 for y in y_coords) / len(y_coords)
790
+
791
+ horizontal_path = y_variance < 0.05 # Low variance indicates horizontal alignment
792
+
793
+ # Check if people are arranged along a similar x-coordinate (vertical path)
794
+ x_coords = [pos[0] for pos in people_positions]
795
+ x_mean = sum(x_coords) / len(x_coords)
796
+ x_variance = sum((x - x_mean)**2 for x in x_coords) / len(x_coords)
797
+
798
+ vertical_path = x_variance < 0.05 # Low variance indicates vertical alignment
799
+
800
+ structured_path = horizontal_path or vertical_path
801
+ path_direction = "horizontal" if horizontal_path else "vertical" if vertical_path else "meandering"
802
+
803
+ # Collect pathway objects (people, bicycles, motorcycles in middle area)
804
+ for obj in detected_objects:
805
+ if obj["class_id"] in [0, 1, 3]: # Person, bicycle, motorcycle
806
+ y_pos = obj["normalized_center"][1]
807
+ # Group by vertical position (middle of image likely pathway)
808
+ if 0.25 <= y_pos <= 0.75:
809
+ region = obj["region"]
810
+ if region not in pathway_regions:
811
+ pathway_regions[region] = []
812
+ pathway_regions[region].append(obj)
813
+ pathway_items.append(obj["class_name"])
814
+
815
+ if pathway_items:
816
+ path_desc = "Pedestrian walkway with people moving through the commercial area"
817
+ if structured_path:
818
+ path_desc = f"{path_direction.capitalize()} pedestrian walkway with organized foot traffic"
819
+
820
+ zones["pedestrian_pathway"] = {
821
+ "region": "middle_center", # Assumption: pathway often in middle
822
+ "objects": list(set(pathway_items)),
823
+ "description": path_desc
824
+ }
825
+
826
+ # Identify vendor zone (small stalls/shops - inferred from context)
827
+ has_small_objects = any(obj["class_id"] in [24, 26, 39, 41] for obj in detected_objects) # bags, bottles, cups
828
+ has_people = any(obj["class_id"] == 0 for obj in detected_objects)
829
+
830
+ if has_small_objects and has_people:
831
+ # Likely vendor areas are where people and small objects cluster
832
+ small_obj_regions = {}
833
+
834
+ for obj in detected_objects:
835
+ if obj["class_id"] in [24, 26, 39, 41, 67]: # bags, bottles, cups, phones
836
+ region = obj["region"]
837
+ if region not in small_obj_regions:
838
+ small_obj_regions[region] = []
839
+ small_obj_regions[region].append(obj)
840
+
841
+ if small_obj_regions:
842
+ main_vendor_region = max(small_obj_regions.items(),
843
+ key=lambda x: len(x[1]),
844
+ default=(None, []))
845
+
846
+ if main_vendor_region[0] is not None:
847
+ vendor_items = [obj["class_name"] for obj in main_vendor_region[1]]
848
+ zones["vendor_zone"] = {
849
+ "region": main_vendor_region[0],
850
+ "objects": list(set(vendor_items)),
851
+ "description": "Vendor or market stall area with small merchandise"
852
+ }
853
+
854
+ # For night markets, identify illuminated zones
855
+ if scene_type == "asian_night_market":
856
+ # Night markets typically have bright spots for food stalls
857
+ # This would be enhanced with lighting analysis integration
858
+ zones["food_stall_zone"] = {
859
+ "region": "middle_center",
860
+ "objects": ["inferred food stalls"],
861
+ "description": "Food stall area typical of Asian night markets"
862
+ }
863
+
864
+ return zones
865
+
866
+ def _identify_upscale_dining_zones(self, category_regions: Dict, detected_objects: List[Dict]) -> Dict:
867
+ """
868
+ Identify functional zones for upscale dining settings.
869
+
870
+ Args:
871
+ category_regions: Objects grouped by category and region
872
+ detected_objects: List of detected objects
873
+
874
+ Returns:
875
+ Dict: Upscale dining functional zones
876
+ """
877
+ zones = {}
878
+
879
+ # Identify dining table zone
880
+ dining_items = []
881
+ dining_regions = {}
882
+
883
+ for obj in detected_objects:
884
+ if obj["class_id"] in [40, 41, 42, 43, 44, 45, 60]: # Wine glass, cup, fork, knife, spoon, bowl, table
885
+ region = obj["region"]
886
+ if region not in dining_regions:
887
+ dining_regions[region] = []
888
+ dining_regions[region].append(obj)
889
+ dining_items.append(obj["class_name"])
890
+
891
+ if dining_items:
892
+ main_dining_region = max(dining_regions.items(),
893
+ key=lambda x: len(x[1]),
894
+ default=(None, []))
895
+
896
+ if main_dining_region[0] is not None:
897
+ zones["formal_dining_zone"] = {
898
+ "region": main_dining_region[0],
899
+ "objects": list(set(dining_items)),
900
+ "description": f"Formal dining area with {', '.join(list(set(dining_items))[:3])}"
901
+ }
902
+
903
+ # Identify decorative zone with enhanced detection
904
+ decor_items = []
905
+ decor_regions = {}
906
+
907
+ # Look for decorative elements (vases, wine glasses, unused dishes)
908
+ for obj in detected_objects:
909
+ if obj["class_id"] in [75, 40]: # Vase, wine glass
910
+ region = obj["region"]
911
+ if region not in decor_regions:
912
+ decor_regions[region] = []
913
+ decor_regions[region].append(obj)
914
+ decor_items.append(obj["class_name"])
915
+
916
+ if decor_items:
917
+ main_decor_region = max(decor_regions.items(),
918
+ key=lambda x: len(x[1]),
919
+ default=(None, []))
920
+
921
+ if main_decor_region[0] is not None:
922
+ zones["decorative_zone"] = {
923
+ "region": main_decor_region[0],
924
+ "objects": list(set(decor_items)),
925
+ "description": f"Decorative area with {', '.join(list(set(decor_items)))}"
926
+ }
927
+
928
+ # Identify seating arrangement zone
929
+ chairs = [obj for obj in detected_objects if obj["class_id"] == 56] # chairs
930
+ if len(chairs) >= 2:
931
+ chair_regions = {}
932
+ for obj in chairs:
933
+ region = obj["region"]
934
+ if region not in chair_regions:
935
+ chair_regions[region] = []
936
+ chair_regions[region].append(obj)
937
+
938
+ if chair_regions:
939
+ main_seating_region = max(chair_regions.items(),
940
+ key=lambda x: len(x[1]),
941
+ default=(None, []))
942
+
943
+ if main_seating_region[0] is not None:
944
+ zones["dining_seating_zone"] = {
945
+ "region": main_seating_region[0],
946
+ "objects": ["chair"] * len(main_seating_region[1]),
947
+ "description": f"Formal dining seating arrangement with {len(main_seating_region[1])} chairs"
948
+ }
949
+
950
+ # Identify serving area (if different from dining area)
951
+ serving_items = []
952
+ serving_regions = {}
953
+
954
+ # Serving areas might have bottles, bowls, containers
955
+ for obj in detected_objects:
956
+ if obj["class_id"] in [39, 45]: # Bottle, bowl
957
+ # Check if it's in a different region from the main dining table
958
+ if "formal_dining_zone" in zones and obj["region"] != zones["formal_dining_zone"]["region"]:
959
+ region = obj["region"]
960
+ if region not in serving_regions:
961
+ serving_regions[region] = []
962
+ serving_regions[region].append(obj)
963
+ serving_items.append(obj["class_name"])
964
+
965
+ if serving_items:
966
+ main_serving_region = max(serving_regions.items(),
967
+ key=lambda x: len(x[1]),
968
+ default=(None, []))
969
+
970
+ if main_serving_region[0] is not None:
971
+ zones["serving_zone"] = {
972
+ "region": main_serving_region[0],
973
+ "objects": list(set(serving_items)),
974
+ "description": f"Serving or sideboard area with {', '.join(list(set(serving_items)))}"
975
+ }
976
+
977
+ return zones
978
+
979
+ def _identify_financial_district_zones(self, category_regions: Dict, detected_objects: List[Dict]) -> Dict:
980
+ """
981
+ Identify functional zones for financial district scenes.
982
+
983
+ Args:
984
+ category_regions: Objects grouped by category and region
985
+ detected_objects: List of detected objects
986
+
987
+ Returns:
988
+ Dict: Financial district functional zones
989
+ """
990
+ zones = {}
991
+
992
+ # Identify traffic zone
993
+ traffic_items = []
994
+ traffic_regions = {}
995
+
996
+ for obj in detected_objects:
997
+ if obj["class_id"] in [1, 2, 3, 5, 6, 7, 9]: # Various vehicles and traffic lights
998
+ region = obj["region"]
999
+ if region not in traffic_regions:
1000
+ traffic_regions[region] = []
1001
+ traffic_regions[region].append(obj)
1002
+ traffic_items.append(obj["class_name"])
1003
+
1004
+ if traffic_items:
1005
+ main_traffic_region = max(traffic_regions.items(),
1006
+ key=lambda x: len(x[1]),
1007
+ default=(None, []))
1008
+
1009
+ if main_traffic_region[0] is not None:
1010
+ zones["traffic_zone"] = {
1011
+ "region": main_traffic_region[0],
1012
+ "objects": list(set(traffic_items)),
1013
+ "description": f"Urban traffic area with {', '.join(list(set(traffic_items))[:3])}"
1014
+ }
1015
+
1016
+ # Building zones on the sides (inferred from scene context)
1017
+ # Enhanced to check if there are actual regions that might contain buildings
1018
+ # Check for regions without vehicles or pedestrians - likely building areas
1019
+ left_side_regions = ["top_left", "middle_left", "bottom_left"]
1020
+ right_side_regions = ["top_right", "middle_right", "bottom_right"]
1021
+
1022
+ # Check left side
1023
+ left_building_evidence = True
1024
+ for region in left_side_regions:
1025
+ # If many vehicles or people in this region, less likely to be buildings
1026
+ vehicle_in_region = any(obj["region"] == region and obj["class_id"] in [1, 2, 3, 5, 7]
1027
+ for obj in detected_objects)
1028
+ people_in_region = any(obj["region"] == region and obj["class_id"] == 0
1029
+ for obj in detected_objects)
1030
+
1031
+ if vehicle_in_region or people_in_region:
1032
+ left_building_evidence = False
1033
+ break
1034
+
1035
+ # Check right side
1036
+ right_building_evidence = True
1037
+ for region in right_side_regions:
1038
+ # If many vehicles or people in this region, less likely to be buildings
1039
+ vehicle_in_region = any(obj["region"] == region and obj["class_id"] in [1, 2, 3, 5, 7]
1040
+ for obj in detected_objects)
1041
+ people_in_region = any(obj["region"] == region and obj["class_id"] == 0
1042
+ for obj in detected_objects)
1043
+
1044
+ if vehicle_in_region or people_in_region:
1045
+ right_building_evidence = False
1046
+ break
1047
+
1048
+ # Add building zones if evidence supports them
1049
+ if left_building_evidence:
1050
+ zones["building_zone_left"] = {
1051
+ "region": "middle_left",
1052
+ "objects": ["building"], # Inferred
1053
+ "description": "Tall buildings line the left side of the street"
1054
+ }
1055
+
1056
+ if right_building_evidence:
1057
+ zones["building_zone_right"] = {
1058
+ "region": "middle_right",
1059
+ "objects": ["building"], # Inferred
1060
+ "description": "Tall buildings line the right side of the street"
1061
+ }
1062
+
1063
+ # Identify pedestrian zone if people are present
1064
+ people_objs = [obj for obj in detected_objects if obj["class_id"] == 0]
1065
+ if people_objs:
1066
+ people_regions = {}
1067
+ for obj in people_objs:
1068
+ region = obj["region"]
1069
+ if region not in people_regions:
1070
+ people_regions[region] = []
1071
+ people_regions[region].append(obj)
1072
+
1073
+ if people_regions:
1074
+ main_pedestrian_region = max(people_regions.items(),
1075
+ key=lambda x: len(x[1]),
1076
+ default=(None, []))
1077
+
1078
+ if main_pedestrian_region[0] is not None:
1079
+ zones["pedestrian_zone"] = {
1080
+ "region": main_pedestrian_region[0],
1081
+ "objects": ["person"] * len(main_pedestrian_region[1]),
1082
+ "description": f"Pedestrian area with {len(main_pedestrian_region[1])} people navigating the financial district"
1083
+ }
1084
+
1085
+ return zones
1086
+
1087
+ def _identify_aerial_view_zones(self, category_regions: Dict, detected_objects: List[Dict], scene_type: str) -> Dict:
1088
+ """
1089
+ Identify functional zones for scenes viewed from an aerial perspective.
1090
+
1091
+ Args:
1092
+ category_regions: Objects grouped by category and region
1093
+ detected_objects: List of detected objects
1094
+ scene_type: Specific scene type
1095
+
1096
+ Returns:
1097
+ Dict: Aerial view functional zones
1098
+ """
1099
+ zones = {}
1100
+
1101
+ # For aerial views, we focus on patterns and flows rather than specific zones
1102
+
1103
+ # Identify pedestrian patterns
1104
+ people_objs = [obj for obj in detected_objects if obj["class_id"] == 0]
1105
+ if people_objs:
1106
+ # Convert positions to arrays for pattern analysis
1107
+ positions = np.array([obj["normalized_center"] for obj in people_objs])
1108
+
1109
+ if len(positions) >= 3:
1110
+ # Calculate distribution metrics
1111
+ x_coords = positions[:, 0]
1112
+ y_coords = positions[:, 1]
1113
+
1114
+ x_mean = np.mean(x_coords)
1115
+ y_mean = np.mean(y_coords)
1116
+ x_std = np.std(x_coords)
1117
+ y_std = np.std(y_coords)
1118
+
1119
+ # Determine if people are organized in a linear pattern
1120
+ if x_std < 0.1 or y_std < 0.1:
1121
+ # Linear distribution along one axis
1122
+ pattern_direction = "vertical" if x_std < y_std else "horizontal"
1123
+
1124
+ zones["pedestrian_pattern"] = {
1125
+ "region": "central",
1126
+ "objects": ["person"] * len(people_objs),
1127
+ "description": f"Aerial view shows a {pattern_direction} pedestrian movement pattern"
1128
+ }
1129
+ else:
1130
+ # More dispersed pattern
1131
+ zones["pedestrian_distribution"] = {
1132
+ "region": "wide",
1133
+ "objects": ["person"] * len(people_objs),
1134
+ "description": f"Aerial view shows pedestrians distributed across the area"
1135
+ }
1136
+
1137
+ # Identify vehicle patterns for traffic analysis
1138
+ vehicle_objs = [obj for obj in detected_objects if obj["class_id"] in [1, 2, 3, 5, 6, 7]]
1139
+ if vehicle_objs:
1140
+ # Convert positions to arrays for pattern analysis
1141
+ positions = np.array([obj["normalized_center"] for obj in vehicle_objs])
1142
+
1143
+ if len(positions) >= 2:
1144
+ # Calculate distribution metrics
1145
+ x_coords = positions[:, 0]
1146
+ y_coords = positions[:, 1]
1147
+
1148
+ x_mean = np.mean(x_coords)
1149
+ y_mean = np.mean(y_coords)
1150
+ x_std = np.std(x_coords)
1151
+ y_std = np.std(y_coords)
1152
+
1153
+ # Determine if vehicles are organized in lanes
1154
+ if x_std < y_std * 0.5:
1155
+ # Vehicles aligned vertically - indicates north-south traffic
1156
+ zones["vertical_traffic_flow"] = {
1157
+ "region": "central_vertical",
1158
+ "objects": [obj["class_name"] for obj in vehicle_objs[:5]],
1159
+ "description": "North-south traffic flow visible from aerial view"
1160
+ }
1161
+ elif y_std < x_std * 0.5:
1162
+ # Vehicles aligned horizontally - indicates east-west traffic
1163
+ zones["horizontal_traffic_flow"] = {
1164
+ "region": "central_horizontal",
1165
+ "objects": [obj["class_name"] for obj in vehicle_objs[:5]],
1166
+ "description": "East-west traffic flow visible from aerial view"
1167
+ }
1168
+ else:
1169
+ # Vehicles in multiple directions - indicates intersection
1170
+ zones["intersection_traffic"] = {
1171
+ "region": "central",
1172
+ "objects": [obj["class_name"] for obj in vehicle_objs[:5]],
1173
+ "description": "Multi-directional traffic at intersection visible from aerial view"
1174
+ }
1175
+
1176
+ # For intersection specific aerial views, identify crossing patterns
1177
+ if "intersection" in scene_type:
1178
+ # Check for traffic signals
1179
+ traffic_light_objs = [obj for obj in detected_objects if obj["class_id"] == 9]
1180
+ if traffic_light_objs:
1181
+ zones["traffic_control_pattern"] = {
1182
+ "region": "intersection",
1183
+ "objects": ["traffic light"] * len(traffic_light_objs),
1184
+ "description": f"Intersection traffic control with {len(traffic_light_objs)} signals visible from above"
1185
+ }
1186
+
1187
+ # Crosswalks are inferred from context in aerial views
1188
+ zones["crossing_pattern"] = {
1189
+ "region": "central",
1190
+ "objects": ["inferred crosswalk"],
1191
+ "description": "Crossing pattern visible from aerial perspective"
1192
+ }
1193
+
1194
+ # For plaza aerial views, identify gathering patterns
1195
+ if "plaza" in scene_type:
1196
+ # Plazas typically have central open area with people
1197
+ if people_objs:
1198
+ # Check if people are clustered in central region
1199
+ central_people = [obj for obj in people_objs
1200
+ if "middle" in obj["region"]]
1201
+
1202
+ if central_people:
1203
+ zones["central_gathering"] = {
1204
+ "region": "middle_center",
1205
+ "objects": ["person"] * len(central_people),
1206
+ "description": f"Central plaza gathering area with {len(central_people)} people viewed from above"
1207
+ }
1208
+
1209
+ return zones
1210
+
1211
+ def _identify_outdoor_general_zones(self, category_regions: Dict, detected_objects: List[Dict], scene_type: str) -> Dict:
1212
+ """
1213
+ Identify functional zones for general outdoor scenes.
1214
+
1215
+ Args:
1216
+ category_regions: Objects grouped by category and region
1217
+ detected_objects: List of detected objects
1218
+ scene_type: Specific outdoor scene type
1219
+
1220
+ Returns:
1221
+ Dict: Outdoor functional zones
1222
+ """
1223
+ zones = {}
1224
+
1225
+ # Identify pedestrian zones
1226
+ people_objs = [obj for obj in detected_objects if obj["class_id"] == 0]
1227
+ if people_objs:
1228
+ people_regions = {}
1229
+ for obj in people_objs:
1230
+ region = obj["region"]
1231
+ if region not in people_regions:
1232
+ people_regions[region] = []
1233
+ people_regions[region].append(obj)
1234
+
1235
+ if people_regions:
1236
+ # Find main pedestrian areas
1237
+ main_people_regions = sorted(people_regions.items(),
1238
+ key=lambda x: len(x[1]),
1239
+ reverse=True)[:2] # Top 2 regions
1240
+
1241
+ for idx, (region, objs) in enumerate(main_people_regions):
1242
+ if len(objs) > 0:
1243
+ zones[f"pedestrian_zone_{idx+1}"] = {
1244
+ "region": region,
1245
+ "objects": ["person"] * len(objs),
1246
+ "description": f"Pedestrian area with {len(objs)} {'people' if len(objs) > 1 else 'person'}"
1247
+ }
1248
+
1249
+ # Identify vehicle zones for streets and parking lots
1250
+ vehicle_objs = [obj for obj in detected_objects if obj["class_id"] in [1, 2, 3, 5, 6, 7]]
1251
+ if vehicle_objs:
1252
+ vehicle_regions = {}
1253
+ for obj in vehicle_objs:
1254
+ region = obj["region"]
1255
+ if region not in vehicle_regions:
1256
+ vehicle_regions[region] = []
1257
+ vehicle_regions[region].append(obj)
1258
+
1259
+ if vehicle_regions:
1260
+ main_vehicle_region = max(vehicle_regions.items(),
1261
+ key=lambda x: len(x[1]),
1262
+ default=(None, []))
1263
+
1264
+ if main_vehicle_region[0] is not None:
1265
+ vehicle_types = [obj["class_name"] for obj in main_vehicle_region[1]]
1266
+ zones["vehicle_zone"] = {
1267
+ "region": main_vehicle_region[0],
1268
+ "objects": vehicle_types,
1269
+ "description": f"Traffic area with {', '.join(list(set(vehicle_types))[:3])}"
1270
+ }
1271
+
1272
+ # For park areas, identify recreational zones
1273
+ if scene_type == "park_area":
1274
+ # Look for recreational objects (sports balls, kites, etc.)
1275
+ rec_items = []
1276
+ rec_regions = {}
1277
+
1278
+ for obj in detected_objects:
1279
+ if obj["class_id"] in [32, 33, 34, 35, 38]: # sports ball, kite, baseball bat, glove, tennis racket
1280
+ region = obj["region"]
1281
+ if region not in rec_regions:
1282
+ rec_regions[region] = []
1283
+ rec_regions[region].append(obj)
1284
+ rec_items.append(obj["class_name"])
1285
+
1286
+ if rec_items:
1287
+ main_rec_region = max(rec_regions.items(),
1288
+ key=lambda x: len(x[1]),
1289
+ default=(None, []))
1290
+
1291
+ if main_rec_region[0] is not None:
1292
+ zones["recreational_zone"] = {
1293
+ "region": main_rec_region[0],
1294
+ "objects": list(set(rec_items)),
1295
+ "description": f"Recreational area with {', '.join(list(set(rec_items)))}"
1296
+ }
1297
+
1298
+ # For parking lots, identify parking zones
1299
+ if scene_type == "parking_lot":
1300
+ # Look for parked cars with consistent spacing
1301
+ car_objs = [obj for obj in detected_objects if obj["class_id"] == 2] # cars
1302
+
1303
+ if len(car_objs) >= 3:
1304
+ # Check if cars are arranged in patterns (simplified)
1305
+ car_positions = [obj["normalized_center"] for obj in car_objs]
1306
+
1307
+ # Check for row patterns by analyzing vertical positions
1308
+ y_coords = [pos[1] for pos in car_positions]
1309
+ y_clusters = {}
1310
+
1311
+ # Simplified clustering - group cars by similar y-coordinates
1312
+ for i, y in enumerate(y_coords):
1313
+ assigned = False
1314
+ for cluster_y in y_clusters.keys():
1315
+ if abs(y - cluster_y) < 0.1: # Within 10% of image height
1316
+ y_clusters[cluster_y].append(i)
1317
+ assigned = True
1318
+ break
1319
+
1320
+ if not assigned:
1321
+ y_clusters[y] = [i]
1322
+
1323
+ # If we have row patterns
1324
+ if max(len(indices) for indices in y_clusters.values()) >= 2:
1325
+ zones["parking_row"] = {
1326
+ "region": "central",
1327
+ "objects": ["car"] * len(car_objs),
1328
+ "description": f"Organized parking area with vehicles arranged in rows"
1329
+ }
1330
+ else:
1331
+ zones["parking_area"] = {
1332
+ "region": "wide",
1333
+ "objects": ["car"] * len(car_objs),
1334
+ "description": f"Parking area with {len(car_objs)} vehicles"
1335
+ }
1336
+
1337
+ return zones
1338
+
1339
+ def _identify_default_zones(self, category_regions: Dict, detected_objects: List[Dict]) -> Dict:
1340
+ """
1341
+ Identify general functional zones when no specific scene type is matched.
1342
+
1343
+ Args:
1344
+ category_regions: Objects grouped by category and region
1345
+ detected_objects: List of detected objects
1346
+
1347
+ Returns:
1348
+ Dict: Default functional zones
1349
+ """
1350
+ zones = {}
1351
+
1352
+ # Group objects by category and find main concentrations
1353
+ for category, regions in category_regions.items():
1354
+ if not regions:
1355
+ continue
1356
+
1357
+ # Find region with most objects in this category
1358
+ main_region = max(regions.items(),
1359
+ key=lambda x: len(x[1]),
1360
+ default=(None, []))
1361
+
1362
+ if main_region[0] is None or len(main_region[1]) < 2:
1363
+ continue
1364
+
1365
+ # Create zone based on object category
1366
+ zone_objects = [obj["class_name"] for obj in main_region[1]]
1367
+
1368
+ # Skip if too few objects
1369
+ if len(zone_objects) < 2:
1370
+ continue
1371
+
1372
+ # Create appropriate zone name and description based on category
1373
+ if category == "furniture":
1374
+ zones["furniture_zone"] = {
1375
+ "region": main_region[0],
1376
+ "objects": zone_objects,
1377
+ "description": f"Area with furniture including {', '.join(zone_objects[:3])}"
1378
+ }
1379
+ elif category == "electronics":
1380
+ zones["electronics_zone"] = {
1381
+ "region": main_region[0],
1382
+ "objects": zone_objects,
1383
+ "description": f"Area with electronic devices including {', '.join(zone_objects[:3])}"
1384
+ }
1385
+ elif category == "kitchen_items":
1386
+ zones["dining_zone"] = {
1387
+ "region": main_region[0],
1388
+ "objects": zone_objects,
1389
+ "description": f"Dining or food area with {', '.join(zone_objects[:3])}"
1390
+ }
1391
+ elif category == "vehicles":
1392
+ zones["vehicle_zone"] = {
1393
+ "region": main_region[0],
1394
+ "objects": zone_objects,
1395
+ "description": f"Area with vehicles including {', '.join(zone_objects[:3])}"
1396
+ }
1397
+ elif category == "personal_items":
1398
+ zones["personal_items_zone"] = {
1399
+ "region": main_region[0],
1400
+ "objects": zone_objects,
1401
+ "description": f"Area with personal items including {', '.join(zone_objects[:3])}"
1402
+ }
1403
+
1404
+ # Check for people groups
1405
+ people_objs = [obj for obj in detected_objects if obj["class_id"] == 0]
1406
+ if len(people_objs) >= 2:
1407
+ people_regions = {}
1408
+ for obj in people_objs:
1409
+ region = obj["region"]
1410
+ if region not in people_regions:
1411
+ people_regions[region] = []
1412
+ people_regions[region].append(obj)
1413
+
1414
+ if people_regions:
1415
+ main_people_region = max(people_regions.items(),
1416
+ key=lambda x: len(x[1]),
1417
+ default=(None, []))
1418
+
1419
+ if main_people_region[0] is not None:
1420
+ zones["people_zone"] = {
1421
+ "region": main_people_region[0],
1422
+ "objects": ["person"] * len(main_people_region[1]),
1423
+ "description": f"Area with {len(main_people_region[1])} people"
1424
+ }
1425
+
1426
+ return zones
1427
+
1428
+ def _find_main_region(self, region_objects_dict: Dict) -> str:
1429
+ """Find the main region with the most objects"""
1430
+ if not region_objects_dict:
1431
+ return "unknown"
1432
+
1433
+ return max(region_objects_dict.items(),
1434
+ key=lambda x: len(x[1]),
1435
+ default=("unknown", []))[0]
1436
+
1437
+ def _find_main_region(self, region_objects_dict: Dict) -> str:
1438
+ """Find the main region with the most objects"""
1439
+ if not region_objects_dict:
1440
+ return "unknown"
1441
+
1442
+ return max(region_objects_dict.items(),
1443
+ key=lambda x: len(x[1]),
1444
+ default=("unknown", []))[0]
street_04.jpg ADDED

Git LFS Details

  • SHA256: 1eb06464cdb80a96171d511f985b57b79c32df0f6cae38dfbc08e5cd4fb0acec
  • Pointer size: 132 Bytes
  • Size of remote file: 5.61 MB
style.py CHANGED
@@ -1,7 +1,9 @@
 
1
  class Style:
 
2
  @staticmethod
3
  def get_css():
4
- """Return the application's CSS styles with improved aesthetics"""
5
  css = """
6
  /* Base styles and typography */
7
  body {
@@ -13,20 +15,20 @@ class Style:
13
  justify-content: center;
14
  min-height: 100vh;
15
  }
16
-
17
  /* Typography improvements */
18
  h1, h2, h3, h4, h5, h6, p, span, div, label, button {
19
  font-family: Arial, sans-serif;
20
  }
21
-
22
  /* Container styling */
23
  .gradio-container {
24
  max-width: 1200px !important;
25
- margin: 0 auto;
26
  padding: 1rem;
27
  width: 100%;
28
  }
29
-
30
  /* Header area styling with gradient background */
31
  .app-header {
32
  text-align: center;
@@ -37,7 +39,7 @@ class Style:
37
  box-shadow: 0 2px 10px rgba(0, 0, 0, 0.05);
38
  width: 100%;
39
  }
40
-
41
  .app-title {
42
  color: #2D3748;
43
  font-size: 2.5rem;
@@ -47,21 +49,21 @@ class Style:
47
  -webkit-text-fill-color: transparent;
48
  font-weight: bold;
49
  }
50
-
51
  .app-subtitle {
52
  color: #4A5568;
53
  font-size: 1.2rem;
54
  font-weight: normal;
55
  margin-top: 0.25rem;
56
  }
57
-
58
  .app-divider {
59
  width: 80px;
60
  height: 3px;
61
  background: linear-gradient(90deg, #38b2ac, #4299e1);
62
  margin: 1rem auto;
63
  }
64
-
65
  /* Panel styling - gradient background */
66
  .input-panel, .output-panel {
67
  background: white;
@@ -70,20 +72,20 @@ class Style:
70
  box-shadow: 0 2px 8px rgba(0, 0, 0, 0.08);
71
  margin: 0 auto 1rem auto;
72
  }
73
-
74
- /* Section heading styling with gradient background */
75
- .section-heading {
76
- font-size: 1.25rem;
77
- font-weight: 600;
78
- color: #2D3748;
79
- margin-bottom: 1rem;
80
- margin-top: 0.5rem;
81
- text-align: center;
82
- padding: 0.8rem;
83
- background: linear-gradient(to right, #e6f3fc, #f0f9ff);
84
- border-radius: 8px;
85
  }
86
-
 
 
 
 
 
87
  /* How-to-use section with gradient background */
88
  .how-to-use {
89
  background: linear-gradient(135deg, #f8fafc, #e8f4fd);
@@ -93,7 +95,7 @@ class Style:
93
  box-shadow: 0 2px 8px rgba(0, 0, 0, 0.05);
94
  color: #2d3748;
95
  }
96
-
97
  /* Detection button styling */
98
  .detect-btn {
99
  background: linear-gradient(90deg, #38b2ac, #4299e1) !important;
@@ -108,41 +110,40 @@ class Style:
108
  margin: 1rem auto !important;
109
  font-family: Arial, sans-serif !important;
110
  }
111
-
112
  .detect-btn:hover {
113
  transform: translateY(-2px) !important;
114
  box-shadow: 0 4px 8px rgba(0, 0, 0, 0.2) !important;
115
  }
116
-
117
  .detect-btn:active {
118
  transform: translateY(1px) !important;
119
  box-shadow: 0 2px 4px rgba(0, 0, 0, 0.2) !important;
120
  }
121
-
122
  /* JSON display improvements */
123
- .json-display pre {
124
- background: #f8fafc;
125
- border-radius: 6px;
126
- padding: 1rem;
127
- font-family: 'Consolas', 'Monaco', monospace;
128
- white-space: pre-wrap;
129
- max-height: 500px;
130
- overflow-y: auto;
131
- box-shadow: inset 0 0 4px rgba(0, 0, 0, 0.1);
132
  }
133
-
134
  .json-key {
135
  color: #e53e3e;
136
  }
137
-
138
  .json-value {
139
  color: #2b6cb0;
140
  }
141
-
142
  .json-string {
143
  color: #38a169;
144
  }
145
-
146
  /* Chart/plot styling improvements */
147
  .plot-container {
148
  background: white;
@@ -150,32 +151,39 @@ class Style:
150
  padding: 0.5rem;
151
  box-shadow: 0 2px 6px rgba(0, 0, 0, 0.05);
152
  }
153
-
154
  /* Larger font for plots */
155
  .plot-container text {
156
  font-family: Arial, sans-serif !important;
157
  font-size: 14px !important;
158
  }
159
-
160
  /* Title styling for charts */
161
  .plot-title {
162
  font-family: Arial, sans-serif !important;
163
  font-size: 16px !important;
164
  font-weight: bold !important;
165
  }
166
-
167
  /* Tab styling with subtle gradient */
168
  .tabs {
169
  width: 100%;
170
  display: flex;
171
  justify-content: center;
172
  }
173
-
174
  .tabs > div:first-child {
175
  background: linear-gradient(to right, #f8fafc, #e8f4fd) !important;
176
  border-radius: 8px 8px 0 0;
177
  }
178
-
 
 
 
 
 
 
 
179
  /* Footer styling with gradient background */
180
  .footer {
181
  text-align: center;
@@ -188,7 +196,7 @@ class Style:
188
  box-shadow: 0 2px 8px rgba(0, 0, 0, 0.05);
189
  width: 100%;
190
  }
191
-
192
  /* Ensure centering works for all elements */
193
  .container, .gr-container, .gr-row, .gr-col {
194
  display: flex;
@@ -197,86 +205,175 @@ class Style:
197
  justify-content: center;
198
  width: 100%;
199
  }
200
-
201
- /* 結果文本框的改進樣式 */
202
- #detection-details, .wide-result-text {
203
  width: 100% !important;
204
  max-width: 100% !important;
 
205
  box-sizing: border-box !important;
206
  }
207
-
208
- .wide-result-text textarea {
 
209
  width: 100% !important;
210
- min-width: 600px !important;
 
 
 
211
  font-family: 'Arial', sans-serif !important;
212
  font-size: 14px !important;
213
- line-height: 1.5 !important; /* 減少行間距 */
 
 
 
 
 
 
 
 
 
 
214
  padding: 16px !important;
 
215
  white-space: pre-wrap !important;
216
- background-color: #f8f9fa !important;
217
  border-radius: 8px !important;
218
- min-height: 300px !important;
219
- resize: none !important;
220
  overflow-y: auto !important;
221
  border: 1px solid #e2e8f0 !important;
 
222
  display: block !important;
 
 
 
223
  }
224
-
225
- /* 結果詳情面板樣式 - 加入漸層背景 */
226
- .result-details-box {
227
- width: 100% !important;
228
- margin-top: 1.5rem;
229
- background: linear-gradient(135deg, #f8fafc, #e8f4fd);
230
- border-radius: 10px;
231
- padding: 1rem;
232
- box-shadow: 0 2px 8px rgba(0, 0, 0, 0.05);
233
- }
234
-
235
- /* 確保結果詳情面板內的元素寬度可以適應面板 */
236
- .result-details-box > * {
237
  width: 100% !important;
238
  max-width: 100% !important;
 
 
 
 
 
 
 
 
 
 
 
 
 
239
  }
240
-
241
- /* 確保文本區域不會被限制寬度 */
242
- .result-details-box .gr-text-input {
243
  width: 100% !important;
244
- max-width: none !important;
 
 
 
 
 
245
  }
246
-
247
- /* 輸出面板內容的布局調整 */
248
- .output-panel {
249
- display: flex;
250
- flex-direction: column;
251
- width: 100%;
252
  padding: 0 !important;
 
253
  }
254
-
255
- /* 確保結果面板內的元素寬度可以適應面板 */
256
- .output-panel > * {
257
- width: 100%;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
258
  }
259
-
260
- /* 改善統計面板列佈局 */
261
  .plot-column, .stats-column {
262
  display: flex;
263
  flex-direction: column;
264
  padding: 1rem;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
265
  }
266
-
267
- /* Responsive adjustments */
 
 
 
 
 
 
 
268
  @media (max-width: 768px) {
269
  .app-title {
270
  font-size: 2rem;
271
  }
272
-
273
  .app-subtitle {
274
  font-size: 1rem;
275
  }
276
-
277
  .gradio-container {
278
  padding: 0.5rem;
279
  }
 
 
 
 
 
280
  }
281
  """
282
  return css
 
1
+
2
  class Style:
3
+
4
  @staticmethod
5
  def get_css():
6
+
7
  css = """
8
  /* Base styles and typography */
9
  body {
 
15
  justify-content: center;
16
  min-height: 100vh;
17
  }
18
+
19
  /* Typography improvements */
20
  h1, h2, h3, h4, h5, h6, p, span, div, label, button {
21
  font-family: Arial, sans-serif;
22
  }
23
+
24
  /* Container styling */
25
  .gradio-container {
26
  max-width: 1200px !important;
27
+ margin: auto !important;
28
  padding: 1rem;
29
  width: 100%;
30
  }
31
+
32
  /* Header area styling with gradient background */
33
  .app-header {
34
  text-align: center;
 
39
  box-shadow: 0 2px 10px rgba(0, 0, 0, 0.05);
40
  width: 100%;
41
  }
42
+
43
  .app-title {
44
  color: #2D3748;
45
  font-size: 2.5rem;
 
49
  -webkit-text-fill-color: transparent;
50
  font-weight: bold;
51
  }
52
+
53
  .app-subtitle {
54
  color: #4A5568;
55
  font-size: 1.2rem;
56
  font-weight: normal;
57
  margin-top: 0.25rem;
58
  }
59
+
60
  .app-divider {
61
  width: 80px;
62
  height: 3px;
63
  background: linear-gradient(90deg, #38b2ac, #4299e1);
64
  margin: 1rem auto;
65
  }
66
+
67
  /* Panel styling - gradient background */
68
  .input-panel, .output-panel {
69
  background: white;
 
72
  box-shadow: 0 2px 8px rgba(0, 0, 0, 0.08);
73
  margin: 0 auto 1rem auto;
74
  }
75
+
76
+ /* 修改輸出面板確保內容能夠完整顯示 */
77
+ .output-panel {
78
+ display: flex;
79
+ flex-direction: column;
80
+ width: 100%;
81
+ padding: 0 !important;
 
 
 
 
 
82
  }
83
+
84
+ /* 確保輸出面板內的元素寬度可以適應面板 */
85
+ .output-panel > * {
86
+ width: 100%;
87
+ }
88
+
89
  /* How-to-use section with gradient background */
90
  .how-to-use {
91
  background: linear-gradient(135deg, #f8fafc, #e8f4fd);
 
95
  box-shadow: 0 2px 8px rgba(0, 0, 0, 0.05);
96
  color: #2d3748;
97
  }
98
+
99
  /* Detection button styling */
100
  .detect-btn {
101
  background: linear-gradient(90deg, #38b2ac, #4299e1) !important;
 
110
  margin: 1rem auto !important;
111
  font-family: Arial, sans-serif !important;
112
  }
113
+
114
  .detect-btn:hover {
115
  transform: translateY(-2px) !important;
116
  box-shadow: 0 4px 8px rgba(0, 0, 0, 0.2) !important;
117
  }
118
+
119
  .detect-btn:active {
120
  transform: translateY(1px) !important;
121
  box-shadow: 0 2px 4px rgba(0, 0, 0, 0.2) !important;
122
  }
123
+
124
  /* JSON display improvements */
125
+ .json-display {
126
+ width: 98% !important;
127
+ margin: 0.5rem auto 1.5rem auto !important;
128
+ padding: 1rem !important;
129
+ border-radius: 8px !important;
130
+ background-color: white !important;
131
+ border: 1px solid #E2E8F0 !important;
132
+ box-shadow: inset 0 1px 2px rgba(0, 0, 0, 0.05) !important;
 
133
  }
134
+
135
  .json-key {
136
  color: #e53e3e;
137
  }
138
+
139
  .json-value {
140
  color: #2b6cb0;
141
  }
142
+
143
  .json-string {
144
  color: #38a169;
145
  }
146
+
147
  /* Chart/plot styling improvements */
148
  .plot-container {
149
  background: white;
 
151
  padding: 0.5rem;
152
  box-shadow: 0 2px 6px rgba(0, 0, 0, 0.05);
153
  }
154
+
155
  /* Larger font for plots */
156
  .plot-container text {
157
  font-family: Arial, sans-serif !important;
158
  font-size: 14px !important;
159
  }
160
+
161
  /* Title styling for charts */
162
  .plot-title {
163
  font-family: Arial, sans-serif !important;
164
  font-size: 16px !important;
165
  font-weight: bold !important;
166
  }
167
+
168
  /* Tab styling with subtle gradient */
169
  .tabs {
170
  width: 100%;
171
  display: flex;
172
  justify-content: center;
173
  }
174
+
175
  .tabs > div:first-child {
176
  background: linear-gradient(to right, #f8fafc, #e8f4fd) !important;
177
  border-radius: 8px 8px 0 0;
178
  }
179
+
180
+ /* Tab content styling - 確保內容區域有足夠寬度 */
181
+ .tab-content {
182
+ width: 100% !important;
183
+ box-sizing: border-box !important;
184
+ padding: 0 !important;
185
+ }
186
+
187
  /* Footer styling with gradient background */
188
  .footer {
189
  text-align: center;
 
196
  box-shadow: 0 2px 8px rgba(0, 0, 0, 0.05);
197
  width: 100%;
198
  }
199
+
200
  /* Ensure centering works for all elements */
201
  .container, .gr-container, .gr-row, .gr-col {
202
  display: flex;
 
205
  justify-content: center;
206
  width: 100%;
207
  }
208
+
209
+ /* 統一文本框樣式,確保寬度一致 */
210
+ .gr-textbox, .gr-textarea, .gr-text-input {
211
  width: 100% !important;
212
  max-width: 100% !important;
213
+ min-width: 100% !important;
214
  box-sizing: border-box !important;
215
  }
216
+
217
+ /* 確保文本區域可以適應容器寬度 */
218
+ textarea.gr-textarea, .gr-textbox textarea, .gr-text-input textarea {
219
  width: 100% !important;
220
+ max-width: 100% !important;
221
+ min-width: 100% !important;
222
+ box-sizing: border-box !important;
223
+ padding: 16px !important;
224
  font-family: 'Arial', sans-serif !important;
225
  font-size: 14px !important;
226
+ line-height: 1.6 !important;
227
+ white-space: pre-wrap !important;
228
+ word-wrap: break-word !important;
229
+ word-break: normal !important;
230
+ }
231
+
232
+ /* 特別針對場景描述文本框樣式增強 */
233
+ #scene-description-text, #detection-details {
234
+ width: 100% !important;
235
+ min-width: 100% !important;
236
+ box-sizing: border-box !important;
237
  padding: 16px !important;
238
+ line-height: 1.8 !important;
239
  white-space: pre-wrap !important;
240
+ word-wrap: break-word !important;
241
  border-radius: 8px !important;
242
+ min-height: 250px !important;
 
243
  overflow-y: auto !important;
244
  border: 1px solid #e2e8f0 !important;
245
+ background-color: white !important;
246
  display: block !important;
247
+ font-family: 'Arial', sans-serif !important;
248
+ font-size: 14px !important;
249
+ margin: 0 !important;
250
  }
251
+
252
+ /* 針對場景描述容器的樣式 */
253
+ .scene-description-container {
 
 
 
 
 
 
 
 
 
 
254
  width: 100% !important;
255
  max-width: 100% !important;
256
+ box-sizing: border-box !important;
257
+ padding: 0 !important;
258
+ margin: 0 !important;
259
+ }
260
+
261
+ /* Scene Understanding Tab 特定樣式 */
262
+ .scene-understanding-tab .result-details-box {
263
+ display: flex !important;
264
+ flex-direction: column !important;
265
+ align-items: stretch !important;
266
+ width: 100% !important;
267
+ box-sizing: border-box !important;
268
+ padding: 0 !important;
269
  }
270
+
271
+ /* 結果容器樣式 */
272
+ .result-container {
273
  width: 100% !important;
274
+ padding: 1rem !important;
275
+ border-radius: 8px !important;
276
+ border: 1px solid #E2E8F0 !important;
277
+ margin-bottom: 1.5rem !important;
278
+ background-color: #F8FAFC !important;
279
+ box-sizing: border-box !important;
280
  }
281
+
282
+ /* 結果文本框的樣式 */
283
+ .wide-result-text {
284
+ width: 100% !important;
285
+ min-width: 100% !important;
286
+ box-sizing: border-box !important;
287
  padding: 0 !important;
288
+ margin: 0 !important;
289
  }
290
+
291
+ /* 片段標題樣式 */
292
+ .section-heading {
293
+ font-size: 1.25rem !important;
294
+ font-weight: 600 !important;
295
+ color: #2D3748 !important;
296
+ margin: 1rem auto !important;
297
+ padding: 0.75rem 1rem !important;
298
+ background: linear-gradient(to right, #e6f3fc, #f0f9ff) !important;
299
+ border-radius: 8px !important;
300
+ width: 98% !important;
301
+ display: inline-block !important;
302
+ box-sizing: border-box !important;
303
+ text-align: center !important;
304
+ overflow: visible !important;
305
+ line-height: 1.5 !important;
306
+ box-shadow: 0 1px 3px rgba(0, 0, 0, 0.1) !important;
307
+ }
308
+
309
+ /* JSON 顯示區域樣式 */
310
+ .json-box {
311
+ width: 100% !important;
312
+ min-height: 200px !important;
313
+ overflow-y: auto !important;
314
+ background: white !important;
315
+ padding: 1rem !important;
316
+ border-radius: 8px !important;
317
+ box-shadow: inset 0 0 6px rgba(0, 0, 0, 0.1) !important;
318
+ font-family: monospace !important;
319
+ box-sizing: border-box !important;
320
  }
321
+
322
+ /* 欄佈局調整 */
323
  .plot-column, .stats-column {
324
  display: flex;
325
  flex-direction: column;
326
  padding: 1rem;
327
+ box-sizing: border-box !important;
328
+ width: 100% !important;
329
+ }
330
+
331
+ /* statistics plot */
332
+ .large-plot-container {
333
+ width: 100% !important;
334
+ min-height: 400px !important;
335
+ box-sizing: border-box !important;
336
+ }
337
+
338
+ /* 增強 JSON 顯示 */
339
+ .enhanced-json-display {
340
+ background: white !important;
341
+ border-radius: 8px !important;
342
+ padding: 1rem !important;
343
+ box-shadow: inset 0 0 6px rgba(0, 0, 0, 0.1) !important;
344
+ width: 100% !important;
345
+ min-height: 300px !important;
346
+ max-height: 500px !important;
347
+ overflow-y: auto !important;
348
+ font-family: monospace !important;
349
+ box-sizing: border-box !important;
350
  }
351
+
352
+ /* 確保全寬元素真正占滿整個寬度 */
353
+ .full-width-element {
354
+ width: 100% !important;
355
+ max-width: 100% !important;
356
+ box-sizing: border-box !important;
357
+ }
358
+
359
+ /* 響應式調整 */
360
  @media (max-width: 768px) {
361
  .app-title {
362
  font-size: 2rem;
363
  }
364
+
365
  .app-subtitle {
366
  font-size: 1rem;
367
  }
368
+
369
  .gradio-container {
370
  padding: 0.5rem;
371
  }
372
+
373
+ /* 在小螢幕上調整文本區域的高度 */
374
+ #scene-description-text, #detection-details {
375
+ min-height: 150px !important;
376
+ }
377
  }
378
  """
379
  return css
viewpoint_templates.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ VIEWPOINT_TEMPLATES = {
3
+ "eye_level": {
4
+ "prefix": "From a standard eye-level perspective, ",
5
+ "observation": "the scene shows {scene_elements} arranged in a typical front-facing view."
6
+ },
7
+ "aerial": {
8
+ "prefix": "From an aerial perspective, ",
9
+ "observation": "the scene shows {scene_elements} as viewed from above, revealing the spatial layout."
10
+ },
11
+ "elevated": {
12
+ "prefix": "From an elevated viewpoint, ",
13
+ "observation": "the scene presents {scene_elements} with a slight downward angle."
14
+ },
15
+ "low_angle": {
16
+ "prefix": "From a low angle, ",
17
+ "observation": "the scene depicts {scene_elements} from below, emphasizing vertical elements."
18
+ }
19
+ }
visualization_helper.py CHANGED
@@ -74,7 +74,7 @@ class VisualizationHelper:
74
  for box, cls, conf in zip(boxes, classes, confs):
75
  x1, y1, x2, y2 = box
76
  cls_id = int(cls)
77
-
78
  if filter_classes and cls_id not in filter_classes:
79
  continue
80
 
 
74
  for box, cls, conf in zip(boxes, classes, confs):
75
  x1, y1, x2, y2 = box
76
  cls_id = int(cls)
77
+
78
  if filter_classes and cls_id not in filter_classes:
79
  continue
80