DawnC committed on
Commit c0fe80d · verified · 1 Parent(s): b62df99

Add new "Video Process" feature and fix formatting issues

Files changed (8)
  1. app.py +514 -360
  2. clip_analyzer.py +2 -1
  3. enhance_scene_describer.py +257 -69
  4. lighting_analyzer.py +71 -71
  5. requirements.txt +2 -0
  6. scene_analyzer.py +2 -13
  7. style.py +140 -0
  8. video_processor.py +346 -0
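The new video pipeline hinges on the VideoProcessor class added in video_processor.py. That file is not expanded in this view, so the sketch below only restates the interface app.py depends on, inferred from the call sites in the diff; the method body is a placeholder, not the actual implementation.

# Interface sketch (inferred from app.py's usage below; illustrative only)
from typing import Any, Dict, Tuple

class VideoProcessor:
    def __init__(self, image_processor) -> None:
        # Reuses the existing ImageProcessor for per-frame detection
        self.image_processor = image_processor

    def process_video_file(
        self,
        video_path: str,
        model_name: str,
        confidence_threshold: float,
        process_interval: int,
    ) -> Tuple[str, str, Dict[str, Any]]:
        """Run detection on every Nth frame and return
        (annotated_video_path, summary_text, aggregated_stats)."""
        raise NotImplementedError  # real implementation lives in video_processor.py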
app.py CHANGED
@@ -3,6 +3,10 @@ import numpy as np
3
  import matplotlib.pyplot as plt
4
  import gradio as gr
5
  from typing import Dict, List, Any, Optional, Tuple
6
  import spaces
7
 
8
  from detection_model import DetectionModel
@@ -10,441 +14,591 @@ from color_mapper import ColorMapper
10
  from evaluation_metrics import EvaluationMetrics
11
  from style import Style
12
  from image_processor import ImageProcessor
 
13
 
14
- # Initialize image processor
15
  image_processor = ImageProcessor()
 
16
 
 
17
  def get_all_classes():
18
- """
19
- Get all available COCO classes from the currently active model or fallback to standard COCO classes
20
-
21
- Returns:
22
- List of tuples (class_id, class_name)
23
- """
24
- # Try to get class names from any loaded model
25
- for model_name, model_instance in image_processor.model_instances.items():
26
- if model_instance and model_instance.is_model_loaded:
27
- try:
28
- class_names = model_instance.class_names
29
- return [(idx, name) for idx, name in class_names.items()]
30
- except Exception:
31
- pass
32
-
33
- # Fallback to standard COCO classes
34
- return [
35
- (0, 'person'), (1, 'bicycle'), (2, 'car'), (3, 'motorcycle'), (4, 'airplane'),
36
- (5, 'bus'), (6, 'train'), (7, 'truck'), (8, 'boat'), (9, 'traffic light'),
37
- (10, 'fire hydrant'), (11, 'stop sign'), (12, 'parking meter'), (13, 'bench'),
38
- (14, 'bird'), (15, 'cat'), (16, 'dog'), (17, 'horse'), (18, 'sheep'), (19, 'cow'),
39
- (20, 'elephant'), (21, 'bear'), (22, 'zebra'), (23, 'giraffe'), (24, 'backpack'),
40
- (25, 'umbrella'), (26, 'handbag'), (27, 'tie'), (28, 'suitcase'), (29, 'frisbee'),
41
- (30, 'skis'), (31, 'snowboard'), (32, 'sports ball'), (33, 'kite'), (34, 'baseball bat'),
42
- (35, 'baseball glove'), (36, 'skateboard'), (37, 'surfboard'), (38, 'tennis racket'),
43
- (39, 'bottle'), (40, 'wine glass'), (41, 'cup'), (42, 'fork'), (43, 'knife'),
44
- (44, 'spoon'), (45, 'bowl'), (46, 'banana'), (47, 'apple'), (48, 'sandwich'),
45
- (49, 'orange'), (50, 'broccoli'), (51, 'carrot'), (52, 'hot dog'), (53, 'pizza'),
46
- (54, 'donut'), (55, 'cake'), (56, 'chair'), (57, 'couch'), (58, 'potted plant'),
47
- (59, 'bed'), (60, 'dining table'), (61, 'toilet'), (62, 'tv'), (63, 'laptop'),
48
- (64, 'mouse'), (65, 'remote'), (66, 'keyboard'), (67, 'cell phone'), (68, 'microwave'),
49
- (69, 'oven'), (70, 'toaster'), (71, 'sink'), (72, 'refrigerator'), (73, 'book'),
50
- (74, 'clock'), (75, 'vase'), (76, 'scissors'), (77, 'teddy bear'), (78, 'hair drier'),
51
- (79, 'toothbrush')
52
- ]
53
 
54
  @spaces.GPU
55
- def process_and_plot(image, model_name, confidence_threshold, filter_classes=None):
56
- """
57
- Process image and create plots for statistics with enhanced visualization
58
-
59
- Args:
60
- image: Input image
61
- model_name: Name of the model to use
62
- confidence_threshold: Confidence threshold for detection
63
- filter_classes: Optional list of classes to filter results
64
-
65
- Returns:
66
- Tuple of results including lighting conditions
67
- """
68
  try:
69
- class_ids = None
70
  if filter_classes:
71
- class_ids = []
 
 
72
  for class_str in filter_classes:
 
 
73
  try:
74
- # Extract ID from format "id: name"
75
- class_id = int(class_str.split(":")[0].strip())
76
- class_ids.append(class_id)
77
- except:
78
- continue
79
-
80
- # Execute detection
81
  result_image, result_text, stats = image_processor.process_image(
82
  image,
83
  model_name,
84
  confidence_threshold,
85
- class_ids
86
  )
87
 
88
- # Format the statistics for better display
89
  formatted_stats = image_processor.format_json_for_display(stats)
90
 
91
- if not stats or "class_statistics" not in stats or not stats["class_statistics"]:
92
- # Create the table
93
  fig, ax = plt.subplots(figsize=(8, 6))
94
- ax.text(0.5, 0.5, "No detection data available",
95
- ha='center', va='center', fontsize=14, fontfamily='Arial')
96
- ax.set_xlim(0, 1)
97
- ax.set_ylim(0, 1)
98
  ax.axis('off')
99
  plot_figure = fig
100
- else:
101
- # Prepare visualization data
102
- available_classes = dict(get_all_classes())
103
- viz_data = image_processor.prepare_visualization_data(stats, available_classes)
104
-
105
- # Create plot
106
- plot_figure = EvaluationMetrics.create_enhanced_stats_plot(viz_data)
107
 
108
  # Extract scene analysis info
109
  scene_analysis = stats.get("scene_analysis", {})
110
-
111
- scene_desc = scene_analysis.get("description", "No scene analysis available.")
112
- scene_desc = scene_desc.strip()
113
-
114
- # HTML format
115
- scene_desc_html = f"""
116
- <div id='scene-desc-container' style='width:100%; padding:20px; text-align:center; background-color:#f5f9fc; border-radius:8px; margin:10px auto; min-height:200px; max-height:none; overflow-y:auto;'>
117
- <div style='width:100%; text-align:center; margin:0 auto; font-family:Arial, sans-serif; font-size:14px; line-height:1.8;'>
118
- {scene_desc}
119
- </div>
120
- </div>
121
- """
122
-
123
- # Extract lighting conditions
124
- lighting_conditions = scene_analysis.get("lighting_conditions",
125
- {"time_of_day": "unknown", "confidence": 0.0})
126
-
127
- # Prepare the activities list
128
- activities = scene_analysis.get("possible_activities", [])
129
- if not activities:
130
- activities_data = [["No activities detected"]]
131
  else:
132
- activities_data = [[activity] for activity in activities]
133
 
134
- # Prepare the safety concerns list
135
- safety_concerns = scene_analysis.get("safety_concerns", [])
136
- if not safety_concerns:
137
- safety_data = [["No safety concerns detected"]]
138
  else:
139
- safety_data = [[concern] for concern in safety_concerns]
140
 
141
- # Functional zones
142
  zones = scene_analysis.get("functional_zones", {})
 
143
 
144
- return result_image, result_text, formatted_stats, plot_figure, scene_desc, activities_data, safety_data, zones, lighting_conditions
 
145
 
146
  except Exception as e:
147
- # Make sure valid data is returned even on error
148
  import traceback
149
  error_msg = f"Error processing image: {str(e)}\n{traceback.format_exc()}"
150
- print(error_msg)
151
-
152
- # Create a simple error figure
153
- fig, ax = plt.subplots(figsize=(8, 6))
154
- ax.text(0.5, 0.5, f"Error: {str(e)}",
155
- ha='center', va='center', fontsize=14, fontfamily='Arial', color='red')
156
- ax.set_xlim(0, 1)
157
- ax.set_ylim(0, 1)
158
  ax.axis('off')
159
 
160
- # Return valid default values
161
- return None, error_msg, "{}", fig, "Error processing image", [["No activities"]], [["No safety concerns"]], {}, {"time_of_day": "unknown", "confidence": 0}
162
 
 
163
  def create_interface():
164
- """創建 Gradio 界面"""
165
  css = Style.get_css()
166
-
167
- # Get available model info
168
  available_models = DetectionModel.get_available_models()
169
  model_choices = [model["model_file"] for model in available_models]
170
- model_labels = [f"{model['name']} - {model['inference_speed']}" for model in available_models]
171
-
172
- # Available class filter options
173
- available_classes = get_all_classes()
174
- class_choices = [f"{id}: {name}" for id, name in available_classes]
175
 
176
- # Create the Gradio Blocks interface
177
  with gr.Blocks(css=css, theme=gr.themes.Soft(primary_hue="teal", secondary_hue="blue")) as demo:
178
- # Header at the top of the home page
 
179
  with gr.Group(elem_classes="app-header"):
180
  gr.HTML("""
181
  <div style="text-align: center; width: 100%; padding: 2rem 0 3rem 0; background: linear-gradient(135deg, #f0f9ff, #e1f5fe);">
182
  <h1 style="font-size: 3.5rem; margin-bottom: 0.5rem; background: linear-gradient(90deg, #38b2ac, #4299e1); -webkit-background-clip: text; -webkit-text-fill-color: transparent; font-weight: bold; font-family: 'Arial', sans-serif;">VisionScout</h1>
183
-
184
- <h2 style="color: #4A5568; font-size: 1.2rem; font-weight: 400; margin-top: 0.5rem; margin-bottom: 1.5rem; font-family: 'Arial', sans-serif;">Detect and identify objects in your images</h2>
185
-
186
- <div style="display: flex; justify-content: center; gap: 10px; margin: 0.5rem 0;">
187
- <div style="height: 3px; width: 80px; background: linear-gradient(90deg, #38b2ac, #4299e1);"></div>
188
- </div>
189
-
190
  <div style="display: flex; justify-content: center; gap: 25px; margin-top: 1.5rem;">
191
- <div style="padding: 8px 15px; border-radius: 20px; background: rgba(66, 153, 225, 0.15); color: #2b6cb0; font-weight: 500; font-size: 0.9rem;">
192
- <span style="margin-right: 6px;">🔍</span> Object Detection
193
- </div>
194
- <div style="padding: 8px 15px; border-radius: 20px; background: rgba(56, 178, 172, 0.15); color: #2b6cb0; font-weight: 500; font-size: 0.9rem;">
195
- <span style="margin-right: 6px;">🌐</span> Scene Understanding
196
- </div>
197
- <div style="padding: 8px 15px; border-radius: 20px; background: rgba(66, 153, 225, 0.15); color: #2b6cb0; font-weight: 500; font-size: 0.9rem;">
198
- <span style="margin-right: 6px;">📊</span> Visual Analysis
199
- </div>
200
- </div>
201
-
202
- <div style="margin-top: 20px; padding: 10px 15px; background-color: rgba(255, 248, 230, 0.9); border-left: 3px solid #f6ad55; border-radius: 6px; max-width: 600px; margin-left: auto; margin-right: auto; text-align: left;">
203
- <p style="margin: 0; font-size: 0.9rem; color: #805ad5; font-weight: 500;">
204
- <span style="margin-right: 5px;">📱</span> iPhone users: HEIC images are not supported.
205
- <a href="https://cloudconvert.com/heic-to-jpg" target="_blank" style="color: #3182ce; text-decoration: underline;">Convert HEIC to JPG here</a> before uploading.
206
- </p>
207
  </div>
208
  </div>
209
  """)
210
211
 
212
- current_model = gr.State("yolov8m.pt") # use medium-size model as default
213
-
214
- # Main content area
215
- with gr.Row(equal_height=True):
216
- # Left side - input controls (image upload)
217
- with gr.Column(scale=4, elem_classes="input-panel"):
218
- with gr.Group():
219
- gr.HTML('<div class="section-heading">Upload Image</div>')
220
- image_input = gr.Image(type="pil", label="Upload an image", elem_classes="upload-box")
221
-
222
- with gr.Accordion("Advanced Settings", open=False):
223
- with gr.Row():
224
- model_dropdown = gr.Dropdown(
225
- choices=model_choices,
226
- value="yolov8m.pt",
227
- label="Select Model",
228
- info="Choose different models based on your needs for speed vs. accuracy"
229
- )
230
-
231
- # display model info
232
- model_info = gr.Markdown(DetectionModel.get_model_description("yolov8m.pt"))
233
-
234
- confidence = gr.Slider(
235
- minimum=0.1,
236
- maximum=0.9,
237
- value=0.25,
238
- step=0.05,
239
- label="Confidence Threshold",
240
- info="Higher values show fewer but more confident detections"
241
- )
242
-
243
- with gr.Accordion("Filter Classes", open=False):
244
- # Quick-select buttons for common object categories
245
- gr.HTML('<div class="section-heading" style="font-size: 1rem;">Common Categories</div>')
246
- with gr.Row():
247
- people_btn = gr.Button("People", size="sm")
248
- vehicles_btn = gr.Button("Vehicles", size="sm")
249
- animals_btn = gr.Button("Animals", size="sm")
250
- objects_btn = gr.Button("Common Objects", size="sm")
251
-
252
- # Class selection dropdown
253
- class_filter = gr.Dropdown(
254
- choices=class_choices,
255
- multiselect=True,
256
- label="Select Classes to Display",
257
- info="Leave empty to show all detected objects"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
258
  )
259
 
260
- # detect button
261
- detect_btn = gr.Button("Detect Objects", variant="primary", elem_classes="detect-btn")
262
-
263
- # Usage instructions area
264
- with gr.Group(elem_classes="how-to-use"):
265
- gr.HTML('<div class="section-heading">How to Use</div>')
266
- gr.Markdown("""
267
- 1. Upload an image or use the camera
268
- 2. (Optional) Adjust settings like confidence threshold or model size (n, m, x)
269
- 3. Optionally filter to specific object classes
270
- 4. Click "Detect Objects" button
271
-
272
- The model will identify objects in your image and display them with bounding boxes.
273
-
274
- **Note:** Detection quality depends on image clarity and model settings.
275
- """)
276
-
277
- # Right side - results display area
278
- with gr.Column(scale=6, elem_classes="output-panel"):
279
- with gr.Tabs(elem_classes="tabs"):
280
- with gr.Tab("Detection Result"):
281
- result_image = gr.Image(type="pil", label="Detection Result")
282
-
283
- # details summary
284
- with gr.Group(elem_classes="result-details-box"):
285
- gr.HTML('<div class="section-heading">Detection Details</div>')
286
- # Textbox settings for a wider display
287
- result_text = gr.Textbox(
288
- label=None,
289
- lines=15,
290
- max_lines=20,
291
- elem_classes="wide-result-text",
292
- elem_id="detection-details",
293
- container=False,
294
- scale=2,
295
- min_width=600
296
- )
297
 
298
- # Scene Analysis
299
- with gr.Tab("Scene Understanding", elem_classes="scene-understanding-tab"):
300
- with gr.Group(elem_classes="result-details-box"):
301
- gr.HTML("""
302
- <div class="section-heading">Scene Analysis</div>
303
- <details class="info-details" style="margin: 5px 0 15px 0;">
304
- <summary style="padding: 8px; background-color: #f0f7ff; border-radius: 6px; border-left: 3px solid #4299e1; font-weight: bold; cursor: pointer; color: #2b6cb0;">
305
- 🔍 The AI Vision Scout Report: Click for important notes about this analysis
306
- </summary>
307
- <div style="margin-top: 8px; padding: 10px; background-color: #f8f9fa; border-radius: 6px; border: 1px solid #e2e8f0;">
308
- <p style="font-size: 13px; color: #718096; margin: 0;">
309
- <b>About this analysis:</b> This analysis is the model's best guess based on visible objects.
310
- Like human scouts, it sometimes gets lost or sees things that aren't there (but don't we all?).
311
- Consider this an educated opinion rather than absolute truth. For critical applications, always verify with human eyes! 🧐
312
  </p>
313
  </div>
314
- </details>
315
- """)
316
-
317
- # Use a container better suited to long text
318
- with gr.Group(elem_classes="scene-description-container"):
319
- scene_description = gr.HTML(
320
- value="<div id='scene-desc-container'></div>",
321
- label="Scene Description"
322
- )
323
-
324
- with gr.Row():
325
- with gr.Column(scale=2):
326
- activities_list = gr.Dataframe(
327
- headers=["Activities"],
328
- datatype=["str"],
329
- col_count=1,
330
- row_count=5,
331
- elem_classes="full-width-element"
332
- )
333
-
334
- with gr.Column(scale=2):
335
- safety_list = gr.Dataframe(
336
- headers=["Safety Concerns"],
337
- datatype=["str"],
338
- col_count=1,
339
- row_count=5,
340
- elem_classes="full-width-element"
341
- )
342
-
343
- gr.HTML('<div class="section-heading">Functional Zones</div>')
344
- zones_json = gr.JSON(label=None, elem_classes="json-box")
345
-
346
- gr.HTML('<div class="section-heading">Lighting Conditions</div>')
347
- lighting_info = gr.JSON(label=None, elem_classes="json-box")
348
-
349
- with gr.Tab("Statistics"):
350
- with gr.Row():
351
- with gr.Column(scale=3, elem_classes="plot-column"):
352
- gr.HTML('<div class="section-heading">Object Distribution</div>')
353
- plot_output = gr.Plot(
354
- label=None,
355
- elem_classes="large-plot-container"
356
  )
357
-
358
- # JSON data reads more clearly on the right side
359
- with gr.Column(scale=2, elem_classes="stats-column"):
360
- gr.HTML('<div class="section-heading">Detection Statistics</div>')
361
- stats_json = gr.JSON(
362
- label=None, # remove label
363
- elem_classes="enhanced-json-display"
364
  )
365
 
366
- detect_btn.click(
367
- fn=process_and_plot,
368
- inputs=[image_input, current_model, confidence, class_filter],
369
- outputs=[
370
- result_image, result_text, stats_json, plot_output,
371
- scene_description, activities_list, safety_list, zones_json,
372
- lighting_info
373
- ]
374
- )
375
-
376
- # model option
377
- model_dropdown.change(
378
- fn=lambda model: (model, DetectionModel.get_model_description(model)),
379
- inputs=[model_dropdown],
380
- outputs=[current_model, model_info]
381
- )
382
-
383
- # Links for each class group
384
- people_classes = [0] # person
385
- vehicles_classes = [1, 2, 3, 4, 5, 6, 7, 8] # various vehicles
386
- animals_classes = list(range(14, 24)) # animals in COCO
387
- common_objects = [41, 42, 43, 44, 45, 67, 73, 74, 76] # common household items
388
 
389
- # Link the quick buttons
390
- people_btn.click(
391
- lambda: [f"{id}: {name}" for id, name in available_classes if id in people_classes],
392
- outputs=class_filter
393
- )
394
 
395
- vehicles_btn.click(
396
- lambda: [f"{id}: {name}" for id, name in available_classes if id in vehicles_classes],
397
- outputs=class_filter
398
  )
399
 
400
- animals_btn.click(
401
- lambda: [f"{id}: {name}" for id, name in available_classes if id in animals_classes],
402
- outputs=class_filter
403
  )
404
 
405
- objects_btn.click(
406
- lambda: [f"{id}: {name}" for id, name in available_classes if id in common_objects],
407
- outputs=class_filter
408
  )
409
 
410
- example_images = [
411
- "room_01.jpg",
412
- "room_02.jpg",
413
- "street_02.jpg",
414
- "street_04.jpg"
415
- ]
416
-
417
- # add example images
418
- gr.Examples(
419
- examples=example_images,
420
- inputs=image_input,
421
- outputs=None,
422
- fn=None,
423
- cache_examples=False,
424
  )
425
 
426
-
427
  # Footer
428
  gr.HTML("""
429
- <div class="footer" style="padding: 25px 0; text-align: center; background: linear-gradient(to right, #f5f9fc, #e1f5fe); border-top: 1px solid #e2e8f0; margin-top: 30px;">
430
- <div style="margin-bottom: 15px;">
431
- <p style="font-size: 14px; color: #4A5568; margin: 5px 0;">Powered by YOLOv8, CLIP and Ultralytics • Created with Gradio</p>
432
- <p style="font-size: 14px; color: #4A5568; margin: 5px 0;">Model can detect 80 different classes of objects</p>
433
- </div>
434
-
435
- <div style="display: flex; align-items: center; justify-content: center; gap: 20px; margin-top: 15px;">
436
- <p style="font-family: 'Arial', sans-serif; font-size: 14px; font-weight: 500; letter-spacing: 2px; background: linear-gradient(90deg, #38b2ac, #4299e1); -webkit-background-clip: text; -webkit-text-fill-color: transparent; margin: 0; text-transform: uppercase; display: inline-block;">EXPLORE THE CODE →</p>
437
- <a href="https://github.com/Eric-Chung-0511/Learning-Record/tree/main/Data%20Science%20Projects/VisionScout" style="text-decoration: none;">
438
- <img src="https://img.shields.io/badge/GitHub-VisionScout-4299e1?logo=github&style=for-the-badge">
439
- </a>
440
- </div>
441
- </div>
442
- """)
443
 
444
  return demo
445
 
 
446
  if __name__ == "__main__":
447
- import time
448
 
449
- demo = create_interface()
450
- demo.launch()
 
3
  import matplotlib.pyplot as plt
4
  import gradio as gr
5
  from typing import Dict, List, Any, Optional, Tuple
6
+ import cv2
7
+ from PIL import Image
8
+ import tempfile
9
+ import uuid
+ import os  # used by download_video_from_url below (os.path.join / os.path.exists / os.path.getsize)
10
  import spaces
11
 
12
  from detection_model import DetectionModel
 
14
  from evaluation_metrics import EvaluationMetrics
15
  from style import Style
16
  from image_processor import ImageProcessor
17
+ from video_processor import VideoProcessor
18
 
19
+ # Initialize Processors
20
  image_processor = ImageProcessor()
21
+ video_processor = VideoProcessor(image_processor)
22
 
23
+ # Helper Function
24
  def get_all_classes():
25
+ """Gets all available COCO classes."""
26
+ # Try to get from a loaded model first
27
+ if image_processor and image_processor.model_instances:
28
+ for model_instance in image_processor.model_instances.values():
29
+ if model_instance and model_instance.is_model_loaded:
30
+ try:
31
+ # Ensure class_names is a dict {id: name}
32
+ if isinstance(model_instance.class_names, dict):
33
+ return sorted([(int(idx), name) for idx, name in model_instance.class_names.items()])
34
+ except Exception as e:
35
+ print(f"Error getting class names from model: {e}")
36
+
37
+ # Fallback to standard COCO (ensure keys are ints)
38
+ default_classes = {
39
+ 0: 'person', 1: 'bicycle', 2: 'car', 3: 'motorcycle', 4: 'airplane', 5: 'bus',
40
+ 6: 'train', 7: 'truck', 8: 'boat', 9: 'traffic light', 10: 'fire hydrant',
41
+ 11: 'stop sign', 12: 'parking meter', 13: 'bench', 14: 'bird', 15: 'cat',
42
+ 16: 'dog', 17: 'horse', 18: 'sheep', 19: 'cow', 20: 'elephant', 21: 'bear',
43
+ 22: 'zebra', 23: 'giraffe', 24: 'backpack', 25: 'umbrella', 26: 'handbag',
44
+ 27: 'tie', 28: 'suitcase', 29: 'frisbee', 30: 'skis', 31: 'snowboard',
45
+ 32: 'sports ball', 33: 'kite', 34: 'baseball bat', 35: 'baseball glove',
46
+ 36: 'skateboard', 37: 'surfboard', 38: 'tennis racket', 39: 'bottle',
47
+ 40: 'wine glass', 41: 'cup', 42: 'fork', 43: 'knife', 44: 'spoon', 45: 'bowl',
48
+ 46: 'banana', 47: 'apple', 48: 'sandwich', 49: 'orange', 50: 'broccoli',
49
+ 51: 'carrot', 52: 'hot dog', 53: 'pizza', 54: 'donut', 55: 'cake', 56: 'chair',
50
+ 57: 'couch', 58: 'potted plant', 59: 'bed', 60: 'dining table', 61: 'toilet',
51
+ 62: 'tv', 63: 'laptop', 64: 'mouse', 65: 'remote', 66: 'keyboard',
52
+ 67: 'cell phone', 68: 'microwave', 69: 'oven', 70: 'toaster', 71: 'sink',
53
+ 72: 'refrigerator', 73: 'book', 74: 'clock', 75: 'vase', 76: 'scissors',
54
+ 77: 'teddy bear', 78: 'hair drier', 79: 'toothbrush'
55
+ }
56
+ return sorted(default_classes.items())
57
 
58
  @spaces.GPU
59
+ def handle_image_upload(image, model_name, confidence_threshold, filter_classes=None):
60
+ """Processes a single uploaded image."""
61
+ print(f"Processing image with model: {model_name}, confidence: {confidence_threshold}")
 
 
 
 
 
 
 
 
 
 
62
  try:
63
+ class_ids_to_filter = None
64
  if filter_classes:
65
+ class_ids_to_filter = []
66
+ available_classes_dict = dict(get_all_classes())
67
+ name_to_id = {name: id for id, name in available_classes_dict.items()}
68
  for class_str in filter_classes:
69
+ class_name_or_id = class_str.split(":")[0].strip()
70
+ class_id = -1
71
  try:
72
+ class_id = int(class_name_or_id)
73
+ if class_id not in available_classes_dict:
74
+ class_id = -1
75
+ except ValueError:
76
+ if class_name_or_id in name_to_id:
77
+ class_id = name_to_id[class_name_or_id]
78
+ elif class_str in name_to_id: # Check full string "id: name"
79
+ class_id = name_to_id[class_str]
80
+
81
+ if class_id != -1:
82
+ class_ids_to_filter.append(class_id)
83
+ else:
84
+ print(f"Warning: Could not parse class filter: {class_str}")
85
+ print(f"Filtering image results for class IDs: {class_ids_to_filter}")
86
+
87
+ # Call the existing image processing logic
88
  result_image, result_text, stats = image_processor.process_image(
89
  image,
90
  model_name,
91
  confidence_threshold,
92
+ class_ids_to_filter
93
  )
94
 
95
+ # Format stats for JSON display
96
  formatted_stats = image_processor.format_json_for_display(stats)
97
 
98
+ # Prepare visualization data for the plot
99
+ plot_figure = None
100
+ if stats and "class_statistics" in stats and stats["class_statistics"]:
101
+ available_classes_dict = dict(get_all_classes())
102
+ viz_data = image_processor.prepare_visualization_data(stats, available_classes_dict)
103
+ if "error" not in viz_data:
104
+ plot_figure = EvaluationMetrics.create_enhanced_stats_plot(viz_data)
105
+ else:
106
+ fig, ax = plt.subplots(figsize=(8, 6))
107
+ ax.text(0.5, 0.5, viz_data["error"], ha='center', va='center', fontsize=12)
108
+ ax.axis('off')
109
+ plot_figure = fig
110
+ else:
111
  fig, ax = plt.subplots(figsize=(8, 6))
112
+ ax.text(0.5, 0.5, "No detection data for plot", ha='center', va='center', fontsize=12)
 
 
 
113
  ax.axis('off')
114
  plot_figure = fig
115
 
116
  # Extract scene analysis info
117
  scene_analysis = stats.get("scene_analysis", {})
118
+ scene_desc = scene_analysis.get("description", "Scene analysis requires detected objects.")
119
+ # Ensure scene_desc is a string before adding HTML
120
+ if not isinstance(scene_desc, str):
121
+ scene_desc = str(scene_desc)
122
+ scene_desc_html = f"<div style='padding:10px; font-family:Arial, sans-serif; line-height:1.7;'>{scene_desc}</div>"
123
+
124
+ # Prepare activities list
125
+ activities_list = scene_analysis.get("possible_activities", [])
126
+ if not activities_list:
127
+ activities_list_data = [["No specific activities inferred"]] # Data for Dataframe
128
  else:
129
+ activities_list_data = [[activity] for activity in activities_list]
130
 
131
+ # Prepare safety concerns list
132
+ safety_concerns_list = scene_analysis.get("safety_concerns", [])
133
+ if not safety_concerns_list:
134
+ safety_data = [["No safety concerns detected"]] # Data for Dataframe
135
  else:
136
+ safety_data = [[concern] for concern in safety_concerns_list]
137
 
 
138
  zones = scene_analysis.get("functional_zones", {})
139
+ lighting = scene_analysis.get("lighting_conditions", {"time_of_day": "unknown", "confidence": 0})
140
 
141
+ return (result_image, result_text, formatted_stats, plot_figure,
142
+ scene_desc_html, activities_list_data, safety_data, zones, lighting)
143
 
144
  except Exception as e:
145
+ print(f"Error in handle_image_upload: {e}")
146
  import traceback
147
  error_msg = f"Error processing image: {str(e)}\n{traceback.format_exc()}"
148
+ fig, ax = plt.subplots()
149
+ ax.text(0.5, 0.5, "Processing Error", color="red", ha="center", va="center")
 
 
 
 
 
 
150
  ax.axis('off')
151
+ # Ensure return structure matches outputs even on error
152
+ return (None, error_msg, {}, fig, f"<div>Error: {str(e)}</div>",
153
+ [["Error"]], [["Error"]], {}, {"time_of_day": "error", "confidence": 0})
154
+
155
+ def download_video_from_url(video_url, max_duration_minutes=10):
156
+ """
157
+ Downloads a video from a YouTube URL and returns the local path to the downloaded file.
158
+
159
+ Args:
160
+ video_url (str): URL of the YouTube video to download
161
+ max_duration_minutes (int): Maximum allowed video duration in minutes
162
+
163
+ Returns:
164
+ tuple: (Path to the downloaded video file or None, Error message or None)
165
+ """
166
+ try:
167
+ # Create a temporary directory to store the video
168
+ temp_dir = tempfile.gettempdir()
169
+ output_filename = f"downloaded_{uuid.uuid4().hex}.mp4"
170
+ output_path = os.path.join(temp_dir, output_filename)
171
+
172
+ # Check if it's a YouTube URL
173
+ if "youtube.com" in video_url or "youtu.be" in video_url:
174
+ # Import yt-dlp here to avoid dependency if not needed
175
+ import yt_dlp
176
+
177
+ # Setup yt-dlp options
178
+ ydl_opts = {
179
+ 'format': 'best[ext=mp4]/best', # Best quality MP4 or best available format
180
+ 'outtmpl': output_path,
181
+ 'noplaylist': True,
182
+ 'quiet': False, # Set to True to reduce output
183
+ 'no_warnings': False,
184
+ }
185
+
186
+ # First extract info to check duration
187
+ with yt_dlp.YoutubeDL(ydl_opts) as ydl:
188
+ print(f"Extracting info from YouTube URL: {video_url}")
189
+ info_dict = ydl.extract_info(video_url, download=False)
190
+
191
+ # Check if video exists
192
+ if not info_dict:
193
+ return None, "Could not retrieve video information. Please check the URL."
194
+
195
+ video_title = info_dict.get('title', 'Unknown Title')
196
+ duration = info_dict.get('duration', 0)
197
+
198
+ print(f"Video title: {video_title}")
199
+ print(f"Video duration: {duration} seconds")
200
+
201
+ # Check video duration
202
+ if duration > max_duration_minutes * 60:
203
+ return None, f"Video is too long ({duration} seconds). Maximum duration is {max_duration_minutes} minutes."
204
+
205
+ # Download the video
206
+ print(f"Downloading YouTube video: {video_title}")
207
+ ydl.download([video_url])
208
+
209
+ # Verify the file exists and has content
210
+ if not os.path.exists(output_path) or os.path.getsize(output_path) == 0:
211
+ return None, "Download failed: Empty or missing file."
212
+
213
+ print(f"Successfully downloaded video to: {output_path}")
214
+ return output_path, None
215
+ else:
216
+ return None, "Only YouTube URLs are supported at this time. Please enter a valid YouTube URL."
217
+
218
+ except Exception as e:
219
+ import traceback
220
+ error_details = traceback.format_exc()
221
+ print(f"Error downloading video: {e}\n{error_details}")
222
+ return None, f"Error downloading video: {str(e)}"
223
+
224
+
225
+ @spaces.GPU
226
+ def handle_video_upload(video_input, video_url, input_type, model_name, confidence_threshold, process_interval):
227
+ """Handles video upload or URL input and calls the VideoProcessor."""
228
+
229
+ print(f"Received video request: input_type={input_type}")
230
+ video_path = None
231
+
232
+ # Handle based on input type
233
+ if input_type == "upload" and video_input:
234
+ print(f"Processing uploaded video file")
235
+ video_path = video_input
236
+ elif input_type == "url" and video_url:
237
+ print(f"Processing video from URL: {video_url}")
238
+ # Download video from URL
239
+ video_path, error_message = download_video_from_url(video_url)
240
+ if error_message:
241
+ error_html = f"<div class='video-summary-content-wrapper'><pre>{error_message}</pre></div>"
242
+ return None, error_html, {"error": error_message}
243
+ else:
244
+ print("No valid video input provided.")
245
+ return None, "<div class='video-summary-content-wrapper'><pre>Please upload a video file or provide a valid video URL.</pre></div>", {}
246
+
247
+ print(f"Starting video processing with: model={model_name}, confidence={confidence_threshold}, interval={process_interval}")
248
+ try:
249
+ # Call the VideoProcessor method
250
+ output_video_path, summary_text, stats_dict = video_processor.process_video_file(
251
+ video_path=video_path,
252
+ model_name=model_name,
253
+ confidence_threshold=confidence_threshold,
254
+ process_interval=int(process_interval) # Ensure interval is int
255
+ )
256
+ print(f"Video processing function returned: path={output_video_path}, summary length={len(summary_text)}")
257
+
258
+ # Wrap processing summary in HTML tags for consistent styling with scene understanding page
259
+ summary_html = f"<div class='video-summary-content-wrapper'><pre>{summary_text}</pre></div>"
260
+
261
+ # Format statistics for better display
262
+ formatted_stats = {}
263
+ if stats_dict and isinstance(stats_dict, dict):
264
+ formatted_stats = stats_dict
265
+
266
+ return output_video_path, summary_html, formatted_stats
267
+
268
+ except Exception as e:
269
+ print(f"Error in handle_video_upload: {e}")
270
+ import traceback
271
+ error_msg = f"Error processing video: {str(e)}\n{traceback.format_exc()}"
272
+ error_html = f"<div class='video-summary-content-wrapper'><pre>{error_msg}</pre></div>"
273
+ return None, error_html, {"error": str(e)}
274
275
 
276
+ # Create Gradio Interface
277
  def create_interface():
278
+ """Creates the Gradio interface with Tabs."""
279
  css = Style.get_css()
 
 
280
  available_models = DetectionModel.get_available_models()
281
  model_choices = [model["model_file"] for model in available_models]
282
+ class_choices_formatted = [f"{id}: {name}" for id, name in get_all_classes()] # Use formatted choices
283
 
 
284
  with gr.Blocks(css=css, theme=gr.themes.Soft(primary_hue="teal", secondary_hue="blue")) as demo:
285
+
286
+ # Header
287
  with gr.Group(elem_classes="app-header"):
288
  gr.HTML("""
289
  <div style="text-align: center; width: 100%; padding: 2rem 0 3rem 0; background: linear-gradient(135deg, #f0f9ff, #e1f5fe);">
290
  <h1 style="font-size: 3.5rem; margin-bottom: 0.5rem; background: linear-gradient(90deg, #38b2ac, #4299e1); -webkit-background-clip: text; -webkit-text-fill-color: transparent; font-weight: bold; font-family: 'Arial', sans-serif;">VisionScout</h1>
291
+ <h2 style="color: #4A5568; font-size: 1.2rem; font-weight: 400; margin-top: 0.5rem; margin-bottom: 1.5rem; font-family: 'Arial', sans-serif;">Object Detection and Scene Understanding</h2>
292
+ <div style="display: flex; justify-content: center; gap: 10px; margin: 0.5rem 0;"><div style="height: 3px; width: 80px; background: linear-gradient(90deg, #38b2ac, #4299e1);"></div></div>
 
 
 
 
 
293
  <div style="display: flex; justify-content: center; gap: 25px; margin-top: 1.5rem;">
294
+ <div style="padding: 8px 15px; border-radius: 20px; background: rgba(66, 153, 225, 0.15); color: #2b6cb0; font-weight: 500; font-size: 0.9rem;"><span style="margin-right: 6px;">🖼️</span> Image Analysis</div>
295
+ <div style="padding: 8px 15px; border-radius: 20px; background: rgba(56, 178, 172, 0.15); color: #2b6cb0; font-weight: 500; font-size: 0.9rem;"><span style="margin-right: 6px;">🎬</span> Video Analysis</div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
296
  </div>
297
+ <div style="margin-top: 20px; padding: 10px 15px; background-color: rgba(255, 248, 230, 0.9); border-left: 3px solid #f6ad55; border-radius: 6px; max-width: 600px; margin-left: auto; margin-right: auto; text-align: left;">
298
+ <p style="margin: 0; font-size: 0.9rem; color: #805ad5; font-weight: 500;">
299
+ <span style="margin-right: 5px;">📱</span> iPhone users: HEIC images may not be supported.
300
+ <a href="https://cloudconvert.com/heic-to-jpg" target="_blank" style="color: #3182ce; text-decoration: underline;">Convert HEIC to JPG</a> before uploading if needed.
301
+ </p>
302
+ </div>
303
  </div>
304
  """)
305
 
306
+ # Main Content with Tabs
307
+ with gr.Tabs(elem_classes="tabs"):
308
+
309
+ # Tab 1: Image Processing
310
+ with gr.Tab("Image Processing"):
311
+ current_image_model = gr.State("yolov8m.pt") # State for image model selection
312
+ with gr.Row(equal_height=False): # Allow columns to have different heights
313
+ # Left Column: Image Input & Controls
314
+ with gr.Column(scale=4, elem_classes="input-panel"):
315
+ with gr.Group():
316
+ gr.HTML('<div class="section-heading">Upload Image</div>')
317
+ image_input = gr.Image(type="pil", label="Upload an image", elem_classes="upload-box")
318
+
319
+ with gr.Accordion("Image Analysis Settings", open=False):
320
+ image_model_dropdown = gr.Dropdown(
321
+ choices=model_choices,
322
+ value="yolov8m.pt", # Default for images
323
+ label="Select Model",
324
+ info="Choose speed vs. accuracy (n=fast, m=balanced, x=accurate)"
325
+ )
326
+ # Display model info
327
+ image_model_info = gr.Markdown(DetectionModel.get_model_description("yolov8m.pt"))
328
 
329
+ image_confidence = gr.Slider(
330
+ minimum=0.1, maximum=0.9, value=0.25, step=0.05,
331
+ label="Confidence Threshold",
332
+ info="Minimum confidence for displaying a detected object"
333
+ )
334
+ with gr.Accordion("Filter Classes", open=False):
335
+ gr.HTML('<div class="section-heading" style="font-size: 1rem;">Common Categories</div>')
336
+ with gr.Row():
337
+ people_btn = gr.Button("People", size="sm")
338
+ vehicles_btn = gr.Button("Vehicles", size="sm")
339
+ animals_btn = gr.Button("Animals", size="sm")
340
+ objects_btn = gr.Button("Common Objects", size="sm")
341
+ image_class_filter = gr.Dropdown(
342
+ choices=class_choices_formatted, # Use formatted choices
343
+ multiselect=True,
344
+ label="Select Classes to Display",
345
+ info="Leave empty to show all detected objects"
346
+ )
347
+
348
+ image_detect_btn = gr.Button("Analyze Image", variant="primary", elem_classes="detect-btn")
349
+
350
+ with gr.Group(elem_classes="how-to-use"):
351
+ gr.HTML('<div class="section-heading">How to Use (Image)</div>')
352
+ gr.Markdown("""
353
+ 1. Upload an image or use the camera
354
+ 2. (Optional) Adjust settings like confidence threshold or model size (n, m, x)
355
+ 3. Optionally filter to specific object classes
356
+ 4. Click the **Analyze Image** button
357
+ """)
358
+ # Image Examples
359
+ gr.Examples(
360
+ examples=[
361
+ "room_01.jpg",
362
+ "room_02.jpg",
363
+ "street_02.jpg",
364
+ "street_04.jpg"
365
+ ],
366
+ inputs=image_input,
367
+ label="Example Images"
368
+ )
369
+
370
+ # Right Column: Image Results
371
+ with gr.Column(scale=6, elem_classes="output-panel"):
372
+ with gr.Tabs(elem_classes="tabs"):
373
+ with gr.Tab("Detection Result"):
374
+ image_result_image = gr.Image(type="pil", label="Detection Result")
375
+ gr.HTML('<div class="section-heading">Detection Details</div>')
376
+ image_result_text = gr.Textbox(label=None, lines=10, elem_id="detection-details", container=False)
377
+
378
+ with gr.Tab("Scene Understanding"):
379
+ gr.HTML('<div class="section-heading">Scene Analysis</div>')
380
+ gr.HTML("""
381
+ <details class="info-details" style="margin: 5px 0 15px 0;">
382
+ <summary style="padding: 8px; background-color: #f0f7ff; border-radius: 6px; border-left: 3px solid #4299e1; font-weight: bold; cursor: pointer; color: #2b6cb0;">
383
+ 🔍 The AI Vision Scout Report: Click for important notes about this analysis
384
+ </summary>
385
+ <div style="margin-top: 8px; padding: 10px; background-color: #f8f9fa; border-radius: 6px; border: 1px solid #e2e8f0;">
386
+ <p style="font-size: 13px; color: #718096; margin: 0;">
387
+ <b>About this analysis:</b> This analysis is the model's best guess based on visible objects.
388
+ Like human scouts, it sometimes gets lost or sees things that aren't there (but don't we all?).
389
+ Consider this an educated opinion rather than absolute truth. For critical applications, always verify with human eyes! 🧐
390
+ </p>
391
+ </div>
392
+ </details>
393
+ """)
394
+
395
+ # Wrap HTML description for potential styling
396
+ image_scene_description_html = gr.HTML(label=None, elem_id="scene_analysis_description_text")
397
+
398
+ with gr.Row():
399
+ with gr.Column(scale=1):
400
+ gr.HTML('<div class="section-heading" style="font-size:1rem; text-align:left;">Possible Activities</div>')
401
+ image_activities_list = gr.Dataframe(headers=["Activity"], datatype=["str"], row_count=5, col_count=1, wrap=True)
402
+
403
+ with gr.Column(scale=1):
404
+ gr.HTML('<div class="section-heading" style="font-size:1rem; text-align:left;">Safety Concerns</div>')
405
+ image_safety_list = gr.Dataframe(headers=["Concern"], datatype=["str"], row_count=5, col_count=1, wrap=True)
406
+
407
+ gr.HTML('<div class="section-heading">Functional Zones</div>')
408
+ image_zones_json = gr.JSON(label=None, elem_classes="json-box")
409
+
410
+ gr.HTML('<div class="section-heading">Lighting Conditions</div>')
411
+ image_lighting_info = gr.JSON(label=None, elem_classes="json-box")
412
+
413
+ with gr.Tab("Statistics"):
414
+ with gr.Row():
415
+ with gr.Column(scale=3, elem_classes="plot-column"):
416
+ gr.HTML('<div class="section-heading">Object Distribution</div>')
417
+ image_plot_output = gr.Plot(label=None, elem_classes="large-plot-container")
418
+ with gr.Column(scale=2, elem_classes="stats-column"):
419
+ gr.HTML('<div class="section-heading">Detection Statistics</div>')
420
+ image_stats_json = gr.JSON(label=None, elem_classes="enhanced-json-display")
421
+
422
+ # Tab 2: Video Processing
423
+ with gr.Tab("Video Processing"):
424
+ with gr.Row(equal_height=False):
425
+ # Left Column: Video Input & Controls
426
+ with gr.Column(scale=4, elem_classes="input-panel"):
427
+ with gr.Group():
428
+ gr.HTML('<div class="section-heading">Video Input</div>')
429
+
430
+ # Add input type selection
431
+ video_input_type = gr.Radio(
432
+ ["upload", "url"],
433
+ label="Input Method",
434
+ value="upload",
435
+ info="Choose how to provide the video"
436
  )
437
 
438
+ # File upload (will be shown/hidden based on selection)
439
+ with gr.Group(elem_id="upload-video-group"):
440
+ video_input = gr.Video(
441
+ label="Upload a video file (MP4, AVI, MOV)",
442
+ sources=["upload"],
443
+ visible=True
444
+ )
445
 
446
+ # URL input (will be shown/hidden based on selection)
447
+ with gr.Group(elem_id="url-video-group"):
448
+ video_url_input = gr.Textbox(
449
+ label="Enter video URL (YouTube or direct video link)",
450
+ placeholder="https://www.youtube.com/watch?v=...",
451
+ visible=False,
452
+ elem_classes="custom-video-url-input"
453
+ )
454
+ gr.HTML("""
455
+ <div style="padding: 8px; margin-top: 5px; background-color: #fff8f8; border-radius: 4px; border-left: 3px solid #f87171; font-size: 12px;">
456
+ <p style="margin: 0; color: #4b5563;">
457
+ Note: Currently only YouTube URLs are supported. Maximum video duration is 10 minutes.
 
 
458
  </p>
459
  </div>
460
+ """)
461
+
462
+ with gr.Accordion("Video Analysis Settings", open=True):
463
+ video_model_dropdown = gr.Dropdown(
464
+ choices=model_choices,
465
+ value="yolov8n.pt", # Default 'n' for video
466
+ label="Select Model (Video)",
467
+ info="Faster models (like 'n') are recommended"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
468
  )
469
+ video_confidence = gr.Slider(
470
+ minimum=0.1, maximum=0.9, value=0.4, step=0.05,
471
+ label="Confidence Threshold (Video)"
 
 
 
 
472
  )
473
+ video_process_interval = gr.Slider(
474
+ minimum=1, maximum=60, value=10, step=1, # Allow up to 60 frame interval
475
+ label="Processing Interval (Frames)",
476
+ info="Analyze every Nth frame (higher value = faster)"
477
+ )
478
+ video_process_btn = gr.Button("Process Video", variant="primary", elem_classes="detect-btn")
479
+
480
+ with gr.Group(elem_classes="how-to-use"):
481
+ gr.HTML('<div class="section-heading">How to Use (Video)</div>')
482
+ gr.Markdown("""
483
+ 1. Choose your input method: Upload a file or enter a URL.
484
+ 2. Adjust settings if needed (using a faster model and larger interval is recommended for longer videos).
485
+ 3. Click "Process Video". **Processing can take a significant amount of time.**
486
+ 4. The annotated video and summary will appear on the right when finished.
487
+ """)
488
 
489
+ # Add video examples
490
+ gr.HTML('<div class="section-heading">Example Videos</div>')
491
+ gr.HTML("""
492
+ <div style="padding: 10px; background-color: #f0f7ff; border-radius: 6px; margin-bottom: 15px;">
493
+ <p style="font-size: 14px; color: #4A5568; margin: 0;">
494
+ Upload any video containing objects that YOLO can detect. For testing, find sample videos
495
+ <a href="https://www.pexels.com/search/videos/street/" target="_blank" style="color: #3182ce; text-decoration: underline;">here</a>.
496
+ </p>
497
+ </div>
498
+ """)
499
+
500
+ # Right Column: Video Results
501
+ with gr.Column(scale=6, elem_classes="output-panel video-result-panel"):
502
+ gr.HTML("""
503
+ <div class="section-heading">Video Result</div>
504
+ <details class="info-details" style="margin: 5px 0 15px 0;">
505
+ <summary style="padding: 8px; background-color: #f0f7ff; border-radius: 6px; border-left: 3px solid #4299e1; font-weight: bold; cursor: pointer; color: #2b6cb0;">
506
+ 🎬 Video Processing Notes
507
+ </summary>
508
+ <div style="margin-top: 8px; padding: 10px; background-color: #f8f9fa; border-radius: 6px; border: 1px solid #e2e8f0;">
509
+ <p style="font-size: 13px; color: #718096; margin: 0;">
510
+ The processed video includes bounding boxes around detected objects. For longer videos,
511
+ consider using a faster model (like YOLOv8n) and a higher frame interval to reduce processing time.
512
+ </p>
513
+ </div>
514
+ </details>
515
+ """)
516
+ video_output = gr.Video(label="Processed Video", elem_classes="video-output-container") # Output for the processed video file
517
+
518
+ gr.HTML('<div class="section-heading">Processing Summary</div>')
519
+ # 使用HTML顯示影片的摘要
520
+ video_summary_text = gr.HTML(
521
+ label=None,
522
+ elem_id="video-summary-html-output"
523
+ )
524
 
525
+ gr.HTML('<div class="section-heading">Aggregated Statistics</div>')
526
+ video_stats_json = gr.JSON(label=None, elem_classes="video-stats-display") # Display statistics
 
 
 
527
 
528
+ # Event Listeners
529
+ # Image Model Change Handler
530
+ image_model_dropdown.change(
531
+ fn=lambda model: (model, DetectionModel.get_model_description(model)),
532
+ inputs=[image_model_dropdown],
533
+ outputs=[current_image_model, image_model_info] # Update state and description
534
  )
535
 
536
+ # Image Filter Buttons
537
+ available_classes_list = get_all_classes() # Get list of (id, name)
538
+ people_classes_ids = [0]
539
+ vehicles_classes_ids = [1, 2, 3, 4, 5, 6, 7, 8]
540
+ animals_classes_ids = list(range(14, 24))
541
+ common_objects_ids = [39, 41, 42, 43, 44, 45, 56, 57, 60, 62, 63, 67, 73] # Bottle, cup, fork, knife, spoon, bowl, chair, couch, table, tv, laptop, phone, book
542
+
543
+ people_btn.click(lambda: [f"{id}: {name}" for id, name in available_classes_list if id in people_classes_ids], outputs=image_class_filter)
544
+ vehicles_btn.click(lambda: [f"{id}: {name}" for id, name in available_classes_list if id in vehicles_classes_ids], outputs=image_class_filter)
545
+ animals_btn.click(lambda: [f"{id}: {name}" for id, name in available_classes_list if id in animals_classes_ids], outputs=image_class_filter)
546
+ objects_btn.click(lambda: [f"{id}: {name}" for id, name in available_classes_list if id in common_objects_ids], outputs=image_class_filter)
547
+
548
+ video_input_type.change(
549
+ fn=lambda input_type: [
550
+ # Show/hide file upload
551
+ gr.update(visible=(input_type == "upload")),
552
+ # Show/hide URL input
553
+ gr.update(visible=(input_type == "url"))
554
+ ],
555
+ inputs=[video_input_type],
556
+ outputs=[video_input, video_url_input]
557
  )
558
 
559
+ # Image Processing Button Click
560
+ image_detect_btn.click(
561
+ fn=handle_image_upload,
562
+ inputs=[image_input, image_model_dropdown, image_confidence, image_class_filter],
563
+ outputs=[
564
+ image_result_image, image_result_text, image_stats_json, image_plot_output,
565
+ image_scene_description_html, image_activities_list, image_safety_list, image_zones_json,
566
+ image_lighting_info
567
+ ]
568
  )
569
 
570
+ video_process_btn.click(
571
+ fn=handle_video_upload,
572
+ inputs=[
573
+ video_input,
574
+ video_url_input,
575
+ video_input_type,
576
+ video_model_dropdown,
577
+ video_confidence,
578
+ video_process_interval
579
+ ],
580
+ outputs=[video_output, video_summary_text, video_stats_json]
 
 
 
581
  )
582
 
 
583
  # Footer
584
  gr.HTML("""
585
+ <div class="footer" style="padding: 25px 0; text-align: center; background: linear-gradient(to right, #f5f9fc, #e1f5fe); border-top: 1px solid #e2e8f0; margin-top: 30px;">
586
+ <div style="margin-bottom: 15px;">
587
+ <p style="font-size: 14px; color: #4A5568; margin: 5px 0;">Powered by YOLOv8, CLIP and Ultralytics • Created with Gradio</p>
588
+ </div>
589
+ <div style="display: flex; align-items: center; justify-content: center; gap: 20px; margin-top: 15px;">
590
+ <p style="font-family: 'Arial', sans-serif; font-size: 14px; font-weight: 500; letter-spacing: 2px; background: linear-gradient(90deg, #38b2ac, #4299e1); -webkit-background-clip: text; -webkit-text-fill-color: transparent; margin: 0; text-transform: uppercase; display: inline-block;">EXPLORE THE CODE →</p>
591
+ <a href="https://github.com/Eric-Chung-0511/Learning-Record/tree/main/Data%20Science%20Projects/VisionScout" target="_blank" style="text-decoration: none;">
592
+ <img src="https://img.shields.io/badge/GitHub-VisionScout-4299e1?logo=github&style=for-the-badge">
593
+ </a>
594
+ </div>
595
+ </div>
596
+ """)
 
 
597
 
598
  return demo
599
 
600
+
601
  if __name__ == "__main__":
602
+ demo_interface = create_interface()
603
 
604
+ demo_interface.launch()
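Taken together, the URL path of the new feature works like this minimal usage sketch (the URL is a hypothetical placeholder; download_video_from_url and video_processor are defined above, and yt-dlp is presumably one of the two additions to requirements.txt):

# Minimal usage sketch of the new URL pipeline
video_path, error = download_video_from_url(
    "https://www.youtube.com/watch?v=VIDEO_ID",  # hypothetical YouTube URL
    max_duration_minutes=10,
)
if error:
    print(f"Download failed: {error}")
else:
    # Same call handle_video_upload makes once it has a local file
    output_path, summary, stats = video_processor.process_video_file(
        video_path=video_path,
        model_name="yolov8n.pt",      # fast model recommended for video
        confidence_threshold=0.4,
        process_interval=10,          # analyze every 10th frame
    )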
 
clip_analyzer.py CHANGED
@@ -3,6 +3,7 @@ import clip
3
  import numpy as np
4
  from PIL import Image
5
  from typing import Dict, List, Tuple, Any, Optional, Union
 
6
  from clip_prompts import (
7
  SCENE_TYPE_PROMPTS,
8
  CULTURAL_SCENE_PROMPTS,
@@ -24,7 +25,7 @@ class CLIPAnalyzer:
24
  Initialize the CLIP analyzer.
25
 
26
  Args:
27
- model_name: CLIP Model name, "ViT-B/32"、"ViT-B/16"、"ViT-L/14"
28
  device: Use GPU if available
29
  """
30
  # Automatically select the device
 
3
  import numpy as np
4
  from PIL import Image
5
  from typing import Dict, List, Tuple, Any, Optional, Union
6
+
7
  from clip_prompts import (
8
  SCENE_TYPE_PROMPTS,
9
  CULTURAL_SCENE_PROMPTS,
 
25
  Initialize the CLIP analyzer.
26
 
27
  Args:
28
+ model_name: CLIP model name: "ViT-B/32", "ViT-B/16", or "ViT-L/14"
29
  device: Use GPU if available
30
  """
31
  # Automatically select the device
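For context, the model_name values listed in this docstring are the standard OpenAI CLIP checkpoint names, and loading one follows the usual clip.load pattern (a small illustrative snippet mirroring the analyzer's automatic device selection):

import torch
import clip

# Automatically select the device, as CLIPAnalyzer.__init__ does
device = "cuda" if torch.cuda.is_available() else "cpu"

# Any of "ViT-B/32", "ViT-B/16", "ViT-L/14" is a valid model_name
model, preprocess = clip.load("ViT-B/16", device=device)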
enhance_scene_describer.py CHANGED
@@ -126,7 +126,7 @@ class EnhancedSceneDescriber:
126
  }
127
  }
128
 
129
- # Cultural templates
130
  if "cultural_templates" not in templates:
131
  templates["cultural_templates"] = {
132
  "asian": {
@@ -164,8 +164,8 @@ class EnhancedSceneDescriber:
164
  "elevated_threshold": 0.6, # Objects mostly in middle/bottom
165
  "elevated_top_threshold": 0.3 # Few objects at top of frame
166
  }
167
-
168
-
169
  def generate_description(self,
170
  scene_type: str,
171
  detected_objects: List[Dict],
@@ -190,26 +190,23 @@ class EnhancedSceneDescriber:
190
  """
191
  # Handle unknown scene type or very low confidence
192
  if scene_type == "unknown" or confidence < 0.4:
193
- return self._generate_generic_description(detected_objects, lighting_info)
194
 
195
  # Detect viewpoint
196
- viewpoint = self._detect_viewpoint(detected_objects)
197
 
 
198
  if viewpoint == "aerial":
199
- # For intersection-related scenes, make sure the correct aerial-view intersection scene type is used
200
  if "intersection" in scene_type or self._is_intersection(detected_objects):
201
  scene_type = "aerial_view_intersection"
202
- # For commercial-area-related scenes
203
  elif any(keyword in scene_type for keyword in ["commercial", "shopping", "retail"]):
204
  scene_type = "aerial_view_commercial_area"
205
- # For plaza-related scenes
206
  elif any(keyword in scene_type for keyword in ["plaza", "square"]):
207
  scene_type = "aerial_view_plaza"
208
- # Other aerial-view scenes default to intersection
209
  else:
210
  scene_type = "aerial_view_intersection"
211
 
212
- # Detect cultural context - only for non-aerial viewpoints
213
  cultural_context = None
214
  if viewpoint != "aerial":
215
  cultural_context = self._detect_cultural_context(scene_type, detected_objects)
@@ -224,7 +221,6 @@ class EnhancedSceneDescriber:
224
 
225
  # Get base description for the scene type
226
  if viewpoint == "aerial":
227
- # Use the preset base description for aerial viewpoints
228
  if 'base_description' not in locals():
229
  base_description = "An aerial view showing the layout and movement patterns from above"
230
  elif scene_type in self.scene_types:
@@ -240,25 +236,38 @@ class EnhancedSceneDescriber:
240
  viewpoint
241
  )
242
 
243
- # Fix: improve the description based on the people count
244
- people_objs = [obj for obj in detected_objects if obj["class_id"] == 0] # person
245
  if people_objs:
246
  people_count = len(people_objs)
247
  if people_count > 5:
248
- # Use more precise wording when there are many people
249
  people_phrase = f"numerous people ({people_count})"
250
  else:
251
  people_phrase = f"{people_count} {'people' if people_count > 1 else 'person'}"
252
 
253
- # 將人數信息加入到場景詳情中
254
- if "people" not in scene_details.lower() and "pedestrian" not in scene_details.lower():
255
- scene_details += f" The scene includes {people_phrase}."
256
 
257
- # Apply cultural context if detected (只在非空中視角時應用)
258
- if cultural_context and scene_details and viewpoint != "aerial":
259
  cultural_elements = self._generate_cultural_elements(cultural_context)
260
  if cultural_elements:
261
- scene_details += f" {cultural_elements}"
 
 
 
 
 
262
 
263
  # Include lighting information if available
264
  lighting_description = ""
@@ -267,22 +276,25 @@ class EnhancedSceneDescriber:
267
  if lighting_type in self.templates.get("lighting_templates", {}):
268
  lighting_description = self.templates["lighting_templates"][lighting_type]
269
 
270
- # Apply confidence template
271
- description_template = self.templates["confidence_templates"].get(
272
- confidence_level, "{description} {details}"
273
- )
274
-
275
- # Fill the template
276
- description = description_template.format(
277
- description=base_description,
278
- details=scene_details
279
- )
280
 
281
- # Add viewpoint observation if viewpoint is not standard
282
  if viewpoint != "eye_level" and viewpoint in self.templates.get("viewpoint_templates", {}):
283
  viewpoint_template = self.templates["viewpoint_templates"][viewpoint]
284
 
285
- # 在空中視角時,確保觀察描述反映更多細節
 
 
 
 
 
 
 
 
 
 
286
  if viewpoint == "aerial":
287
  scene_elements = "the crossing patterns and pedestrian movement"
288
  else:
@@ -292,93 +304,269 @@ class EnhancedSceneDescriber:
292
  scene_elements=scene_elements
293
  )
294
 
295
- # Add viewpoint prefix if needed
296
- if not description.startswith(viewpoint_template.get("prefix", "")):
297
- description = f"{viewpoint_template.get('prefix', '')}{description}"
298
-
299
  # Add viewpoint observation if not already included
300
- if viewpoint_desc not in description:
301
- description += f" {viewpoint_desc}"
302
-
303
- # Add lighting description if available
304
- if lighting_description and lighting_description not in description:
305
- description += f" {lighting_description}"
306
 
307
  # Add information about functional zones if available
308
  if functional_zones and len(functional_zones) > 0:
309
  zones_desc = self._describe_functional_zones(functional_zones)
310
  if zones_desc:
311
- description += f" {zones_desc}"
312
 
313
- # Count the actual number of people
314
  people_count = len([obj for obj in detected_objects if obj["class_id"] == 0])
315
 
316
- # Check the description for contradictory people counts
317
  if people_count > 5:
318
- # Identify fragments that may mention a smaller people count
319
  small_people_patterns = [
320
  r"Area with \d+ people\.",
321
  r"Area with \d+ person\.",
322
  r"with \d+ people",
323
  r"with \d+ person"
324
  ]
325
- # Check each pattern and remove matches
 
326
  filtered_description = description
327
  for pattern in small_people_patterns:
328
  matches = re.findall(pattern, filtered_description)
329
  for match in matches:
330
- # Extract the people count from the match
331
  number_match = re.search(r'\d+', match)
332
  if number_match:
333
  try:
334
  people_mentioned = int(number_match.group())
335
- # If the mentioned count is less than the total, remove the whole sentence
336
  if people_mentioned < people_count:
337
- # Split the description into sentences
338
  sentences = re.split(r'(?<=[.!?])\s+', filtered_description)
339
- # Drop sentences containing the matched fragment
340
  filtered_sentences = []
341
  for sentence in sentences:
342
  if match not in sentence:
343
  filtered_sentences.append(sentence)
344
- # Reassemble the description
345
  filtered_description = " ".join(filtered_sentences)
346
  except ValueError:
347
- # Number conversion failed; keep going
348
  continue
349
 
350
- # Use the filtered description
351
  description = filtered_description
352
353
  return description
354
355
  def _is_intersection(self, detected_objects: List[Dict]) -> bool:
356
  """
357
  Determine whether the scene is an intersection by analyzing the object distribution
358
  """
359
  # Check pedestrian distribution patterns
360
  pedestrians = [obj for obj in detected_objects if obj["class_id"] == 0]
361
-
362
  if len(pedestrians) >= 8: # enough pedestrians are needed to form an intersection
363
  # Collect pedestrian positions
364
  positions = [obj.get("normalized_center", (0, 0)) for obj in pedestrians]
365
-
366
  # Analyze the x and y coordinate distributions
367
  x_coords = [pos[0] for pos in positions]
368
  y_coords = [pos[1] for pos in positions]
369
-
370
  # Compute the variance of the x and y coordinates
371
  x_variance = np.var(x_coords) if len(x_coords) > 1 else 0
372
  y_variance = np.var(y_coords) if len(y_coords) > 1 else 0
373
-
374
  # Compute the coordinate ranges
375
  x_range = max(x_coords) - min(x_coords)
376
  y_range = max(y_coords) - min(y_coords)
377
-
378
  # Large, similar ranges in both x and y suggest an intersection
379
  if x_range > 0.5 and y_range > 0.5 and 0.7 < (x_range / y_range) < 1.3:
380
  return True
381
-
382
  return False
383
 
384
  def _generate_generic_description(self, detected_objects: List[Dict], lighting_info: Optional[Dict] = None) -> str:
@@ -1165,27 +1353,27 @@ class EnhancedSceneDescriber:
1165
  Optimize object descriptions and avoid listing the same item repeatedly
1166
  """
1167
  import re
1168
-
1169
  # Handle repeated bed descriptions
1170
  if "bed in the room" in description:
1171
  description = description.replace("a bed in the room", "a bed")
1172
-
1173
  # Handle repeated item lists
1174
  # Look for patterns like "item, item, item"
1175
  object_lists = re.findall(r'with ([^\.]+?)(?:\.|\band\b)', description)
1176
-
1177
  for obj_list in object_lists:
1178
  # Count occurrences of each item
1179
  items = re.findall(r'([a-zA-Z\s]+)(?:,|\band\b|$)', obj_list)
1180
  item_counts = {}
1181
-
1182
  for item in items:
1183
  item = item.strip()
1184
  if item and item not in ["and", "with"]:
1185
  if item not in item_counts:
1186
  item_counts[item] = 0
1187
  item_counts[item] += 1
1188
-
1189
  # Generate the optimized item list
1190
  if item_counts:
1191
  new_items = []
@@ -1194,7 +1382,7 @@ class EnhancedSceneDescriber:
1194
  new_items.append(f"{count} {item}s")
1195
  else:
1196
  new_items.append(item)
1197
-
1198
  # Format the new list
1199
  if len(new_items) == 1:
1200
  new_list = new_items[0]
@@ -1202,10 +1390,10 @@ class EnhancedSceneDescriber:
1202
  new_list = f"{new_items[0]} and {new_items[1]}"
1203
  else:
1204
  new_list = ", ".join(new_items[:-1]) + f", and {new_items[-1]}"
1205
-
1206
  # Replace the original list
1207
  description = description.replace(obj_list, new_list)
1208
-
1209
  return description
1210
 
1211
  def _describe_functional_zones(self, functional_zones: Dict) -> str:
@@ -1288,7 +1476,7 @@ class EnhancedSceneDescriber:
1288
 
1289
  # Generate the final description based on the number of processed zones
1290
  final_desc = ""
1291
-
1292
  if len(processed_zones) == 1:
1293
  _, zone_info = processed_zones[0]
1294
  zone_desc = zone_info["description"]
 
126
  }
127
  }
128
 
129
+ # Cultural templates
130
  if "cultural_templates" not in templates:
131
  templates["cultural_templates"] = {
132
  "asian": {
 
164
  "elevated_threshold": 0.6, # Objects mostly in middle/bottom
165
  "elevated_top_threshold": 0.3 # Few objects at top of frame
166
  }
167
+
168
+
169
  def generate_description(self,
170
  scene_type: str,
171
  detected_objects: List[Dict],
 
190
  """
191
  # Handle unknown scene type or very low confidence
192
  if scene_type == "unknown" or confidence < 0.4:
193
+ return self._format_final_description(self._generate_generic_description(detected_objects, lighting_info))
194
 
195
  # Detect viewpoint
196
+ viewpoint = self._detect_viewpoint(detected_objects)
197
 
198
+ # Process aerial viewpoint scene types
199
  if viewpoint == "aerial":
 
200
  if "intersection" in scene_type or self._is_intersection(detected_objects):
201
  scene_type = "aerial_view_intersection"
 
202
  elif any(keyword in scene_type for keyword in ["commercial", "shopping", "retail"]):
203
  scene_type = "aerial_view_commercial_area"
 
204
  elif any(keyword in scene_type for keyword in ["plaza", "square"]):
205
  scene_type = "aerial_view_plaza"
 
206
  else:
207
  scene_type = "aerial_view_intersection"
208
 
209
+ # Detect cultural context - only for non-aerial viewpoints
210
  cultural_context = None
211
  if viewpoint != "aerial":
212
  cultural_context = self._detect_cultural_context(scene_type, detected_objects)
 
221
 
222
  # Get base description for the scene type
223
  if viewpoint == "aerial":
 
224
  if 'base_description' not in locals():
225
  base_description = "An aerial view showing the layout and movement patterns from above"
226
  elif scene_type in self.scene_types:
 
236
  viewpoint
237
  )
238
 
239
+ # Start with the base description
240
+ description = base_description
241
+
242
+ # If there's a secondary description from the scene type template, append it properly
243
+ if scene_type in self.scene_types and "secondary_description" in self.scene_types[scene_type]:
244
+ secondary_desc = self.scene_types[scene_type]["secondary_description"]
245
+ if secondary_desc:
246
+ description = self._smart_append(description, secondary_desc)
247
+
248
+ # Improve description based on people count
249
+ people_objs = [obj for obj in detected_objects if obj["class_id"] == 0] # Person class
250
  if people_objs:
251
  people_count = len(people_objs)
252
  if people_count > 5:
 
253
  people_phrase = f"numerous people ({people_count})"
254
  else:
255
  people_phrase = f"{people_count} {'people' if people_count > 1 else 'person'}"
256
 
257
+ # Add people information to the scene details if not already mentioned
258
+ if "people" not in description.lower() and "pedestrian" not in description.lower():
259
+ description = self._smart_append(description, f"The scene includes {people_phrase}")
260
 
261
+ # Apply cultural context if detected (only for non-aerial viewpoints)
262
+ if cultural_context and viewpoint != "aerial":
263
  cultural_elements = self._generate_cultural_elements(cultural_context)
264
  if cultural_elements:
265
+ description = self._smart_append(description, cultural_elements)
266
+
267
+ # Now append the detailed scene information if available
268
+ if scene_details:
269
+ # Use smart_append to ensure proper formatting between base description and details
270
+ description = self._smart_append(description, scene_details)
271
 
272
  # Include lighting information if available
273
  lighting_description = ""
 
276
  if lighting_type in self.templates.get("lighting_templates", {}):
277
  lighting_description = self.templates["lighting_templates"][lighting_type]
278
 
279
+ # Add lighting description if available
280
+ if lighting_description and lighting_description not in description:
281
+ description = self._smart_append(description, lighting_description)
 
282
 
283
+ # Process viewpoint information
284
  if viewpoint != "eye_level" and viewpoint in self.templates.get("viewpoint_templates", {}):
285
  viewpoint_template = self.templates["viewpoint_templates"][viewpoint]
286
 
287
+ # Special handling for viewpoint prefix
288
+ prefix = viewpoint_template.get('prefix', '')
289
+ if prefix and not description.startswith(prefix):
290
+ # Prefix is a phrase like "From above, " that should precede the description
291
+ if description and description[0].isupper():
292
+ # Maintain the flow by lowercasing the first letter after the prefix
293
+ description = prefix + description[0].lower() + description[1:]
294
+ else:
295
+ description = prefix + description
296
+
297
+ # Get appropriate scene elements description based on viewpoint
298
  if viewpoint == "aerial":
299
  scene_elements = "the crossing patterns and pedestrian movement"
300
  else:
 
304
  scene_elements=scene_elements
305
  )
306
 
 
 
 
 
307
  # Add viewpoint observation if not already included
308
+ if viewpoint_desc and viewpoint_desc not in description:
309
+ description = self._smart_append(description, viewpoint_desc)
 
 
 
 
310
 
311
  # Add information about functional zones if available
312
  if functional_zones and len(functional_zones) > 0:
313
  zones_desc = self._describe_functional_zones(functional_zones)
314
  if zones_desc:
315
+ description = self._smart_append(description, zones_desc)
316
 
317
+ # Calculate actual people count
318
  people_count = len([obj for obj in detected_objects if obj["class_id"] == 0])
319
 
320
+ # Check for inconsistencies in people count descriptions
321
  if people_count > 5:
322
+ # Identify fragments that might contain smaller people counts
323
  small_people_patterns = [
324
  r"Area with \d+ people\.",
325
  r"Area with \d+ person\.",
326
  r"with \d+ people",
327
  r"with \d+ person"
328
  ]
329
+
330
+ # Check and remove each pattern
331
  filtered_description = description
332
  for pattern in small_people_patterns:
333
  matches = re.findall(pattern, filtered_description)
334
  for match in matches:
335
+ # Extract the number from the match
336
  number_match = re.search(r'\d+', match)
337
  if number_match:
338
  try:
339
  people_mentioned = int(number_match.group())
340
+ # If the mentioned count is less than total, remove the entire sentence
341
  if people_mentioned < people_count:
342
+ # Split description into sentences
343
  sentences = re.split(r'(?<=[.!?])\s+', filtered_description)
344
+ # Remove sentences containing the match
345
  filtered_sentences = []
346
  for sentence in sentences:
347
  if match not in sentence:
348
  filtered_sentences.append(sentence)
349
+ # Recombine the description
350
  filtered_description = " ".join(filtered_sentences)
351
  except ValueError:
352
+ # Failed number conversion, continue processing
353
  continue
354
 
355
+ # Use the filtered description
356
  description = filtered_description
357
 
358
+ # Final formatting to ensure correct punctuation and capitalization
359
+ description = self._format_final_description(description)
360
+
361
  return description
362
 
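As a concrete illustration of the count-consistency pass above: with people_count = 12, a zone fragment such as "Area with 3 people." matches the first pattern, 3 is less than 12, so the whole containing sentence is dropped from the description, while fragments that mention 12 or more people are kept.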
363
+ def _smart_append(self, current_text: str, new_fragment: str) -> str:
364
+ """
365
+ Intelligently append a new text fragment to the current text,
366
+ handling punctuation and capitalization correctly.
367
+
368
+ Args:
369
+ current_text: The existing text to append to
370
+ new_fragment: The new text fragment to append
371
+
372
+ Returns:
373
+ str: The combined text with proper formatting
374
+ """
375
+ # Handle empty cases
376
+ if not new_fragment:
377
+ return current_text
378
+
379
+ if not current_text:
380
+ # Ensure first character is uppercase for the first fragment
381
+ return new_fragment[0].upper() + new_fragment[1:] if new_fragment else ""
382
+
383
+ # Clean up existing text
384
+ current_text = current_text.rstrip()
385
+
386
+ # Check for ending punctuation
387
+ ends_with_sentence = current_text.endswith(('.', '!', '?'))
388
+ ends_with_comma = current_text.endswith(',')
389
+
390
+ # Specifically handle the "A xxx A yyy" pattern that's causing issues
391
+ if (current_text.startswith("A ") or current_text.startswith("An ")) and \
392
+ (new_fragment.startswith("A ") or new_fragment.startswith("An ")):
393
+ return current_text + ". " + new_fragment
394
+
395
+ # Decide how to join the texts
396
+ if ends_with_sentence:
397
+ # After a sentence, start with uppercase and add proper spacing
398
+ joined_text = current_text + " " + (new_fragment[0].upper() + new_fragment[1:])
399
+ elif ends_with_comma:
400
+ # After a comma, maintain flow with lowercase unless it's a proper noun or special case
401
+ if new_fragment.startswith(('I ', 'I\'', 'A ', 'An ', 'The ')) or new_fragment[0].isupper():
402
+ joined_text = current_text + " " + new_fragment
403
+ else:
404
+ joined_text = current_text + " " + new_fragment[0].lower() + new_fragment[1:]
405
+ elif "scene is" in new_fragment.lower() or "scene includes" in new_fragment.lower():
406
+ # When adding a new sentence about the scene, use a period
407
+ joined_text = current_text + ". " + new_fragment
408
+ else:
409
+ # For other cases, decide based on the content
410
+ if self._is_related_phrases(current_text, new_fragment):
411
+ if new_fragment.startswith(('I ', 'I\'', 'A ', 'An ', 'The ')) or new_fragment[0].isupper():
412
+ joined_text = current_text + ", " + new_fragment
413
+ else:
414
+ joined_text = current_text + ", " + new_fragment[0].lower() + new_fragment[1:]
415
+ else:
416
+ # Use period for unrelated phrases
417
+ joined_text = current_text + ". " + (new_fragment[0].upper() + new_fragment[1:])
418
+
419
+ return joined_text
420
+
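A quick sketch of how these joining rules behave in practice (illustrative only; `describer` stands for any EnhancedSceneDescriber instance):

    text = describer._smart_append("A busy urban intersection", "with numerous pedestrians")
    # -> "A busy urban intersection, with numerous pedestrians"   (connecting word "with" -> comma join)
    text = describer._smart_append(text, "The scene includes 12 people")
    # -> "...numerous pedestrians. The scene includes 12 people"  ("scene includes" forces a new sentence)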
421
+ def _is_related_phrases(self, text1: str, text2: str) -> bool:
422
+ """
423
+ Determine if two phrases are related and should be connected with a comma
424
+ rather than separated with a period.
425
+
426
+ Args:
427
+ text1: The first text fragment
428
+ text2: The second text fragment to be appended
429
+
430
+ Returns:
431
+ bool: Whether the phrases appear to be related
432
+ """
433
+ # Check if either phrase starts with "A" or "An" - these are likely separate descriptions
434
+ if (text1.startswith("A ") or text1.startswith("An ")) and \
435
+ (text2.startswith("A ") or text2.startswith("An ")):
436
+ return False # These are separate descriptions, not related phrases
437
+
438
+ # Check if the second phrase starts with a connecting word
439
+ connecting_words = ["which", "where", "who", "whom", "whose", "with", "without",
440
+ "this", "these", "that", "those", "and", "or", "but"]
441
+
442
+ first_word = text2.split()[0].lower() if text2 else ""
443
+ if first_word in connecting_words:
444
+ return True
445
+
446
+ # Check if the first phrase ends with something that suggests continuity
447
+ ending_patterns = ["such as", "including", "like", "especially", "particularly",
448
+ "for example", "for instance", "namely", "specifically"]
449
+
450
+ for pattern in ending_patterns:
451
+ if text1.lower().endswith(pattern):
452
+ return True
453
+
454
+ # Check if both phrases are about the scene
455
+ if "scene" in text1.lower() and "scene" in text2.lower():
456
+ return False # Separate statements about the scene should be separate sentences
457
+
458
+ return False
459
+
460
+ def _format_final_description(self, text: str) -> str:
461
+ """
462
+ Format the final description text to ensure correct punctuation,
463
+ capitalization, and spacing.
464
+
465
+ Args:
466
+ text: The text to format
467
+
468
+ Returns:
469
+ str: The properly formatted text
470
+ """
471
+ import re
472
+
473
+ if not text:
474
+ return ""
475
+
476
+ # 1. Special handling for consecutive fragments starting with "A" (a common issue)
477
+ text = re.sub(r'(A\s[^.!?]+?)\s+(A\s)', r'\1. \2', text, flags=re.IGNORECASE)
478
+ text = re.sub(r'(An\s[^.!?]+?)\s+(An?\s)', r'\1. \2', text, flags=re.IGNORECASE)
479
+
480
+ # 2. Ensure the first letter is capitalized
481
+ text = text[0].upper() + text[1:] if text else ""
482
+
483
+ # 3. Fix spacing problems between words
484
+ text = re.sub(r'\s{2,}', ' ', text) # Collapse multiple spaces into one
485
+ text = re.sub(r'([a-z])([A-Z])', r'\1 \2', text) # Insert a space between a lowercase letter and a following uppercase letter
486
+
487
+ # 4. Fix word-concatenation problems
488
+ text = re.sub(r'([a-zA-Z])and', r'\1 and', text) # Add a space before "and"
489
+ text = re.sub(r'([a-zA-Z])with', r'\1 with', text) # Add a space before "with"
490
+ text = re.sub(r'plants(and|with|or)', r'plants \1', text) # Fix cases like "plantsand"
491
+
492
+ # 5. Fix capitalization after punctuation
493
+ text = re.sub(r'\.(\s+)([a-z])', lambda m: f'.{m.group(1)}{m.group(2).upper()}', text) # Capitalize after a period
494
+
495
+ # 6. Fix uppercase words that follow a comma
496
+ def fix_capitalization_after_comma(match):
497
+ word = match.group(2)
498
+ # Exceptions: keep capitalization for proper nouns, personal pronouns, etc.
499
+ if word in ["I", "I'm", "I've", "I'd", "I'll"]:
500
+ return match.group(0) # Keep as-is
501
+
502
+ # Keep capitalization for proper nouns such as months, weekdays, and place names
503
+ proper_nouns = ["January", "February", "March", "April", "May", "June", "July",
504
+ "August", "September", "October", "November", "December",
505
+ "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
506
+ if word in proper_nouns:
507
+ return match.group(0) # Keep as-is
508
+
509
+ # Otherwise: lowercase the first letter
510
+ return match.group(1) + word[0].lower() + word[1:]
511
+
512
+ # Match a comma followed by a space and an uppercase word
513
+ text = re.sub(r'(,\s+)([A-Z][a-zA-Z]*)', fix_capitalization_after_comma, text)
514
+
515
+
516
+ common_phrases = [
517
+ (r'Social or seating area', r'social or seating area'),
518
+ (r'Sleeping area', r'sleeping area'),
519
+ (r'Dining area', r'dining area'),
520
+ (r'Living space', r'living space')
521
+ ]
522
+
523
+ for phrase, replacement in common_phrases:
524
+ # Only adjust the term mid-sentence; keep sentence-initial capitalization
525
+ text = re.sub(r'(?<=[.!?]\s)' + phrase, replacement, text)
526
+ # Adjust the term after a comma while keeping sentence-initial capitalization
527
+ text = re.sub(r'(?<=,\s)' + phrase, replacement, text)
528
+
529
+ # 7. Ensure proper spacing around punctuation
530
+ text = re.sub(r'\s+([.,;:!?])', r'\1', text) # No space before punctuation
531
+ text = re.sub(r'([.,;:!?])([a-zA-Z0-9])', r'\1 \2', text) # Require a space after punctuation
532
+
533
+ # 8. Fix repeated punctuation
534
+ text = re.sub(r'\.{2,}', '.', text) # Collapse multiple periods into one
535
+ text = re.sub(r',{2,}', ',', text) # Collapse multiple commas into one
536
+
537
+ # 9. Ensure the text ends with punctuation
538
+ if text and not text[-1] in '.!?':
539
+ text += '.'
540
+
541
+ return text
542
+
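To make the rules above concrete, a sketch of the expected behavior: an input like "A busy street A large crowd gathers, Many vehicles wait" should come out as "A busy street. A large crowd gathers, many vehicles wait." Rule 1 splits the consecutive "A ..." fragments into separate sentences, rule 6 lowercases "Many" after the comma, and rule 9 appends the final period.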
543
  def _is_intersection(self, detected_objects: List[Dict]) -> bool:
544
  """
545
  Determine whether the scene is an intersection by analyzing the distribution of objects
546
  """
547
  # Check pedestrian distribution patterns
548
  pedestrians = [obj for obj in detected_objects if obj["class_id"] == 0]
549
+
550
  if len(pedestrians) >= 8: # Enough pedestrians are needed to form an intersection
551
  # Collect pedestrian positions
552
  positions = [obj.get("normalized_center", (0, 0)) for obj in pedestrians]
553
+
554
  # Analyze the x and y coordinate distributions
555
  x_coords = [pos[0] for pos in positions]
556
  y_coords = [pos[1] for pos in positions]
557
+
558
  # Compute the variance of the x and y coordinates
559
  x_variance = np.var(x_coords) if len(x_coords) > 1 else 0
560
  y_variance = np.var(y_coords) if len(y_coords) > 1 else 0
561
+
562
  # Compute the coordinate ranges
563
  x_range = max(x_coords) - min(x_coords)
564
  y_range = max(y_coords) - min(y_coords)
565
+
566
  # If both x and y spans are large and similar in size, the scene is likely an intersection
567
  if x_range > 0.5 and y_range > 0.5 and 0.7 < (x_range / y_range) < 1.3:
568
  return True
569
+
570
  return False
571
 
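To make these thresholds concrete: eight pedestrians whose normalized centers span x in [0.1, 0.9] and y in [0.15, 0.85] give x_range = 0.8, y_range = 0.7 and a ratio of about 1.14, which falls inside (0.7, 1.3), so the scene counts as an intersection; the same crowd strung along one sidewalk (y_range around 0.1) does not.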
572
  def _generate_generic_description(self, detected_objects: List[Dict], lighting_info: Optional[Dict] = None) -> str:
 
1353
  Optimize object descriptions to avoid listing the same item repeatedly
1354
  """
1355
  import re
1356
+
1357
  # Handle repeated bed descriptions
1358
  if "bed in the room" in description:
1359
  description = description.replace("a bed in the room", "a bed")
1360
+
1361
  # Handle repeated item lists
1362
  # Look for patterns of the form "item, item, item"
1363
  object_lists = re.findall(r'with ([^\.]+?)(?:\.|\band\b)', description)
1364
+
1365
  for obj_list in object_lists:
1366
  # Count how many times each item appears
1367
  items = re.findall(r'([a-zA-Z\s]+)(?:,|\band\b|$)', obj_list)
1368
  item_counts = {}
1369
+
1370
  for item in items:
1371
  item = item.strip()
1372
  if item and item not in ["and", "with"]:
1373
  if item not in item_counts:
1374
  item_counts[item] = 0
1375
  item_counts[item] += 1
1376
+
1377
  # Generate the optimized item list
1378
  if item_counts:
1379
  new_items = []
 
1382
  new_items.append(f"{count} {item}s")
1383
  else:
1384
  new_items.append(item)
1385
+
1386
  # Format the new list
1387
  if len(new_items) == 1:
1388
  new_list = new_items[0]
 
1390
  new_list = f"{new_items[0]} and {new_items[1]}"
1391
  else:
1392
  new_list = ", ".join(new_items[:-1]) + f", and {new_items[-1]}"
1393
+
1394
  # Replace the original list
1395
  description = description.replace(obj_list, new_list)
1396
+
1397
  return description
1398
 
1399
  def _describe_functional_zones(self, functional_zones: Dict) -> str:
 
1476
 
1477
  # Generate the final description based on the number of processed zones
1478
  final_desc = ""
1479
+
1480
  if len(processed_zones) == 1:
1481
  _, zone_info = processed_zones[0]
1482
  zone_desc = zone_info["description"]
lighting_analyzer.py CHANGED
@@ -151,11 +151,11 @@ class LightingAnalyzer:
151
 
152
  avg_saturation = np.mean(s_channel)
153
 
154
- # Sky brightness
155
  upper_half = v_channel[:height//2, :]
156
  sky_brightness = np.mean(upper_half)
157
 
158
- # Color tone analysis
159
  warm_colors = ((h_channel >= 0) & (h_channel <= 60)) | (h_channel >= 300)
160
  warm_ratio = np.sum(warm_colors) / (height * width)
161
 
@@ -186,16 +186,16 @@ class LightingAnalyzer:
186
  top_scale = scale_factor * 2 # More aggressive downsampling
187
  top_region = v_channel[:height//4:top_scale, ::top_scale]
188
  top_region_std = np.std(top_region)
189
- ceiling_uniformity = 1.0 - min(1.0, top_region_std / max(np.mean(top_region), 1e-5))
190
 
191
  # Use a simpler method to detect horizontal lines in the upper region
192
  top_gradients = np.abs(gy[:small_gray.shape[0]//4, :])
193
  horizontal_lines_strength = np.mean(top_gradients)
194
  # Normalize
195
- horizontal_line_ratio = min(1.0, horizontal_lines_strength / 40)
196
 
197
  # Minimal bright-spot detection
198
- sampled_v = v_channel[::scale_factor*2, ::scale_factor*2]
199
  light_threshold = min(220, avg_brightness + 2*brightness_std)
200
  is_bright = sampled_v > light_threshold
201
  bright_spot_count = np.sum(is_bright)
@@ -203,7 +203,7 @@ class LightingAnalyzer:
203
  # Simplified alternative to circular light-source analysis
204
  circular_light_score = 0
205
  indoor_light_score = 0
206
- light_distribution_uniformity = 0.5
207
 
208
  # Only analyze light sources when some bright spots are detected, but not too many (which may be outdoor light reflections)
209
  if 1 < bright_spot_count < 20:
@@ -227,7 +227,7 @@ class LightingAnalyzer:
227
  indoor_light_score = 0.3
228
 
229
  # Use edge-region gradients to quickly estimate boundaries
230
- edge_scale = scale_factor * 2
231
 
232
  # Sample only the edge regions of the image for analysis
233
  left_edge = small_gray[:, :small_gray.shape[1]//6]
@@ -240,15 +240,15 @@ class LightingAnalyzer:
240
  top_gradient = np.mean(np.abs(cv2.Sobel(top_edge, cv2.CV_32F, 0, 1, ksize=3)))
241
 
242
  # Normalize
243
- left_edge_density = min(1.0, left_gradient / 50.0)
244
- right_edge_density = min(1.0, right_gradient / 50.0)
245
- top_edge_density = min(1.0, top_gradient / 50.0)
246
 
247
  # Enclosed environments usually show stronger gradients at the image edges
248
  boundary_edge_score = (left_edge_density + right_edge_density + top_edge_density) / 3
249
 
250
  # Roughly estimate overall edge density
251
- edges_density = min(1.0, (np.mean(np.abs(gx)) + np.mean(np.abs(gy))) / 100.0)
252
 
253
  street_line_score = 0
254
 
@@ -319,16 +319,16 @@ class LightingAnalyzer:
319
  # 1. Blue-region (sky) features - a large blue area usually indicates outdoors
320
  if features.get("blue_ratio", 0) > 0.2:
321
  # Check for indoor indicators; clear indoor features reduce the negative influence of blue
322
- if (features.get("ceiling_uniformity", 0) > 0.5 or
323
- features.get("boundary_edge_score", 0) > 0.3 or
324
  features.get("indoor_light_score", 0) > 0.2 or
325
  features.get("bright_spot_count", 0) > 0):
326
- blue_score = -weights["blue_ratio"] * features["blue_ratio"] * 8
327
  else:
328
  blue_score = -weights["blue_ratio"] * features["blue_ratio"] * 15
329
  else:
330
  blue_score = -weights["blue_ratio"] * features["blue_ratio"] * 15
331
-
332
  indoor_score += blue_score
333
  feature_contributions["blue_ratio"] = blue_score
334
 
@@ -351,14 +351,14 @@ class LightingAnalyzer:
351
  horizontal_line_ratio = features.get("horizontal_line_ratio", 0)
352
 
353
  # Strengthen the influence of ceiling detection
354
- if ceiling_uniformity > 0.5:
355
- ceiling_weight = 3
356
- ceiling_contribution = weights.get("ceiling_features", 1.5) * ceiling_weight
357
  if horizontal_line_ratio > 0.2: # Boost further when horizontal lines are present
358
- ceiling_contribution *= 1.5
359
- elif ceiling_uniformity > 0.4:
360
- ceiling_contribution = weights.get("ceiling_features", 1.5) * 1.2
361
-
362
  indoor_score += ceiling_contribution
363
  feature_contributions["ceiling_features"] = ceiling_contribution
364
 
@@ -370,7 +370,7 @@ class LightingAnalyzer:
370
 
371
  # Strengthen detection of specific light-source types
372
  if circular_light_count >= 1: # Even a single circular light source strongly suggests indoors
373
- light_contribution = weights.get("light_features", 1.2) * 2.0
374
  elif indoor_light_score > 0.3:
375
  light_contribution = weights.get("light_features", 1.2) * 1.0
376
 
@@ -384,11 +384,11 @@ class LightingAnalyzer:
384
  edges_density = features.get("edges_density", 0)
385
 
386
  # A high boundary score suggests an enclosed (indoor) environment
387
- if boundary_edge_score > 0.3:
388
- boundary_contribution = weights.get("boundary_features", 1.2) * 2
389
- elif boundary_edge_score > 0.2:
390
- boundary_contribution = weights.get("boundary_features", 1.2) * 1.2
391
-
392
  indoor_score += boundary_contribution
393
  feature_contributions["boundary_features"] = boundary_contribution
394
 
@@ -415,7 +415,7 @@ class LightingAnalyzer:
415
  combined_uniformity = (features["brightness_uniformity"] +
416
  features.get("ceiling_uniformity", 0)) / 2
417
 
418
- if combined_uniformity > 0.5:
419
  gradient_contribution = weights["gradient_ratio"] * 0.7
420
  else:
421
  gradient_contribution = -weights["gradient_ratio"] * 0.3
@@ -430,7 +430,7 @@ class LightingAnalyzer:
430
 
431
  # Adjust the bright-spot analysis logic
432
  if circular_light_count >= 1: # Even with just one circular light source
433
- bright_spot_contribution = weights["bright_spots"] * 1.5
434
  elif bright_spot_count < 5: # Moderately relaxed threshold
435
  bright_spot_contribution = weights["bright_spots"] * 0.5
436
  elif bright_spot_count > 15: # Many bright spots more likely indicate outdoors
@@ -441,8 +441,8 @@ class LightingAnalyzer:
441
 
442
  # 8. Color tone analysis
443
  yellow_contribution = 0
444
- if features["avg_brightness"] < 150 and features["yellow_orange_ratio"] > 0.15:
445
- if features.get("indoor_light_score", 0) > 0.2:
446
  yellow_contribution = weights["color_tone"] * 0.8
447
  else:
448
  yellow_contribution = weights["color_tone"] * 0.5
@@ -452,10 +452,10 @@ class LightingAnalyzer:
452
 
453
  if features.get("blue_ratio", 0) > 0.7:
454
  # Check for indoor indicators; clear indoor features reduce the negative influence of blue
455
- if (features.get("ceiling_uniformity", 0) > 0.6 or
456
- features.get("boundary_edge_score", 0) > 0.3 or
457
  features.get("indoor_light_score", 0) > 0):
458
- blue_score = -weights["blue_ratio"] * features["blue_ratio"] * 10
459
  else:
460
  blue_score = -weights["blue_ratio"] * features["blue_ratio"] * 18
461
  else:
@@ -534,19 +534,19 @@ class LightingAnalyzer:
534
  # 1: Right angles formed by windows and walls
535
  if features.get("brightness_uniformity", 0) > 0.6 and features.get("boundary_edge_score", 0) > 0.3:
536
  bedroom_indicators += 1.5 # Increase the weight
537
-
538
  # 2: Ceiling and light sources
539
  if features.get("ceiling_uniformity", 0) > 0.5 and features.get("bright_spot_count", 0) > 0:
540
- bedroom_indicators += 2.5
541
-
542
  # 3: Well-contrasted wall colors, typical of bedrooms and living rooms
543
  if features.get("brightness_uniformity", 0) > 0.6 and features.get("avg_saturation", 0) < 100:
544
- bedroom_indicators += 1.5
545
-
546
  # Special check 4: window detection
547
  if features.get("boundary_edge_score", 0) > 0.25 and features.get("brightness_std", 0) > 40:
548
- bedroom_indicators += 1.5
549
-
550
  # If enough home indicators are met, raise the indoor score further
551
  if bedroom_indicators >= 3:
552
  # Increase the home-environment score
@@ -576,11 +576,11 @@ class LightingAnalyzer:
576
  def _determine_lighting_conditions(self, features, is_indoor):
577
  """
578
  Determine lighting conditions based on the features and the indoor/outdoor judgment.
579
-
580
  Args:
581
  features: feature dictionary
582
  is_indoor: whether the environment is indoors
583
-
584
  Returns:
585
  Dict: lighting condition analysis results
586
  """
@@ -588,37 +588,37 @@ class LightingAnalyzer:
588
  time_of_day = "unknown"
589
  confidence = 0.5
590
  diagnostics = {}
591
-
592
  avg_brightness = features["avg_brightness"]
593
  dark_pixel_ratio = features["dark_pixel_ratio"]
594
  yellow_orange_ratio = features["yellow_orange_ratio"]
595
  blue_ratio = features["blue_ratio"]
596
  gray_ratio = features["gray_ratio"]
597
-
598
  # Branch on the indoor/outdoor judgment
599
  if is_indoor:
600
  # Compute natural-light indicators for indoor residential scenes
601
  natural_window_light = 0
602
-
603
  # Check window features and light characteristics
604
- if (features.get("blue_ratio", 0) > 0.1 and
605
  features.get("sky_brightness", 0) > avg_brightness * 1.1):
606
  natural_window_light += 1
607
-
608
  # Check for an even, soft light distribution
609
- if (features.get("brightness_uniformity", 0) > 0.65 and
610
  features.get("brightness_std", 0) < 70):
611
  natural_window_light += 1
612
-
613
  # Check the warm-tone ratio
614
  if features.get("warm_ratio", 0) > 0.2:
615
  natural_window_light += 1
616
-
617
  # Home-environment indicators
618
  home_env_score = features.get("home_environment_pattern", 0)
619
  if home_env_score > 1.5:
620
  natural_window_light += 1
621
-
622
  # 1. Bright indoor environment, possibly with natural window light
623
  if avg_brightness > 130:
624
  # Detect naturally lit residential spaces - a newly added type!
@@ -645,7 +645,7 @@ class LightingAnalyzer:
645
  time_of_day = "indoor_dim"
646
  confidence = 0.65 + dark_pixel_ratio / 3
647
  diagnostics["reason"] = "Low brightness in indoor environment"
648
-
649
  # 1. Detect designer-style residences; this covers a wider variety of cases
650
  designer_residential_score = 0
651
  # Detect distinctive light fixtures
@@ -660,19 +660,19 @@ class LightingAnalyzer:
660
  # Detect home-environment features
661
  if home_env_score > 1.5:
662
  designer_residential_score += 1
663
-
664
  if designer_residential_score >= 3 and home_env_score > 1.5:
665
- time_of_day = "indoor_designer_residential"
666
  confidence = 0.85
667
  diagnostics["special_case"] = "Designer residential lighting with decorative elements"
668
-
669
  # 2. Detect restaurant/bar scenes
670
  elif avg_brightness < 150 and yellow_orange_ratio > 0.2:
671
  if features["warm_ratio"] > 0.4:
672
  time_of_day = "indoor_restaurant"
673
  confidence = 0.65 + yellow_orange_ratio / 4
674
  diagnostics["special_case"] = "Warm, yellow-orange lighting suggests restaurant/bar setting"
675
-
676
  # 3. Detect commercially lit spaces
677
  elif avg_brightness > 120 and features["bright_spot_count"] > 4:
678
  # Improve the precision of commercial-lighting classification
@@ -685,7 +685,7 @@ class LightingAnalyzer:
685
  # Structured overall lighting layout
686
  if features.get("light_distribution_uniformity", 0) > 0.6:
687
  commercial_score += 0.5
688
-
689
  if commercial_score > 0.6 and designer_residential_score < 3:
690
  time_of_day = "indoor_commercial"
691
  confidence = 0.7 + commercial_score / 5
@@ -794,18 +794,18 @@ class LightingAnalyzer:
794
  """
795
  return {
796
  "indoor_outdoor_weights": {
797
- "blue_ratio": 0.6,
798
- "brightness_uniformity": 1.2,
799
- "gradient_ratio": 0.7,
800
- "bright_spots": 0.8,
801
- "color_tone": 0.5,
802
- "sky_brightness": 0.9,
803
- "brightness_variation": 0.7,
804
- "ceiling_features": 1.5,
805
- "light_features": 1.1,
806
- "boundary_features": 2.8,
807
- "street_features": 2.0,
808
- "building_features": 1.6
809
  },
810
  "include_diagnostics": True
811
  }
 
151
 
152
  avg_saturation = np.mean(s_channel)
153
 
154
+ # Sky brightness
155
  upper_half = v_channel[:height//2, :]
156
  sky_brightness = np.mean(upper_half)
157
 
158
+ # Color tone analysis
159
  warm_colors = ((h_channel >= 0) & (h_channel <= 60)) | (h_channel >= 300)
160
  warm_ratio = np.sum(warm_colors) / (height * width)
161
 
 
186
  top_scale = scale_factor * 2 # More aggressive downsampling
187
  top_region = v_channel[:height//4:top_scale, ::top_scale]
188
  top_region_std = np.std(top_region)
189
+ ceiling_uniformity = 1.0 - min(1, top_region_std / max(np.mean(top_region), 1e-5))
190
 
191
  # Use a simpler method to detect horizontal lines in the upper region
192
  top_gradients = np.abs(gy[:small_gray.shape[0]//4, :])
193
  horizontal_lines_strength = np.mean(top_gradients)
194
  # Normalize
195
+ horizontal_line_ratio = min(1, horizontal_lines_strength / 40)
196
 
197
  # Minimal bright-spot detection
198
+ sampled_v = v_channel[::scale_factor*2, ::scale_factor*2]
199
  light_threshold = min(220, avg_brightness + 2*brightness_std)
200
  is_bright = sampled_v > light_threshold
201
  bright_spot_count = np.sum(is_bright)
 
203
  # Simplified alternative to circular light-source analysis
204
  circular_light_score = 0
205
  indoor_light_score = 0
206
+ light_distribution_uniformity = 0.5
207
 
208
  # Only analyze light sources when some bright spots are detected, but not too many (which may be outdoor light reflections)
209
  if 1 < bright_spot_count < 20:
 
227
  indoor_light_score = 0.3
228
 
229
  # Use edge-region gradients to quickly estimate boundaries
230
+ edge_scale = scale_factor * 2
231
 
232
  # Sample only the edge regions of the image for analysis
233
  left_edge = small_gray[:, :small_gray.shape[1]//6]
 
240
  top_gradient = np.mean(np.abs(cv2.Sobel(top_edge, cv2.CV_32F, 0, 1, ksize=3)))
241
 
242
  # Normalize
243
+ left_edge_density = min(1.0, left_gradient / 50)
244
+ right_edge_density = min(1.0, right_gradient / 50)
245
+ top_edge_density = min(1.0, top_gradient / 50)
246
 
247
  # Enclosed environments usually show stronger gradients at the image edges
248
  boundary_edge_score = (left_edge_density + right_edge_density + top_edge_density) / 3
249
 
250
  # Roughly estimate overall edge density
251
+ edges_density = min(1, (np.mean(np.abs(gx)) + np.mean(np.abs(gy))) / 100)
252
 
253
  street_line_score = 0
254
 
 
319
  # 1. Blue-region (sky) features - a large blue area usually indicates outdoors
320
  if features.get("blue_ratio", 0) > 0.2:
321
  # Check for indoor indicators; clear indoor features reduce the negative influence of blue
322
+ if (features.get("ceiling_uniformity", 0) > 0.5 or
323
+ features.get("boundary_edge_score", 0) > 0.3 or
324
  features.get("indoor_light_score", 0) > 0.2 or
325
  features.get("bright_spot_count", 0) > 0):
326
+ blue_score = -weights["blue_ratio"] * features["blue_ratio"] * 8
327
  else:
328
  blue_score = -weights["blue_ratio"] * features["blue_ratio"] * 15
329
  else:
330
  blue_score = -weights["blue_ratio"] * features["blue_ratio"] * 15
331
+
332
  indoor_score += blue_score
333
  feature_contributions["blue_ratio"] = blue_score
334
 
 
351
  horizontal_line_ratio = features.get("horizontal_line_ratio", 0)
352
 
353
  # Strengthen the influence of ceiling detection
354
+ if ceiling_uniformity > 0.5:
355
+ ceiling_weight = 3
356
+ ceiling_contribution = weights.get("ceiling_features", 1.5) * ceiling_weight
357
  if horizontal_line_ratio > 0.2: # Boost further when horizontal lines are present
358
+ ceiling_contribution *= 1.5
359
+ elif ceiling_uniformity > 0.4:
360
+ ceiling_contribution = weights.get("ceiling_features", 1.5) * 1.2
361
+
362
  indoor_score += ceiling_contribution
363
  feature_contributions["ceiling_features"] = ceiling_contribution
364
 
 
370
 
371
  # Strengthen detection of specific light-source types
372
  if circular_light_count >= 1: # Even a single circular light source strongly suggests indoors
373
+ light_contribution = weights.get("light_features", 1.2) * 2.0
374
  elif indoor_light_score > 0.3:
375
  light_contribution = weights.get("light_features", 1.2) * 1.0
376
 
 
384
  edges_density = features.get("edges_density", 0)
385
 
386
  # A high boundary score suggests an enclosed (indoor) environment
387
+ if boundary_edge_score > 0.3:
388
+ boundary_contribution = weights.get("boundary_features", 1.2) * 2
389
+ elif boundary_edge_score > 0.2:
390
+ boundary_contribution = weights.get("boundary_features", 1.2) * 1.2
391
+
392
  indoor_score += boundary_contribution
393
  feature_contributions["boundary_features"] = boundary_contribution
394
 
 
415
  combined_uniformity = (features["brightness_uniformity"] +
416
  features.get("ceiling_uniformity", 0)) / 2
417
 
418
+ if combined_uniformity > 0.5:
419
  gradient_contribution = weights["gradient_ratio"] * 0.7
420
  else:
421
  gradient_contribution = -weights["gradient_ratio"] * 0.3
 
430
 
431
  # Adjust the bright-spot analysis logic
432
  if circular_light_count >= 1: # Even with just one circular light source
433
+ bright_spot_contribution = weights["bright_spots"] * 1.5
434
  elif bright_spot_count < 5: # Moderately relaxed threshold
435
  bright_spot_contribution = weights["bright_spots"] * 0.5
436
  elif bright_spot_count > 15: # Many bright spots more likely indicate outdoors
 
441
 
442
  # 8. Color tone analysis
443
  yellow_contribution = 0
444
+ if features["avg_brightness"] < 150 and features["yellow_orange_ratio"] > 0.15:
445
+ if features.get("indoor_light_score", 0) > 0.2:
446
  yellow_contribution = weights["color_tone"] * 0.8
447
  else:
448
  yellow_contribution = weights["color_tone"] * 0.5
 
452
 
453
  if features.get("blue_ratio", 0) > 0.7:
454
  # Check for indoor indicators; clear indoor features reduce the negative influence of blue
455
+ if (features.get("ceiling_uniformity", 0) > 0.6 or
456
+ features.get("boundary_edge_score", 0) > 0.3 or
457
  features.get("indoor_light_score", 0) > 0):
458
+ blue_score = -weights["blue_ratio"] * features["blue_ratio"] * 10
459
  else:
460
  blue_score = -weights["blue_ratio"] * features["blue_ratio"] * 18
461
  else:
 
534
  # 1: Right angles formed by windows and walls
535
  if features.get("brightness_uniformity", 0) > 0.6 and features.get("boundary_edge_score", 0) > 0.3:
536
  bedroom_indicators += 1.5 # Increase the weight
537
+
538
  # 2: Ceiling and light sources
539
  if features.get("ceiling_uniformity", 0) > 0.5 and features.get("bright_spot_count", 0) > 0:
540
+ bedroom_indicators += 2.5
541
+
542
  # 3: Well-contrasted wall colors, typical of bedrooms and living rooms
543
  if features.get("brightness_uniformity", 0) > 0.6 and features.get("avg_saturation", 0) < 100:
544
+ bedroom_indicators += 1.5
545
+
546
  # Special check 4: window detection
547
  if features.get("boundary_edge_score", 0) > 0.25 and features.get("brightness_std", 0) > 40:
548
+ bedroom_indicators += 1.5
549
+
550
  # If enough home indicators are met, raise the indoor score further
551
  if bedroom_indicators >= 3:
552
  # Increase the home-environment score
 
576
  def _determine_lighting_conditions(self, features, is_indoor):
577
  """
578
  Determine lighting conditions based on the features and the indoor/outdoor judgment.
579
+
580
  Args:
581
  features: feature dictionary
582
  is_indoor: whether the environment is indoors
583
+
584
  Returns:
585
  Dict: lighting condition analysis results
586
  """
 
588
  time_of_day = "unknown"
589
  confidence = 0.5
590
  diagnostics = {}
591
+
592
  avg_brightness = features["avg_brightness"]
593
  dark_pixel_ratio = features["dark_pixel_ratio"]
594
  yellow_orange_ratio = features["yellow_orange_ratio"]
595
  blue_ratio = features["blue_ratio"]
596
  gray_ratio = features["gray_ratio"]
597
+
598
  # Branch on the indoor/outdoor judgment
599
  if is_indoor:
600
  # Compute natural-light indicators for indoor residential scenes
601
  natural_window_light = 0
602
+
603
  # Check window features and light characteristics
604
+ if (features.get("blue_ratio", 0) > 0.1 and
605
  features.get("sky_brightness", 0) > avg_brightness * 1.1):
606
  natural_window_light += 1
607
+
608
  # Check for an even, soft light distribution
609
+ if (features.get("brightness_uniformity", 0) > 0.65 and
610
  features.get("brightness_std", 0) < 70):
611
  natural_window_light += 1
612
+
613
  # Check the warm-tone ratio
614
  if features.get("warm_ratio", 0) > 0.2:
615
  natural_window_light += 1
616
+
617
  # Home-environment indicators
618
  home_env_score = features.get("home_environment_pattern", 0)
619
  if home_env_score > 1.5:
620
  natural_window_light += 1
621
+
622
  # 1. Bright indoor environment, possibly with natural window light
623
  if avg_brightness > 130:
624
  # Detect naturally lit residential spaces - a newly added type!
 
645
  time_of_day = "indoor_dim"
646
  confidence = 0.65 + dark_pixel_ratio / 3
647
  diagnostics["reason"] = "Low brightness in indoor environment"
648
+
649
  # 1. Detect designer-style residences; this covers a wider variety of cases
650
  designer_residential_score = 0
651
  # Detect distinctive light fixtures
 
660
  # Detect home-environment features
661
  if home_env_score > 1.5:
662
  designer_residential_score += 1
663
+
664
  if designer_residential_score >= 3 and home_env_score > 1.5:
665
+ time_of_day = "indoor_designer_residential"
666
  confidence = 0.85
667
  diagnostics["special_case"] = "Designer residential lighting with decorative elements"
668
+
669
  # 2. Detect restaurant/bar scenes
670
  elif avg_brightness < 150 and yellow_orange_ratio > 0.2:
671
  if features["warm_ratio"] > 0.4:
672
  time_of_day = "indoor_restaurant"
673
  confidence = 0.65 + yellow_orange_ratio / 4
674
  diagnostics["special_case"] = "Warm, yellow-orange lighting suggests restaurant/bar setting"
675
+
676
  # 3. Detect commercially lit spaces
677
  elif avg_brightness > 120 and features["bright_spot_count"] > 4:
678
  # Improve the precision of commercial-lighting classification
 
685
  # Structured overall lighting layout
686
  if features.get("light_distribution_uniformity", 0) > 0.6:
687
  commercial_score += 0.5
688
+
689
  if commercial_score > 0.6 and designer_residential_score < 3:
690
  time_of_day = "indoor_commercial"
691
  confidence = 0.7 + commercial_score / 5
 
794
  """
795
  return {
796
  "indoor_outdoor_weights": {
797
+ "blue_ratio": 0.6,
798
+ "brightness_uniformity": 1.2,
799
+ "gradient_ratio": 0.7,
800
+ "bright_spots": 0.8,
801
+ "color_tone": 0.5,
802
+ "sky_brightness": 0.9,
803
+ "brightness_variation": 0.7,
804
+ "ceiling_features": 1.5,
805
+ "light_features": 1.1,
806
+ "boundary_features": 2.8,
807
+ "street_features": 2,
808
+ "building_features": 1.6
809
  },
810
  "include_diagnostics": True
811
  }
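A note on how these weights are consumed, visible in the scoring code above: each feature yields a contribution of the form weight times a scaled feature value that accumulates into indoor_score, so boundary_features at 2.8 carries the most influence in the default configuration, followed by street_features at 2.0, while blue_ratio enters negatively and pulls the score toward an outdoor classification.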
requirements.txt CHANGED
@@ -7,3 +7,5 @@ numpy>=1.23.5
7
  matplotlib>=3.7.0
8
  gradio>=3.32.0
9
  git+https://github.com/openai/CLIP.git
 
 
 
7
  matplotlib>=3.7.0
8
  gradio>=3.32.0
9
  git+https://github.com/openai/CLIP.git
10
+ yt-dlp>=2023.3.4
11
+ requests>=2.28.1
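The two new dependencies back the video feature: yt-dlp resolves video URLs to downloadable streams and requests covers plain HTTP fetches. A minimal sketch of how a URL could be turned into a local file before detection (the helper name and options here are illustrative, not part of this commit):

    import yt_dlp  # provided by the yt-dlp package

    def download_video(url: str, out_path: str = "input_video.mp4") -> str:
        # Download the best available MP4 stream to a local file.
        ydl_opts = {"format": "mp4", "outtmpl": out_path}
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            ydl.download([url])
        return out_path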
scene_analyzer.py CHANGED
@@ -17,7 +17,6 @@ class SceneAnalyzer:
17
  def __init__(self, class_names: Dict[int, str] = None):
18
  """
19
  Initialize the scene analyzer with optional class name mappings.
20
-
21
  Args:
22
  class_names: Dictionary mapping class IDs to class names (optional)
23
  """
@@ -49,14 +48,12 @@ class SceneAnalyzer:
49
  functional_zones=None):
50
  """
51
  Generate a scene description.
52
-
53
  Args:
54
  scene_type: the identified scene type
55
  detected_objects: list of detected objects
56
  confidence: scene classification confidence
57
  lighting_info: lighting condition information (optional)
58
  functional_zones: functional zone information (optional)
59
-
60
  Returns:
61
  str: the generated scene description
62
  """
@@ -101,13 +98,11 @@ class SceneAnalyzer:
101
  def analyze(self, detection_result: Any, lighting_info: Optional[Dict] = None, class_confidence_threshold: float = 0.35, scene_confidence_threshold: float = 0.6) -> Dict:
102
  """
103
  Analyze detection results to determine scene type and provide understanding.
104
-
105
  Args:
106
  detection_result: Detection result from YOLOv8
107
  lighting_info: Optional lighting condition analysis results
108
  class_confidence_threshold: Minimum confidence to consider an object
109
  scene_confidence_threshold: Minimum confidence to determine a scene
110
-
111
  Returns:
112
  Dictionary with scene analysis results
113
  """
@@ -141,7 +136,7 @@ class SceneAnalyzer:
141
  if not detected_objects:
142
  return {
143
  "scene_type": "unknown",
144
- "confidence": 0.0,
145
  "description": "No objects with sufficient confidence detected.",
146
  "objects_present": [],
147
  "object_count": 0,
@@ -265,10 +260,8 @@ class SceneAnalyzer:
265
  def _compute_scene_scores(self, detected_objects: List[Dict]) -> Dict[str, float]:
266
  """
267
  Compute confidence scores for each scene type based on detected objects.
268
-
269
  Args:
270
  detected_objects: List of detected objects
271
-
272
  Returns:
273
  Dictionary mapping scene types to confidence scores
274
  """
@@ -308,7 +301,7 @@ class SceneAnalyzer:
308
  optional_score = optional_ratio * 0.3 # 30% of score from optional objects
309
 
310
  # Bonus for having multiple instances of key objects
311
- multiple_bonus = 0.0
312
  for class_id in required_present:
313
  if class_counts.get(class_id, 0) > 1:
314
  multiple_bonus += 0.05 # 5% bonus per additional key object type
@@ -330,10 +323,8 @@ class SceneAnalyzer:
330
  def _determine_scene_type(self, scene_scores: Dict[str, float]) -> Tuple[str, float]:
331
  """
332
  Determine the most likely scene type based on scores.
333
-
334
  Args:
335
  scene_scores: Dictionary mapping scene types to confidence scores
336
-
337
  Returns:
338
  Tuple of (best_scene_type, confidence)
339
  """
@@ -350,11 +341,9 @@ class SceneAnalyzer:
350
  def _fuse_scene_scores(self, yolo_scene_scores: Dict[str, float], clip_scene_scores: Dict[str, float]) -> Dict[str, float]:
351
  """
352
  Fuse scene scores from YOLO object detection and CLIP analysis.
353
-
354
  Args:
355
  yolo_scene_scores: scene scores based on YOLO object detection
356
  clip_scene_scores: scene scores based on CLIP analysis
357
-
358
  Returns:
359
  Dict: the fused scene scores
360
  """
 
17
  def __init__(self, class_names: Dict[int, str] = None):
18
  """
19
  Initialize the scene analyzer with optional class name mappings.
 
20
  Args:
21
  class_names: Dictionary mapping class IDs to class names (optional)
22
  """
 
48
  functional_zones=None):
49
  """
50
  Generate a scene description.
 
51
  Args:
52
  scene_type: the identified scene type
53
  detected_objects: list of detected objects
54
  confidence: scene classification confidence
55
  lighting_info: lighting condition information (optional)
56
  functional_zones: functional zone information (optional)
 
57
  Returns:
58
  str: the generated scene description
59
  """
 
98
  def analyze(self, detection_result: Any, lighting_info: Optional[Dict] = None, class_confidence_threshold: float = 0.35, scene_confidence_threshold: float = 0.6) -> Dict:
99
  """
100
  Analyze detection results to determine scene type and provide understanding.
 
101
  Args:
102
  detection_result: Detection result from YOLOv8
103
  lighting_info: Optional lighting condition analysis results
104
  class_confidence_threshold: Minimum confidence to consider an object
105
  scene_confidence_threshold: Minimum confidence to determine a scene
 
106
  Returns:
107
  Dictionary with scene analysis results
108
  """
 
136
  if not detected_objects:
137
  return {
138
  "scene_type": "unknown",
139
+ "confidence": 0,
140
  "description": "No objects with sufficient confidence detected.",
141
  "objects_present": [],
142
  "object_count": 0,
 
260
  def _compute_scene_scores(self, detected_objects: List[Dict]) -> Dict[str, float]:
261
  """
262
  Compute confidence scores for each scene type based on detected objects.
 
263
  Args:
264
  detected_objects: List of detected objects
 
265
  Returns:
266
  Dictionary mapping scene types to confidence scores
267
  """
 
301
  optional_score = optional_ratio * 0.3 # 30% of score from optional objects
302
 
303
  # Bonus for having multiple instances of key objects
304
+ multiple_bonus = 0
305
  for class_id in required_present:
306
  if class_counts.get(class_id, 0) > 1:
307
  multiple_bonus += 0.05 # 5% bonus per additional key object type
 
323
  def _determine_scene_type(self, scene_scores: Dict[str, float]) -> Tuple[str, float]:
324
  """
325
  Determine the most likely scene type based on scores.
 
326
  Args:
327
  scene_scores: Dictionary mapping scene types to confidence scores
 
328
  Returns:
329
  Tuple of (best_scene_type, confidence)
330
  """
 
341
  def _fuse_scene_scores(self, yolo_scene_scores: Dict[str, float], clip_scene_scores: Dict[str, float]) -> Dict[str, float]:
342
  """
343
  融合基於 YOLO 物體檢測和 CLIP 分析的場景分數。
 
344
  Args:
345
  yolo_scene_scores: 基於 YOLO 物體檢測的場景分數
346
  clip_scene_scores: 基於 CLIP 分析的場景分數
 
347
  Returns:
348
  Dict: 融合後的場景分數
349
  """
style.py CHANGED
@@ -268,6 +268,40 @@ class Style:
268
  padding: 0 !important;
269
  }
270
 
271
  /* Result container styles */
272
  .result-container {
273
  width: 100% !important;
@@ -356,6 +390,111 @@ class Style:
356
  box-sizing: border-box !important;
357
  }
358
 
359
  /* Responsive adjustments */
360
  @media (max-width: 768px) {
361
  .app-title {
@@ -375,5 +514,6 @@ class Style:
375
  min-height: 150px !important;
376
  }
377
  }
 
378
  """
379
  return css
 
268
  padding: 0 !important;
269
  }
270
 
271
+ /* Scene analysis description area styles */
272
+ .scene-description-box {
273
+ background-color: #f8f9fa !important;
274
+ border: 1px solid #e2e8f0 !important;
275
+ border-radius: 8px !important;
276
+ padding: 15px !important;
277
+ margin: 10px 0 20px 0 !important;
278
+ box-shadow: 0 1px 3px rgba(0,0,0,0.05) !important;
279
+ font-family: Arial, sans-serif !important;
280
+ line-height: 1.7 !important;
281
+ color: #2D3748 !important;
282
+ font-size: 16px !important;
283
+ width: 100% !important;
284
+ box-sizing: border-box !important;
285
+ }
286
+
287
+ #scene_analysis_description_text {
288
+ background-color: #f0f0f0 !important; /* Light gray background */
289
+ padding: 15px !important; /* Padding so the text has some space from the border */
290
+ border-radius: 8px !important; /* Rounded corners */
291
+ margin: 10px 0 20px 0 !important; /* Spacing from other elements, especially above and below */
292
+ display: block !important;
293
+ width: 100% !important;
294
+ box-sizing: border-box !important;
295
+ }
296
+
297
+ #scene_analysis_description_text p {
298
+ margin: 0 !important;
299
+ color: #2D3748 !important; /* 確保文字顏色 */
300
+ font-family: Arial, sans-serif !important;
301
+ font-size: 16px !important; /* 你可以調整文字大小 */
302
+ line-height: 1.7 !important;
303
+ }
304
+
305
  /* Result container styles */
306
  .result-container {
307
  width: 100% !important;
 
390
  box-sizing: border-box !important;
391
  }
392
 
393
+ /* Video summary HTML container and content styles */
394
+ #video-summary-html-output {
395
+ width: 100% !important;
396
+ box-sizing: border-box !important;
397
+ padding: 0 !important;
398
+ margin: 0 !important;
399
+ }
400
+
401
+ .video-summary-content-wrapper {
402
+ width: 100% !important;
403
+ padding: 16px !important;
404
+ line-height: 1.8 !important;
405
+ white-space: pre-wrap !important;
406
+ word-wrap: break-word !important;
407
+ border-radius: 8px !important;
408
+ min-height: 250px !important;
409
+ max-height: 600px !important;
410
+ overflow-y: auto !important;
411
+ border: 1px solid #e2e8f0 !important;
412
+ background-color: white !important;
413
+ display: block !important;
414
+ font-family: 'Arial', sans-serif !important;
415
+ font-size: 14px !important;
416
+ margin: 0 !important;
417
+ }
418
+
419
+ .video-summary-content-wrapper pre {
420
+ white-space: pre-wrap !important;
421
+ word-wrap: break-word !important;
422
+ margin: 0 !important;
423
+ padding: 0 !important;
424
+ font-family: 'Arial', sans-serif !important;
425
+ font-size: 14px !important;
426
+ line-height: 1.8 !important;
427
+ color: #2D3748 !important;
428
+ }
429
+
430
+ /* Video result panel styles */
431
+ .video-result-panel {
432
+ padding: 1rem !important;
433
+ background: white !important;
434
+ border-radius: 10px !important;
435
+ box-shadow: 0 2px 8px rgba(0, 0, 0, 0.08) !important;
436
+ }
437
+
438
+ .video-output-container {
439
+ width: 100% !important;
440
+ margin-bottom: 1.5rem !important;
441
+ border-radius: 8px !important;
442
+ overflow: hidden !important;
443
+ box-shadow: 0 1px 3px rgba(0, 0, 0, 0.1) !important;
444
+ }
445
+
446
+ /* Enhanced video statistics display */
447
+ .video-stats-display {
448
+ background: white !important;
449
+ border-radius: 8px !important;
450
+ padding: 1rem !important;
451
+ box-shadow: inset 0 0 6px rgba(0, 0, 0, 0.1) !important;
452
+ width: 100% !important;
453
+ min-height: 200px !important;
454
+ max-height: 400px !important;
455
+ overflow-y: auto !important;
456
+ font-family: monospace !important;
457
+ box-sizing: border-box !important;
458
+ color: #2D3748 !important;
459
+ }
460
+
461
+ .custom-video-url-input {
462
+ width: 100% !important;
463
+ }
464
+
465
+ .custom-video-url-input textarea {
466
+ width: 100% !important;
467
+ min-height: 120px !important;
468
+ padding: 15px !important;
469
+ font-size: 16px !important;
470
+ line-height: 1.6 !important;
471
+ background-color: #F7FAFC !important;
472
+ box-shadow: 0 1px 3px rgba(0, 0, 0, 0.1) !important;
473
+ border: 2px solid #CBD5E0 !important;
474
+ border-radius: 8px !important;
475
+ }
476
+
477
+ .custom-video-url-input textarea:focus {
478
+ border-color: #4299E1 !important;
479
+ box-shadow: 0 0 0 3px rgba(66, 153, 225, 0.2) !important;
480
+ }
481
+
482
+ /* Input container at 100% width */
483
+ .custom-video-url-input > div {
484
+ width: 100% !important;
485
+ max-width: 100% !important;
486
+ }
487
+
488
+ /* Animation for a more interactive feel */
489
+ @keyframes fadeIn {
490
+ from { opacity: 0; }
491
+ to { opacity: 1; }
492
+ }
493
+
494
+ .video-result-panel > * {
495
+ animation: fadeIn 0.5s ease-in-out;
496
+ }
497
+
498
  /* Responsive adjustments */
499
  @media (max-width: 768px) {
500
  .app-title {
 
514
  min-height: 150px !important;
515
  }
516
  }
517
+
518
  """
519
  return css
video_processor.py ADDED
@@ -0,0 +1,346 @@
1
+ import cv2
2
+ import os
3
+ import tempfile
4
+ import uuid
5
+ from PIL import Image
6
+ import numpy as np
7
+ from typing import Dict, List, Tuple, Any, Optional
8
+ import time
9
+ from collections import defaultdict
10
+
11
+ from image_processor import ImageProcessor
12
+ from evaluation_metrics import EvaluationMetrics
13
+ from scene_analyzer import SceneAnalyzer
14
+ from detection_model import DetectionModel
15
+
16
+ class VideoProcessor:
17
+ """
18
+ Handles the processing of video files, including object detection
19
+ and scene analysis on selected frames.
20
+ """
21
+ def __init__(self, image_processor: ImageProcessor):
22
+ """
23
+ Initializes the VideoProcessor.
24
+
25
+ Args:
26
+ image_processor (ImageProcessor): An initialized ImageProcessor instance.
27
+ """
28
+ self.image_processor = image_processor
29
+
30
+ def process_video_file(self,
31
+ video_path: str,
32
+ model_name: str,
33
+ confidence_threshold: float,
34
+ process_interval: int = 5,
35
+ scene_desc_interval_sec: int = 3) -> Tuple[Optional[str], str, Dict]:
36
+ """
37
+ Processes an uploaded video file, performs detection and periodic scene analysis,
38
+ and returns the path to the annotated output video file along with a summary.
39
+
40
+ Args:
41
+ video_path (str): Path to the input video file.
42
+ model_name (str): Name of the YOLO model to use.
43
+ confidence_threshold (float): Confidence threshold for object detection.
44
+ process_interval (int): Process every Nth frame. Defaults to 5.
45
+ scene_desc_interval_sec (int): Update scene description every N seconds. Defaults to 3.
46
+
47
+ Returns:
48
+ Tuple[Optional[str], str, Dict]: (Path to output video or None, Summary text, Statistics dictionary)
49
+ """
50
+ if not video_path or not os.path.exists(video_path):
51
+ print(f"Error: Video file not found at {video_path}")
52
+ return None, "Error: Video file not found.", {}
53
+
54
+ print(f"Starting video processing for: {video_path}")
55
+ start_time = time.time()
56
+
57
+ cap = cv2.VideoCapture(video_path)
58
+ if not cap.isOpened():
59
+ print(f"Error: Could not open video file {video_path}")
60
+ return None, "Error opening video file.", {}
61
+
62
+ # Get video properties
63
+ fps = cap.get(cv2.CAP_PROP_FPS)
64
+ if fps <= 0: # Handle case where fps is not available or invalid
65
+ fps = 30 # Assume a default fps
66
+ print(f"Warning: Could not get valid FPS for video. Assuming {fps} FPS.")
67
+ width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
68
+ height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
69
+ total_frames_video = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
70
+ print(f"Video properties: {width}x{height} @ {fps:.2f} FPS, Total Frames: {total_frames_video}")
71
+
72
+ # Calculate description update interval in frames
73
+ description_update_interval_frames = int(fps * scene_desc_interval_sec)
74
+ if description_update_interval_frames < 1:
75
+ description_update_interval_frames = int(fps) # Update at least once per second if interval is too short
76
+
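For example, at 30 FPS with the default scene_desc_interval_sec of 3, the scene description refreshes every 90 frames; with process_interval = 5 that works out to every 18th processed frame.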
77
+ object_trackers = {} # Maps object IDs to tracked objects
78
+ last_detected_objects = {} # Objects detected in the previous processed frame
79
+ next_object_id = 0 # Next available object ID
80
+ tracking_threshold = 0.6 # IoU threshold for treating detections as the same object
81
+ object_colors = {} # Fixed color assigned to each tracked object
82
+
83
+ # Setup Output Video
84
+ output_filename = f"processed_{uuid.uuid4().hex}_{os.path.basename(video_path)}"
85
+ temp_dir = tempfile.gettempdir() # Use system's temp directory
86
+ output_path = os.path.join(temp_dir, output_filename)
87
+ # Ensure the output path has a compatible extension (like .mp4)
88
+ if not output_path.lower().endswith(('.mp4', '.avi', '.mov')):
89
+ output_path += ".mp4"
90
+
91
+ # Use 'mp4v' for MP4, common and well-supported
92
+ fourcc = cv2.VideoWriter_fourcc(*'mp4v')
93
+ out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
94
+ if not out.isOpened():
95
+ print(f"Error: Could not open VideoWriter for path: {output_path}")
96
+ cap.release()
97
+ return None, f"Error creating output video file at {output_path}.", {}
98
+ print(f"Output video will be saved to: {output_path}")
99
+
100
+ frame_count = 0
101
+ processed_frame_count = 0
102
+ all_stats = [] # Store stats for each processed frame
103
+ summary_lines = []
104
+ last_description = "Analyzing scene..." # Initial description
105
+ frame_since_last_desc = description_update_interval_frames # Trigger analysis on first processed frame
106
+
107
+ try:
108
+ while True:
109
+ ret, frame = cap.read()
110
+ if not ret:
111
+ break # End of video
112
+
113
+ frame_count += 1
114
+ frame_since_last_desc += 1
115
+ current_frame_annotated = False # Flag if this frame was processed and annotated
116
+
117
+ # Process frame based on interval
118
+ if frame_count % process_interval == 0:
119
+ processed_frame_count += 1
120
+ print(f"Processing frame {frame_count}...")
121
+ current_frame_annotated = True
122
+
123
+ # Use ImageProcessor for single-frame tasks
124
+ # 1. Convert frame format BGR -> RGB -> PIL
125
+ try:
126
+ frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
127
+ pil_image = Image.fromarray(frame_rgb)
128
+ except Exception as e:
129
+ print(f"Error converting frame {frame_count}: {e}")
130
+ continue # Skip this frame
131
+
132
+ # 2. Get appropriate model instance
133
+ # Confidence is passed from UI, model_name too
134
+ model_instance = self.image_processor.get_model_instance(model_name, confidence_threshold)
135
+ if not model_instance or not model_instance.is_model_loaded:
136
+ print(f"Error: Model {model_name} not loaded. Skipping frame {frame_count}.")
137
+ # Draw basic frame without annotation
138
+ cv2.putText(frame, f"Scene: {last_description[:80]}...", (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 0, 0), 3, cv2.LINE_AA)
139
+ cv2.putText(frame, f"Scene: {last_description[:80]}...", (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 255, 255), 2, cv2.LINE_AA)
140
+ out.write(frame)
141
+ continue
142
+
143
+
144
+ # 3. Perform detection
145
+ detection_result = model_instance.detect(pil_image) # Use PIL image
146
+
147
+ current_description_for_frame = last_description # Default to last known description
148
+ scene_analysis_result = None
149
+ stats = {}
150
+
151
+ if detection_result and hasattr(detection_result, 'boxes') and len(detection_result.boxes) > 0:
152
+ # Ensure SceneAnalyzer is ready within ImageProcessor
153
+ if not hasattr(self.image_processor, 'scene_analyzer') or self.image_processor.scene_analyzer is None:
154
+ print("Initializing SceneAnalyzer...")
155
+ # Pass class names from the current detection result
156
+ self.image_processor.scene_analyzer = SceneAnalyzer(class_names=detection_result.names)
157
+ elif self.image_processor.scene_analyzer.class_names is None:
158
+ # Update class names if they were missing
159
+ self.image_processor.scene_analyzer.class_names = detection_result.names
160
+ if hasattr(self.image_processor.scene_analyzer, 'spatial_analyzer'):
161
+ self.image_processor.scene_analyzer.spatial_analyzer.class_names = detection_result.names
162
+
163
+
164
+ # 4. Perform Scene Analysis (periodically)
165
+ if frame_since_last_desc >= description_update_interval_frames:
166
+ print(f"Analyzing scene at frame {frame_count} (threshold: {description_update_interval_frames} frames)...")
167
+ # Pass lighting_info=None for now, as it's disabled for performance
168
+ scene_analysis_result = self.image_processor.analyze_scene(detection_result, lighting_info=None)
169
+ current_description_for_frame = scene_analysis_result.get("description", last_description)
170
+ last_description = current_description_for_frame # Cache the new description
171
+ frame_since_last_desc = 0 # Reset counter
172
+
173
+ # 5. Calculate Statistics for this frame
174
+ stats = EvaluationMetrics.calculate_basic_stats(detection_result)
175
+ stats['frame_number'] = frame_count # Add frame number to stats
176
+ all_stats.append(stats)
177
+
178
+ # 6. Draw annotations
179
+ names = detection_result.names
180
+ boxes = detection_result.boxes.xyxy.cpu().numpy()
181
+ classes = detection_result.boxes.cls.cpu().numpy().astype(int)
182
+ confs = detection_result.boxes.conf.cpu().numpy()
183
+
184
+ def calculate_iou(box1, box2):
185
+ """Calculate Intersection IOU value"""
186
+ x1_1, y1_1, x2_1, y2_1 = box1
187
+ x1_2, y1_2, x2_2, y2_2 = box2
188
+
189
+ xi1 = max(x1_1, x1_2)
190
+ yi1 = max(y1_1, y1_2)
191
+ xi2 = min(x2_1, x2_2)
192
+ yi2 = min(y2_1, y2_2)
193
+
194
+ inter_area = max(0, xi2 - xi1) * max(0, yi2 - yi1)
195
+ box1_area = (x2_1 - x1_1) * (y2_1 - y1_1)
196
+ box2_area = (x2_2 - x1_2) * (y2_2 - y1_2)
197
+
198
+ union_area = box1_area + box2_area - inter_area
199
+
200
+ return inter_area / union_area if union_area > 0 else 0
201
+
202
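A quick worked example of the helper above: two 10x10 boxes offset by 5 pixels overlap in a 5x5 region, so IoU = 25 / (100 + 100 - 25) ≈ 0.143.

iou = calculate_iou((0, 0, 10, 10), (5, 5, 15, 15))
print(f"{iou:.3f}")  # 0.143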
+                    # Process all detections in the current frame
+                    current_detected_objects = {}
+
+                    for box, cls_id, conf in zip(boxes, classes, confs):
+                        x1, y1, x2, y2 = map(int, box)
+
+                        # Find the best-matching object among those tracked in the previous frame
+                        best_match_id = None
+                        best_match_iou = 0
+
+                        for obj_id, (old_box, old_cls_id, _) in last_detected_objects.items():
+                            if old_cls_id == cls_id:  # Only compare objects of the same class
+                                iou = calculate_iou(box, old_box)
+                                if iou > tracking_threshold and iou > best_match_iou:
+                                    best_match_id = obj_id
+                                    best_match_iou = iou
+
+                        # Reuse the existing ID if a match was found; otherwise assign a new one
+                        if best_match_id is not None:
+                            obj_id = best_match_id
+                        else:
+                            obj_id = next_object_id
+                            next_object_id += 1
+                            # Assign the new object a fixed, high-visibility color (avoiding white)
+                            # Note: the frame is BGR, so the tuples below are (blue, green, red)
+                            bright_colors = [
+                                (0, 0, 255),    # red
+                                (0, 255, 0),    # green
+                                (255, 0, 0),    # blue
+                                (0, 255, 255),  # yellow
+                                (255, 0, 255),  # magenta
+                                (255, 128, 0),  # azure
+                                (128, 0, 255),  # pink
+                            ]
+                            object_colors[obj_id] = bright_colors[obj_id % len(bright_colors)]
+
+                        # Update tracking info for this frame
+                        current_detected_objects[obj_id] = (box, cls_id, conf)
+
+                        color = object_colors.get(obj_id, (0, 255, 0))  # Default to green
+                        label = f"{names.get(cls_id, 'Unknown')}-{obj_id}: {conf:.2f}"
+
+                        # Smooth the bounding box: for a known object, blend with its position in the previous frame
+                        if obj_id in last_detected_objects:
+                            old_box, _, _ = last_detected_objects[obj_id]
+                            old_x1, old_y1, old_x2, old_y2 = map(int, old_box)
+                            # Smoothing coefficients
+                            alpha = 0.7  # Weight of the current position
+                            beta = 0.3   # Weight of the previous position
+
+                            x1 = int(alpha * x1 + beta * old_x1)
+                            y1 = int(alpha * y1 + beta * old_y1)
+                            x2 = int(alpha * x2 + beta * old_x2)
+                            y2 = int(alpha * y2 + beta * old_y2)
+
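This is a simple linear blend with alpha + beta = 1: if the current x1 is 100 and the previous frame's was 90, the drawn coordinate becomes int(0.7 * 100 + 0.3 * 90) = 97, which damps frame-to-frame jitter in the drawn boxes.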
+                        # Draw the box and label
+                        cv2.rectangle(frame, (x1, y1), (x2, y2), color, 2)
+                        # Draw a filled background sized to the text, then the label on top
+                        (w, h), _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 2)
+                        cv2.rectangle(frame, (x1, y1 - h - 10), (x1 + w, y1 - 10), color, -1)
+                        cv2.putText(frame, label, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1, cv2.LINE_AA)
+
+                    # Carry the tracking info forward to the next frame
+                    last_detected_objects = current_detected_objects.copy()
+
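The greedy per-box match above is cheap, but when several boxes of the same class overlap heavily it can hand the same tracked ID to more than one detection. If that ever matters, a globally optimal assignment over the IoU matrix is the usual alternative. A hedged sketch reusing calculate_iou from above (scipy is not among this commit's dependencies):

import numpy as np
from scipy.optimize import linear_sum_assignment

def match_detections(new_boxes, old_boxes, iou_threshold=0.3):
    """Return {new_index: old_index} pairs that maximize total IoU."""
    iou = np.array([[calculate_iou(n, o) for o in old_boxes] for n in new_boxes])
    rows, cols = linear_sum_assignment(-iou)  # maximize IoU by minimizing its negative
    return {r: c for r, c in zip(rows, cols) if iou[r, c] > iou_threshold}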
+                    # Draw the current scene description on the annotated frame
+                    cv2.putText(frame, f"Scene: {current_description_for_frame[:80]}...", (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 0, 0), 3, cv2.LINE_AA)  # Black outline
+                    cv2.putText(frame, f"Scene: {current_description_for_frame[:80]}...", (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 255, 255), 2, cv2.LINE_AA)  # White text
+                    current_frame_annotated = True  # (assumed reset to False at the top of each loop iteration)
+
+                # If this frame wasn't processed, fall back to the last known description
+                if not current_frame_annotated:
+                    cv2.putText(frame, f"Scene: {last_description[:80]}...", (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 0, 0), 3, cv2.LINE_AA)
+                    cv2.putText(frame, f"Scene: {last_description[:80]}...", (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 255, 255), 2, cv2.LINE_AA)
+
+                # Write the frame (annotated or original) to the output video
+                out.write(frame)
+
+        except Exception as e:
+            print(f"Error during video processing loop for {video_path}: {e}")
+            import traceback
+            traceback.print_exc()
+            summary_lines.append(f"An error occurred during processing: {e}")
+        finally:
+            # Release resources
+            cap.release()
+            out.release()
+            print(f"Video processing finished. Resources released. Output path: {output_path}")
+            if not os.path.exists(output_path) or os.path.getsize(output_path) == 0:
+                print(f"Error: Output video file was not created or is empty at {output_path}")
+                summary_lines.append("Error: Failed to create output video.")
+                output_path = None
+
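As an aside, the release logic could also be wrapped in a small context manager so the capture cannot leak even on early returns; a minimal sketch (managed_capture is a hypothetical helper, not part of this commit):

from contextlib import contextmanager

@contextmanager
def managed_capture(path):
    cap = cv2.VideoCapture(path)
    try:
        yield cap
    finally:
        cap.release()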
+        end_time = time.time()
+        processing_time = end_time - start_time
+        summary_lines.insert(0, f"Finished processing in {processing_time:.2f} seconds.")
+        summary_lines.insert(1, f"Processed {processed_frame_count} frames out of {frame_count} (interval: {process_interval} frames).")
+        summary_lines.insert(2, f"Scene description updated approximately every {scene_desc_interval_sec} seconds.")
+
+        # Generate aggregate statistics
+        aggregated_stats = {
+            "total_frames_read": frame_count,
+            "total_frames_processed": processed_frame_count,
+            "avg_objects_per_processed_frame": 0,  # Calculated below
+            "cumulative_detections": {},           # Total times each class was detected
+            "max_concurrent_detections": {}        # Max count of each class in a single processed frame
+        }
+        object_cumulative_counts = {}
+        object_max_concurrent_counts = {}  # Maximum count observed for each object class
+        total_detected_in_processed = 0
+
+        # Iterate through the stats collected from each processed frame
+        for frame_stats in all_stats:
+            total_objects_in_frame = frame_stats.get("total_objects", 0)
+            total_detected_in_processed += total_objects_in_frame
+
+            # Iterate through the object classes detected in this frame
+            for obj_name, obj_data in frame_stats.get("class_statistics", {}).items():
+                count_in_frame = obj_data.get("count", 0)
+
+                # Cumulative count across all processed frames
+                object_cumulative_counts[obj_name] = object_cumulative_counts.get(obj_name, 0) + count_in_frame
+
+                # Maximum concurrent count in any single processed frame
+                object_max_concurrent_counts[obj_name] = max(object_max_concurrent_counts.get(obj_name, 0), count_in_frame)
+
+        # Add the sorted results to the final dictionary
+        aggregated_stats["cumulative_detections"] = dict(sorted(object_cumulative_counts.items(), key=lambda item: item[1], reverse=True))
+        aggregated_stats["max_concurrent_detections"] = dict(sorted(object_max_concurrent_counts.items(), key=lambda item: item[1], reverse=True))
+
+        # Calculate the average number of objects per processed frame
+        if processed_frame_count > 0:
+            aggregated_stats["avg_objects_per_processed_frame"] = round(total_detected_in_processed / processed_frame_count, 2)
+
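To make the two aggregates concrete: if frame A detects {"car": 3} and frame B detects {"car": 5}, then cumulative_detections records car: 8, max_concurrent_detections records car: 5, and avg_objects_per_processed_frame comes out to (3 + 5) / 2 = 4.0.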
+        summary_text = "\n".join(summary_lines)
+        print("Generated Summary:\n", summary_text)
+        print("Aggregated Stats:\n", aggregated_stats)
+
+        # Return the potentially updated output_path (set to None above if the file failed to materialize)
+        return output_path, summary_text, aggregated_stats
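Tying it together, a hedged usage sketch of this method; the constructor argument and the method name process_video are assumptions inferred from this excerpt, not confirmed signatures:

processor = VideoProcessor(image_processor)
output_path, summary_text, aggregated_stats = processor.process_video(
    video_path="input.mp4",
    model_name="yolov8n.pt",        # illustrative values
    confidence_threshold=0.25,
    process_interval=5,             # run detection on every 5th frame
    scene_desc_interval_sec=3.0,    # refresh the description about every 3 seconds
)
print(summary_text)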