DawnC committed on
Commit c0fe80d · verified · 1 Parent(s): b62df99

Add new "Video Process" feature and fix formatting issues

Files changed (8)
  1. app.py +514 -360
  2. clip_analyzer.py +2 -1
  3. enhance_scene_describer.py +257 -69
  4. lighting_analyzer.py +71 -71
  5. requirements.txt +2 -0
  6. scene_analyzer.py +2 -13
  7. style.py +140 -0
  8. video_processor.py +346 -0
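The new video pipeline hinges on the VideoProcessor class added in video_processor.py. That file is not expanded in this view, so the sketch below only restates the interface app.py depends on, inferred from the call sites in the diff; the method body is a placeholder, not the actual implementation.

# Interface sketch (inferred from app.py's usage below; illustrative only)
from typing import Any, Dict, Tuple

class VideoProcessor:
    def __init__(self, image_processor) -> None:
        # Reuses the existing ImageProcessor for per-frame detection
        self.image_processor = image_processor

    def process_video_file(
        self,
        video_path: str,
        model_name: str,
        confidence_threshold: float,
        process_interval: int,
    ) -> Tuple[str, str, Dict[str, Any]]:
        """Run detection on every Nth frame and return
        (annotated_video_path, summary_text, aggregated_stats)."""
        raise NotImplementedError  # real implementation lives in video_processor.py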
app.py CHANGED
@@ -3,6 +3,10 @@ import numpy as np
3
  import matplotlib.pyplot as plt
4
  import gradio as gr
5
  from typing import Dict, List, Any, Optional, Tuple
6
  import spaces
7
 
8
  from detection_model import DetectionModel
@@ -10,441 +14,591 @@ from color_mapper import ColorMapper
10
  from evaluation_metrics import EvaluationMetrics
11
  from style import Style
12
  from image_processor import ImageProcessor
 
13
 
14
- # Initialize image processor
15
  image_processor = ImageProcessor()
 
16
 
 
17
  def get_all_classes():
18
- """
19
- Get all available COCO classes from the currently active model or fallback to standard COCO classes
20
-
21
- Returns:
22
- List of tuples (class_id, class_name)
23
- """
24
- # Try to get class names from any loaded model
25
- for model_name, model_instance in image_processor.model_instances.items():
26
- if model_instance and model_instance.is_model_loaded:
27
- try:
28
- class_names = model_instance.class_names
29
- return [(idx, name) for idx, name in class_names.items()]
30
- except Exception:
31
- pass
32
-
33
- # Fallback to standard COCO classes
34
- return [
35
- (0, 'person'), (1, 'bicycle'), (2, 'car'), (3, 'motorcycle'), (4, 'airplane'),
36
- (5, 'bus'), (6, 'train'), (7, 'truck'), (8, 'boat'), (9, 'traffic light'),
37
- (10, 'fire hydrant'), (11, 'stop sign'), (12, 'parking meter'), (13, 'bench'),
38
- (14, 'bird'), (15, 'cat'), (16, 'dog'), (17, 'horse'), (18, 'sheep'), (19, 'cow'),
39
- (20, 'elephant'), (21, 'bear'), (22, 'zebra'), (23, 'giraffe'), (24, 'backpack'),
40
- (25, 'umbrella'), (26, 'handbag'), (27, 'tie'), (28, 'suitcase'), (29, 'frisbee'),
41
- (30, 'skis'), (31, 'snowboard'), (32, 'sports ball'), (33, 'kite'), (34, 'baseball bat'),
42
- (35, 'baseball glove'), (36, 'skateboard'), (37, 'surfboard'), (38, 'tennis racket'),
43
- (39, 'bottle'), (40, 'wine glass'), (41, 'cup'), (42, 'fork'), (43, 'knife'),
44
- (44, 'spoon'), (45, 'bowl'), (46, 'banana'), (47, 'apple'), (48, 'sandwich'),
45
- (49, 'orange'), (50, 'broccoli'), (51, 'carrot'), (52, 'hot dog'), (53, 'pizza'),
46
- (54, 'donut'), (55, 'cake'), (56, 'chair'), (57, 'couch'), (58, 'potted plant'),
47
- (59, 'bed'), (60, 'dining table'), (61, 'toilet'), (62, 'tv'), (63, 'laptop'),
48
- (64, 'mouse'), (65, 'remote'), (66, 'keyboard'), (67, 'cell phone'), (68, 'microwave'),
49
- (69, 'oven'), (70, 'toaster'), (71, 'sink'), (72, 'refrigerator'), (73, 'book'),
50
- (74, 'clock'), (75, 'vase'), (76, 'scissors'), (77, 'teddy bear'), (78, 'hair drier'),
51
- (79, 'toothbrush')
52
- ]
53
 
54
  @spaces.GPU
55
- def process_and_plot(image, model_name, confidence_threshold, filter_classes=None):
56
- """
57
- Process image and create plots for statistics with enhanced visualization
58
-
59
- Args:
60
- image: Input image
61
- model_name: Name of the model to use
62
- confidence_threshold: Confidence threshold for detection
63
- filter_classes: Optional list of classes to filter results
64
-
65
- Returns:
66
- Tuple of results including lighting conditions
67
- """
68
  try:
69
- class_ids = None
70
  if filter_classes:
71
- class_ids = []
 
 
72
  for class_str in filter_classes:
 
 
73
  try:
74
- # Extract ID from format "id: name"
75
- class_id = int(class_str.split(":")[0].strip())
76
- class_ids.append(class_id)
77
- except:
78
- continue
79
-
80
- # Execute detection
81
  result_image, result_text, stats = image_processor.process_image(
82
  image,
83
  model_name,
84
  confidence_threshold,
85
- class_ids
86
  )
87
 
88
- # Format the statistics for better display
89
  formatted_stats = image_processor.format_json_for_display(stats)
90
 
91
- if not stats or "class_statistics" not in stats or not stats["class_statistics"]:
92
- # Create the table
93
  fig, ax = plt.subplots(figsize=(8, 6))
94
- ax.text(0.5, 0.5, "No detection data available",
95
- ha='center', va='center', fontsize=14, fontfamily='Arial')
96
- ax.set_xlim(0, 1)
97
- ax.set_ylim(0, 1)
98
  ax.axis('off')
99
  plot_figure = fig
100
- else:
101
- # Prepare visualization data
102
- available_classes = dict(get_all_classes())
103
- viz_data = image_processor.prepare_visualization_data(stats, available_classes)
104
-
105
- # Create plot
106
- plot_figure = EvaluationMetrics.create_enhanced_stats_plot(viz_data)
107
 
108
  # Extract scene analysis info
109
  scene_analysis = stats.get("scene_analysis", {})
110
-
111
- scene_desc = scene_analysis.get("description", "No scene analysis available.")
112
- scene_desc = scene_desc.strip()
113
-
114
- # HTML format
115
- scene_desc_html = f"""
116
- <div id='scene-desc-container' style='width:100%; padding:20px; text-align:center; background-color:#f5f9fc; border-radius:8px; margin:10px auto; min-height:200px; max-height:none; overflow-y:auto;'>
117
- <div style='width:100%; text-align:center; margin:0 auto; font-family:Arial, sans-serif; font-size:14px; line-height:1.8;'>
118
- {scene_desc}
119
- </div>
120
- </div>
121
- """
122
-
123
- # Extract lighting conditions
124
- lighting_conditions = scene_analysis.get("lighting_conditions",
125
- {"time_of_day": "unknown", "confidence": 0.0})
126
-
127
- # Prepare the activities list
128
- activities = scene_analysis.get("possible_activities", [])
129
- if not activities:
130
- activities_data = [["No activities detected"]]
131
  else:
132
- activities_data = [[activity] for activity in activities]
133
 
134
- # Prepare the safety concerns list
135
- safety_concerns = scene_analysis.get("safety_concerns", [])
136
- if not safety_concerns:
137
- safety_data = [["No safety concerns detected"]]
138
  else:
139
- safety_data = [[concern] for concern in safety_concerns]
140
 
141
- # Functional zones
142
  zones = scene_analysis.get("functional_zones", {})
 
143
 
144
- return result_image, result_text, formatted_stats, plot_figure, scene_desc, activities_data, safety_data, zones, lighting_conditions
 
145
 
146
  except Exception as e:
147
- # Make sure valid data is returned even on error
148
  import traceback
149
  error_msg = f"Error processing image: {str(e)}\n{traceback.format_exc()}"
150
- print(error_msg)
151
-
152
- # Create a simple error figure
153
- fig, ax = plt.subplots(figsize=(8, 6))
154
- ax.text(0.5, 0.5, f"Error: {str(e)}",
155
- ha='center', va='center', fontsize=14, fontfamily='Arial', color='red')
156
- ax.set_xlim(0, 1)
157
- ax.set_ylim(0, 1)
158
  ax.axis('off')
159
 
160
- # Return valid default values
161
- return None, error_msg, "{}", fig, "Error processing image", [["No activities"]], [["No safety concerns"]], {}, {"time_of_day": "unknown", "confidence": 0}
162
 
 
163
  def create_interface():
164
- """創建 Gradio 界面"""
165
  css = Style.get_css()
166
-
167
- # Get available model info
168
  available_models = DetectionModel.get_available_models()
169
  model_choices = [model["model_file"] for model in available_models]
170
- model_labels = [f"{model['name']} - {model['inference_speed']}" for model in available_models]
171
-
172
- # Available class filter options
173
- available_classes = get_all_classes()
174
- class_choices = [f"{id}: {name}" for id, name in available_classes]
175
 
176
- # Create the Gradio Blocks interface
177
  with gr.Blocks(css=css, theme=gr.themes.Soft(primary_hue="teal", secondary_hue="blue")) as demo:
178
- # Header at the top of the home page
 
179
  with gr.Group(elem_classes="app-header"):
180
  gr.HTML("""
181
  <div style="text-align: center; width: 100%; padding: 2rem 0 3rem 0; background: linear-gradient(135deg, #f0f9ff, #e1f5fe);">
182
  <h1 style="font-size: 3.5rem; margin-bottom: 0.5rem; background: linear-gradient(90deg, #38b2ac, #4299e1); -webkit-background-clip: text; -webkit-text-fill-color: transparent; font-weight: bold; font-family: 'Arial', sans-serif;">VisionScout</h1>
183
-
184
- <h2 style="color: #4A5568; font-size: 1.2rem; font-weight: 400; margin-top: 0.5rem; margin-bottom: 1.5rem; font-family: 'Arial', sans-serif;">Detect and identify objects in your images</h2>
185
-
186
- <div style="display: flex; justify-content: center; gap: 10px; margin: 0.5rem 0;">
187
- <div style="height: 3px; width: 80px; background: linear-gradient(90deg, #38b2ac, #4299e1);"></div>
188
- </div>
189
-
190
  <div style="display: flex; justify-content: center; gap: 25px; margin-top: 1.5rem;">
191
- <div style="padding: 8px 15px; border-radius: 20px; background: rgba(66, 153, 225, 0.15); color: #2b6cb0; font-weight: 500; font-size: 0.9rem;">
192
- <span style="margin-right: 6px;">🔍</span> Object Detection
193
- </div>
194
- <div style="padding: 8px 15px; border-radius: 20px; background: rgba(56, 178, 172, 0.15); color: #2b6cb0; font-weight: 500; font-size: 0.9rem;">
195
- <span style="margin-right: 6px;">🌐</span> Scene Understanding
196
- </div>
197
- <div style="padding: 8px 15px; border-radius: 20px; background: rgba(66, 153, 225, 0.15); color: #2b6cb0; font-weight: 500; font-size: 0.9rem;">
198
- <span style="margin-right: 6px;">📊</span> Visual Analysis
199
- </div>
200
- </div>
201
-
202
- <div style="margin-top: 20px; padding: 10px 15px; background-color: rgba(255, 248, 230, 0.9); border-left: 3px solid #f6ad55; border-radius: 6px; max-width: 600px; margin-left: auto; margin-right: auto; text-align: left;">
203
- <p style="margin: 0; font-size: 0.9rem; color: #805ad5; font-weight: 500;">
204
- <span style="margin-right: 5px;">📱</span> iPhone users: HEIC images are not supported.
205
- <a href="https://cloudconvert.com/heic-to-jpg" target="_blank" style="color: #3182ce; text-decoration: underline;">Convert HEIC to JPG here</a> before uploading.
206
- </p>
207
  </div>
208
  </div>
209
  """)
210
211
 
212
- current_model = gr.State("yolov8m.pt") # use medium-size model as default
213
-
214
- # Main content area
215
- with gr.Row(equal_height=True):
216
- # Left side - input controls (image upload)
217
- with gr.Column(scale=4, elem_classes="input-panel"):
218
- with gr.Group():
219
- gr.HTML('<div class="section-heading">Upload Image</div>')
220
- image_input = gr.Image(type="pil", label="Upload an image", elem_classes="upload-box")
221
-
222
- with gr.Accordion("Advanced Settings", open=False):
223
- with gr.Row():
224
- model_dropdown = gr.Dropdown(
225
- choices=model_choices,
226
- value="yolov8m.pt",
227
- label="Select Model",
228
- info="Choose different models based on your needs for speed vs. accuracy"
229
- )
230
-
231
- # display model info
232
- model_info = gr.Markdown(DetectionModel.get_model_description("yolov8m.pt"))
233
-
234
- confidence = gr.Slider(
235
- minimum=0.1,
236
- maximum=0.9,
237
- value=0.25,
238
- step=0.05,
239
- label="Confidence Threshold",
240
- info="Higher values show fewer but more confident detections"
241
- )
242
-
243
- with gr.Accordion("Filter Classes", open=False):
244
- # Quick-select buttons for common object categories
245
- gr.HTML('<div class="section-heading" style="font-size: 1rem;">Common Categories</div>')
246
- with gr.Row():
247
- people_btn = gr.Button("People", size="sm")
248
- vehicles_btn = gr.Button("Vehicles", size="sm")
249
- animals_btn = gr.Button("Animals", size="sm")
250
- objects_btn = gr.Button("Common Objects", size="sm")
251
-
252
- # Class selection dropdown
253
- class_filter = gr.Dropdown(
254
- choices=class_choices,
255
- multiselect=True,
256
- label="Select Classes to Display",
257
- info="Leave empty to show all detected objects"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
258
  )
259
 
260
- # detect button
261
- detect_btn = gr.Button("Detect Objects", variant="primary", elem_classes="detect-btn")
262
-
263
- # Usage instructions area
264
- with gr.Group(elem_classes="how-to-use"):
265
- gr.HTML('<div class="section-heading">How to Use</div>')
266
- gr.Markdown("""
267
- 1. Upload an image or use the camera
268
- 2. (Optional) Adjust settings like confidence threshold or model size (n, m, x)
269
- 3. Optionally filter to specific object classes
270
- 4. Click "Detect Objects" button
271
-
272
- The model will identify objects in your image and display them with bounding boxes.
273
-
274
- **Note:** Detection quality depends on image clarity and model settings.
275
- """)
276
-
277
- # Right side - results display area
278
- with gr.Column(scale=6, elem_classes="output-panel"):
279
- with gr.Tabs(elem_classes="tabs"):
280
- with gr.Tab("Detection Result"):
281
- result_image = gr.Image(type="pil", label="Detection Result")
282
-
283
- # details summary
284
- with gr.Group(elem_classes="result-details-box"):
285
- gr.HTML('<div class="section-heading">Detection Details</div>')
286
- # Textbox settings for a wider display
287
- result_text = gr.Textbox(
288
- label=None,
289
- lines=15,
290
- max_lines=20,
291
- elem_classes="wide-result-text",
292
- elem_id="detection-details",
293
- container=False,
294
- scale=2,
295
- min_width=600
296
- )
297
 
298
- # Scene Analysis
299
- with gr.Tab("Scene Understanding", elem_classes="scene-understanding-tab"):
300
- with gr.Group(elem_classes="result-details-box"):
301
- gr.HTML("""
302
- <div class="section-heading">Scene Analysis</div>
303
- <details class="info-details" style="margin: 5px 0 15px 0;">
304
- <summary style="padding: 8px; background-color: #f0f7ff; border-radius: 6px; border-left: 3px solid #4299e1; font-weight: bold; cursor: pointer; color: #2b6cb0;">
305
- 🔍 The AI Vision Scout Report: Click for important notes about this analysis
306
- </summary>
307
- <div style="margin-top: 8px; padding: 10px; background-color: #f8f9fa; border-radius: 6px; border: 1px solid #e2e8f0;">
308
- <p style="font-size: 13px; color: #718096; margin: 0;">
309
- <b>About this analysis:</b> This analysis is the model's best guess based on visible objects.
310
- Like human scouts, it sometimes gets lost or sees things that aren't there (but don't we all?).
311
- Consider this an educated opinion rather than absolute truth. For critical applications, always verify with human eyes! 🧐
312
  </p>
313
  </div>
314
- </details>
315
- """)
316
-
317
- # Use a container better suited to long text
318
- with gr.Group(elem_classes="scene-description-container"):
319
- scene_description = gr.HTML(
320
- value="<div id='scene-desc-container'></div>",
321
- label="Scene Description"
322
- )
323
-
324
- with gr.Row():
325
- with gr.Column(scale=2):
326
- activities_list = gr.Dataframe(
327
- headers=["Activities"],
328
- datatype=["str"],
329
- col_count=1,
330
- row_count=5,
331
- elem_classes="full-width-element"
332
- )
333
-
334
- with gr.Column(scale=2):
335
- safety_list = gr.Dataframe(
336
- headers=["Safety Concerns"],
337
- datatype=["str"],
338
- col_count=1,
339
- row_count=5,
340
- elem_classes="full-width-element"
341
- )
342
-
343
- gr.HTML('<div class="section-heading">Functional Zones</div>')
344
- zones_json = gr.JSON(label=None, elem_classes="json-box")
345
-
346
- gr.HTML('<div class="section-heading">Lighting Conditions</div>')
347
- lighting_info = gr.JSON(label=None, elem_classes="json-box")
348
-
349
- with gr.Tab("Statistics"):
350
- with gr.Row():
351
- with gr.Column(scale=3, elem_classes="plot-column"):
352
- gr.HTML('<div class="section-heading">Object Distribution</div>')
353
- plot_output = gr.Plot(
354
- label=None,
355
- elem_classes="large-plot-container"
356
  )
357
-
358
- # JSON data reads more clearly on the right side
359
- with gr.Column(scale=2, elem_classes="stats-column"):
360
- gr.HTML('<div class="section-heading">Detection Statistics</div>')
361
- stats_json = gr.JSON(
362
- label=None, # remove label
363
- elem_classes="enhanced-json-display"
364
  )
365
 
366
- detect_btn.click(
367
- fn=process_and_plot,
368
- inputs=[image_input, current_model, confidence, class_filter],
369
- outputs=[
370
- result_image, result_text, stats_json, plot_output,
371
- scene_description, activities_list, safety_list, zones_json,
372
- lighting_info
373
- ]
374
- )
375
-
376
- # model option
377
- model_dropdown.change(
378
- fn=lambda model: (model, DetectionModel.get_model_description(model)),
379
- inputs=[model_dropdown],
380
- outputs=[current_model, model_info]
381
- )
382
-
383
- # Links for each class group
384
- people_classes = [0] # person
385
- vehicles_classes = [1, 2, 3, 4, 5, 6, 7, 8] # various vehicles
386
- animals_classes = list(range(14, 24)) # animals in COCO
387
- common_objects = [41, 42, 43, 44, 45, 67, 73, 74, 76] # common household items
388
 
389
- # Link the quick buttons
390
- people_btn.click(
391
- lambda: [f"{id}: {name}" for id, name in available_classes if id in people_classes],
392
- outputs=class_filter
393
- )
394
 
395
- vehicles_btn.click(
396
- lambda: [f"{id}: {name}" for id, name in available_classes if id in vehicles_classes],
397
- outputs=class_filter
398
  )
399
 
400
- animals_btn.click(
401
- lambda: [f"{id}: {name}" for id, name in available_classes if id in animals_classes],
402
- outputs=class_filter
403
  )
404
 
405
- objects_btn.click(
406
- lambda: [f"{id}: {name}" for id, name in available_classes if id in common_objects],
407
- outputs=class_filter
408
  )
409
 
410
- example_images = [
411
- "room_01.jpg",
412
- "room_02.jpg",
413
- "street_02.jpg",
414
- "street_04.jpg"
415
- ]
416
-
417
- # add example images
418
- gr.Examples(
419
- examples=example_images,
420
- inputs=image_input,
421
- outputs=None,
422
- fn=None,
423
- cache_examples=False,
424
  )
425
 
426
-
427
  # Footer
428
  gr.HTML("""
429
- <div class="footer" style="padding: 25px 0; text-align: center; background: linear-gradient(to right, #f5f9fc, #e1f5fe); border-top: 1px solid #e2e8f0; margin-top: 30px;">
430
- <div style="margin-bottom: 15px;">
431
- <p style="font-size: 14px; color: #4A5568; margin: 5px 0;">Powered by YOLOv8, CLIP and Ultralytics • Created with Gradio</p>
432
- <p style="font-size: 14px; color: #4A5568; margin: 5px 0;">Model can detect 80 different classes of objects</p>
433
- </div>
434
-
435
- <div style="display: flex; align-items: center; justify-content: center; gap: 20px; margin-top: 15px;">
436
- <p style="font-family: 'Arial', sans-serif; font-size: 14px; font-weight: 500; letter-spacing: 2px; background: linear-gradient(90deg, #38b2ac, #4299e1); -webkit-background-clip: text; -webkit-text-fill-color: transparent; margin: 0; text-transform: uppercase; display: inline-block;">EXPLORE THE CODE →</p>
437
- <a href="https://github.com/Eric-Chung-0511/Learning-Record/tree/main/Data%20Science%20Projects/VisionScout" style="text-decoration: none;">
438
- <img src="https://img.shields.io/badge/GitHub-VisionScout-4299e1?logo=github&style=for-the-badge">
439
- </a>
440
- </div>
441
- </div>
442
- """)
443
 
444
  return demo
445
 
 
446
  if __name__ == "__main__":
447
- import time
448
 
449
- demo = create_interface()
450
- demo.launch()
 
3
  import matplotlib.pyplot as plt
4
  import gradio as gr
5
  from typing import Dict, List, Any, Optional, Tuple
6
+ import cv2
7
+ from PIL import Image
8
+ import tempfile
9
+ import uuid
+ import os  # used by download_video_from_url below (os.path.join / os.path.exists / os.path.getsize)
10
  import spaces
11
 
12
  from detection_model import DetectionModel
 
14
  from evaluation_metrics import EvaluationMetrics
15
  from style import Style
16
  from image_processor import ImageProcessor
17
+ from video_processor import VideoProcessor
18
 
19
+ # Initialize Processors
20
  image_processor = ImageProcessor()
21
+ video_processor = VideoProcessor(image_processor)
22
 
23
+ # Helper Function
24
  def get_all_classes():
25
+ """Gets all available COCO classes."""
26
+ # Try to get from a loaded model first
27
+ if image_processor and image_processor.model_instances:
28
+ for model_instance in image_processor.model_instances.values():
29
+ if model_instance and model_instance.is_model_loaded:
30
+ try:
31
+ # Ensure class_names is a dict {id: name}
32
+ if isinstance(model_instance.class_names, dict):
33
+ return sorted([(int(idx), name) for idx, name in model_instance.class_names.items()])
34
+ except Exception as e:
35
+ print(f"Error getting class names from model: {e}")
36
+
37
+ # Fallback to standard COCO (ensure keys are ints)
38
+ default_classes = {
39
+ 0: 'person', 1: 'bicycle', 2: 'car', 3: 'motorcycle', 4: 'airplane', 5: 'bus',
40
+ 6: 'train', 7: 'truck', 8: 'boat', 9: 'traffic light', 10: 'fire hydrant',
41
+ 11: 'stop sign', 12: 'parking meter', 13: 'bench', 14: 'bird', 15: 'cat',
42
+ 16: 'dog', 17: 'horse', 18: 'sheep', 19: 'cow', 20: 'elephant', 21: 'bear',
43
+ 22: 'zebra', 23: 'giraffe', 24: 'backpack', 25: 'umbrella', 26: 'handbag',
44
+ 27: 'tie', 28: 'suitcase', 29: 'frisbee', 30: 'skis', 31: 'snowboard',
45
+ 32: 'sports ball', 33: 'kite', 34: 'baseball bat', 35: 'baseball glove',
46
+ 36: 'skateboard', 37: 'surfboard', 38: 'tennis racket', 39: 'bottle',
47
+ 40: 'wine glass', 41: 'cup', 42: 'fork', 43: 'knife', 44: 'spoon', 45: 'bowl',
48
+ 46: 'banana', 47: 'apple', 48: 'sandwich', 49: 'orange', 50: 'broccoli',
49
+ 51: 'carrot', 52: 'hot dog', 53: 'pizza', 54: 'donut', 55: 'cake', 56: 'chair',
50
+ 57: 'couch', 58: 'potted plant', 59: 'bed', 60: 'dining table', 61: 'toilet',
51
+ 62: 'tv', 63: 'laptop', 64: 'mouse', 65: 'remote', 66: 'keyboard',
52
+ 67: 'cell phone', 68: 'microwave', 69: 'oven', 70: 'toaster', 71: 'sink',
53
+ 72: 'refrigerator', 73: 'book', 74: 'clock', 75: 'vase', 76: 'scissors',
54
+ 77: 'teddy bear', 78: 'hair drier', 79: 'toothbrush'
55
+ }
56
+ return sorted(default_classes.items())
57
 
58
  @spaces.GPU
59
+ def handle_image_upload(image, model_name, confidence_threshold, filter_classes=None):
60
+ """Processes a single uploaded image."""
61
+ print(f"Processing image with model: {model_name}, confidence: {confidence_threshold}")
 
 
 
 
 
 
 
 
 
 
62
  try:
63
+ class_ids_to_filter = None
64
  if filter_classes:
65
+ class_ids_to_filter = []
66
+ available_classes_dict = dict(get_all_classes())
67
+ name_to_id = {name: id for id, name in available_classes_dict.items()}
68
  for class_str in filter_classes:
69
+ class_name_or_id = class_str.split(":")[0].strip()
70
+ class_id = -1
71
  try:
72
+ class_id = int(class_name_or_id)
73
+ if class_id not in available_classes_dict:
74
+ class_id = -1
75
+ except ValueError:
76
+ if class_name_or_id in name_to_id:
77
+ class_id = name_to_id[class_name_or_id]
78
+ elif class_str in name_to_id: # Check full string "id: name"
79
+ class_id = name_to_id[class_str]
80
+
81
+ if class_id != -1:
82
+ class_ids_to_filter.append(class_id)
83
+ else:
84
+ print(f"Warning: Could not parse class filter: {class_str}")
85
+ print(f"Filtering image results for class IDs: {class_ids_to_filter}")
86
+
87
+ # Call the existing image processing logic
88
  result_image, result_text, stats = image_processor.process_image(
89
  image,
90
  model_name,
91
  confidence_threshold,
92
+ class_ids_to_filter
93
  )
94
 
95
+ # Format stats for JSON display
96
  formatted_stats = image_processor.format_json_for_display(stats)
97
 
98
+ # Prepare visualization data for the plot
99
+ plot_figure = None
100
+ if stats and "class_statistics" in stats and stats["class_statistics"]:
101
+ available_classes_dict = dict(get_all_classes())
102
+ viz_data = image_processor.prepare_visualization_data(stats, available_classes_dict)
103
+ if "error" not in viz_data:
104
+ plot_figure = EvaluationMetrics.create_enhanced_stats_plot(viz_data)
105
+ else:
106
+ fig, ax = plt.subplots(figsize=(8, 6))
107
+ ax.text(0.5, 0.5, viz_data["error"], ha='center', va='center', fontsize=12)
108
+ ax.axis('off')
109
+ plot_figure = fig
110
+ else:
111
  fig, ax = plt.subplots(figsize=(8, 6))
112
+ ax.text(0.5, 0.5, "No detection data for plot", ha='center', va='center', fontsize=12)
 
 
 
113
  ax.axis('off')
114
  plot_figure = fig
115
 
116
  # Extract scene analysis info
117
  scene_analysis = stats.get("scene_analysis", {})
118
+ scene_desc = scene_analysis.get("description", "Scene analysis requires detected objects.")
119
+ # Ensure scene_desc is a string before adding HTML
120
+ if not isinstance(scene_desc, str):
121
+ scene_desc = str(scene_desc)
122
+ scene_desc_html = f"<div style='padding:10px; font-family:Arial, sans-serif; line-height:1.7;'>{scene_desc}</div>"
123
+
124
+ # Prepare activities list
125
+ activities_list = scene_analysis.get("possible_activities", [])
126
+ if not activities_list:
127
+ activities_list_data = [["No specific activities inferred"]] # Data for Dataframe
128
  else:
129
+ activities_list_data = [[activity] for activity in activities_list]
130
 
131
+ # Prepare safety concerns list
132
+ safety_concerns_list = scene_analysis.get("safety_concerns", [])
133
+ if not safety_concerns_list:
134
+ safety_data = [["No safety concerns detected"]] # Data for Dataframe
135
  else:
136
+ safety_data = [[concern] for concern in safety_concerns_list]
137
 
 
138
  zones = scene_analysis.get("functional_zones", {})
139
+ lighting = scene_analysis.get("lighting_conditions", {"time_of_day": "unknown", "confidence": 0})
140
 
141
+ return (result_image, result_text, formatted_stats, plot_figure,
142
+ scene_desc_html, activities_list_data, safety_data, zones, lighting)
143
 
144
  except Exception as e:
145
+ print(f"Error in handle_image_upload: {e}")
146
  import traceback
147
  error_msg = f"Error processing image: {str(e)}\n{traceback.format_exc()}"
148
+ fig, ax = plt.subplots()
149
+ ax.text(0.5, 0.5, "Processing Error", color="red", ha="center", va="center")
 
 
 
 
 
 
150
  ax.axis('off')
151
+ # Ensure return structure matches outputs even on error
152
+ return (None, error_msg, {}, fig, f"<div>Error: {str(e)}</div>",
153
+ [["Error"]], [["Error"]], {}, {"time_of_day": "error", "confidence": 0})
154
+
155
+ def download_video_from_url(video_url, max_duration_minutes=10):
156
+ """
157
+ Downloads a video from a YouTube URL and returns the local path to the downloaded file.
158
+
159
+ Args:
160
+ video_url (str): URL of the YouTube video to download
161
+ max_duration_minutes (int): Maximum allowed video duration in minutes
162
+
163
+ Returns:
164
+ tuple: (Path to the downloaded video file or None, Error message or None)
165
+ """
166
+ try:
167
+ # Create a temporary directory to store the video
168
+ temp_dir = tempfile.gettempdir()
169
+ output_filename = f"downloaded_{uuid.uuid4().hex}.mp4"
170
+ output_path = os.path.join(temp_dir, output_filename)
171
+
172
+ # Check if it's a YouTube URL
173
+ if "youtube.com" in video_url or "youtu.be" in video_url:
174
+ # Import yt-dlp here to avoid dependency if not needed
175
+ import yt_dlp
176
+
177
+ # Setup yt-dlp options
178
+ ydl_opts = {
179
+ 'format': 'best[ext=mp4]/best', # Best quality MP4 or best available format
180
+ 'outtmpl': output_path,
181
+ 'noplaylist': True,
182
+ 'quiet': False, # Set to True to reduce output
183
+ 'no_warnings': False,
184
+ }
185
+
186
+ # First extract info to check duration
187
+ with yt_dlp.YoutubeDL(ydl_opts) as ydl:
188
+ print(f"Extracting info from YouTube URL: {video_url}")
189
+ info_dict = ydl.extract_info(video_url, download=False)
190
+
191
+ # Check if video exists
192
+ if not info_dict:
193
+ return None, "Could not retrieve video information. Please check the URL."
194
+
195
+ video_title = info_dict.get('title', 'Unknown Title')
196
+ duration = info_dict.get('duration', 0)
197
+
198
+ print(f"Video title: {video_title}")
199
+ print(f"Video duration: {duration} seconds")
200
+
201
+ # Check video duration
202
+ if duration > max_duration_minutes * 60:
203
+ return None, f"Video is too long ({duration} seconds). Maximum duration is {max_duration_minutes} minutes."
204
+
205
+ # Download the video
206
+ print(f"Downloading YouTube video: {video_title}")
207
+ ydl.download([video_url])
208
+
209
+ # Verify the file exists and has content
210
+ if not os.path.exists(output_path) or os.path.getsize(output_path) == 0:
211
+ return None, "Download failed: Empty or missing file."
212
+
213
+ print(f"Successfully downloaded video to: {output_path}")
214
+ return output_path, None
215
+ else:
216
+ return None, "Only YouTube URLs are supported at this time. Please enter a valid YouTube URL."
217
+
218
+ except Exception as e:
219
+ import traceback
220
+ error_details = traceback.format_exc()
221
+ print(f"Error downloading video: {e}\n{error_details}")
222
+ return None, f"Error downloading video: {str(e)}"
223
+
224
+
225
+ @spaces.GPU
226
+ def handle_video_upload(video_input, video_url, input_type, model_name, confidence_threshold, process_interval):
227
+ """Handles video upload or URL input and calls the VideoProcessor."""
228
+
229
+ print(f"Received video request: input_type={input_type}")
230
+ video_path = None
231
+
232
+ # Handle based on input type
233
+ if input_type == "upload" and video_input:
234
+ print(f"Processing uploaded video file")
235
+ video_path = video_input
236
+ elif input_type == "url" and video_url:
237
+ print(f"Processing video from URL: {video_url}")
238
+ # Download video from URL
239
+ video_path, error_message = download_video_from_url(video_url)
240
+ if error_message:
241
+ error_html = f"<div class='video-summary-content-wrapper'><pre>{error_message}</pre></div>"
242
+ return None, error_html, {"error": error_message}
243
+ else:
244
+ print("No valid video input provided.")
245
+ return None, "<div class='video-summary-content-wrapper'><pre>Please upload a video file or provide a valid video URL.</pre></div>", {}
246
+
247
+ print(f"Starting video processing with: model={model_name}, confidence={confidence_threshold}, interval={process_interval}")
248
+ try:
249
+ # Call the VideoProcessor method
250
+ output_video_path, summary_text, stats_dict = video_processor.process_video_file(
251
+ video_path=video_path,
252
+ model_name=model_name,
253
+ confidence_threshold=confidence_threshold,
254
+ process_interval=int(process_interval) # Ensure interval is int
255
+ )
256
+ print(f"Video processing function returned: path={output_video_path}, summary length={len(summary_text)}")
257
+
258
+ # Wrap processing summary in HTML tags for consistent styling with scene understanding page
259
+ summary_html = f"<div class='video-summary-content-wrapper'><pre>{summary_text}</pre></div>"
260
+
261
+ # Format statistics for better display
262
+ formatted_stats = {}
263
+ if stats_dict and isinstance(stats_dict, dict):
264
+ formatted_stats = stats_dict
265
+
266
+ return output_video_path, summary_html, formatted_stats
267
+
268
+ except Exception as e:
269
+ print(f"Error in handle_video_upload: {e}")
270
+ import traceback
271
+ error_msg = f"Error processing video: {str(e)}\n{traceback.format_exc()}"
272
+ error_html = f"<div class='video-summary-content-wrapper'><pre>{error_msg}</pre></div>"
273
+ return None, error_html, {"error": str(e)}
274
275
 
276
+ # Create Gradio Interface
277
  def create_interface():
278
+ """Creates the Gradio interface with Tabs."""
279
  css = Style.get_css()
 
 
280
  available_models = DetectionModel.get_available_models()
281
  model_choices = [model["model_file"] for model in available_models]
282
+ class_choices_formatted = [f"{id}: {name}" for id, name in get_all_classes()] # Use formatted choices
283
 
 
284
  with gr.Blocks(css=css, theme=gr.themes.Soft(primary_hue="teal", secondary_hue="blue")) as demo:
285
+
286
+ # Header
287
  with gr.Group(elem_classes="app-header"):
288
  gr.HTML("""
289
  <div style="text-align: center; width: 100%; padding: 2rem 0 3rem 0; background: linear-gradient(135deg, #f0f9ff, #e1f5fe);">
290
  <h1 style="font-size: 3.5rem; margin-bottom: 0.5rem; background: linear-gradient(90deg, #38b2ac, #4299e1); -webkit-background-clip: text; -webkit-text-fill-color: transparent; font-weight: bold; font-family: 'Arial', sans-serif;">VisionScout</h1>
291
+ <h2 style="color: #4A5568; font-size: 1.2rem; font-weight: 400; margin-top: 0.5rem; margin-bottom: 1.5rem; font-family: 'Arial', sans-serif;">Object Detection and Scene Understanding</h2>
292
+ <div style="display: flex; justify-content: center; gap: 10px; margin: 0.5rem 0;"><div style="height: 3px; width: 80px; background: linear-gradient(90deg, #38b2ac, #4299e1);"></div></div>
 
 
 
 
 
293
  <div style="display: flex; justify-content: center; gap: 25px; margin-top: 1.5rem;">
294
+ <div style="padding: 8px 15px; border-radius: 20px; background: rgba(66, 153, 225, 0.15); color: #2b6cb0; font-weight: 500; font-size: 0.9rem;"><span style="margin-right: 6px;">🖼️</span> Image Analysis</div>
295
+ <div style="padding: 8px 15px; border-radius: 20px; background: rgba(56, 178, 172, 0.15); color: #2b6cb0; font-weight: 500; font-size: 0.9rem;"><span style="margin-right: 6px;">🎬</span> Video Analysis</div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
296
  </div>
297
+ <div style="margin-top: 20px; padding: 10px 15px; background-color: rgba(255, 248, 230, 0.9); border-left: 3px solid #f6ad55; border-radius: 6px; max-width: 600px; margin-left: auto; margin-right: auto; text-align: left;">
298
+ <p style="margin: 0; font-size: 0.9rem; color: #805ad5; font-weight: 500;">
299
+ <span style="margin-right: 5px;">📱</span> iPhone users: HEIC images may not be supported.
300
+ <a href="https://cloudconvert.com/heic-to-jpg" target="_blank" style="color: #3182ce; text-decoration: underline;">Convert HEIC to JPG</a> before uploading if needed.
301
+ </p>
302
+ </div>
303
  </div>
304
  """)
305
 
306
+ # Main Content with Tabs
307
+ with gr.Tabs(elem_classes="tabs"):
308
+
309
+ # Tab 1: Image Processing
310
+ with gr.Tab("Image Processing"):
311
+ current_image_model = gr.State("yolov8m.pt") # State for image model selection
312
+ with gr.Row(equal_height=False): # Allow columns to have different heights
313
+ # Left Column: Image Input & Controls
314
+ with gr.Column(scale=4, elem_classes="input-panel"):
315
+ with gr.Group():
316
+ gr.HTML('<div class="section-heading">Upload Image</div>')
317
+ image_input = gr.Image(type="pil", label="Upload an image", elem_classes="upload-box")
318
+
319
+ with gr.Accordion("Image Analysis Settings", open=False):
320
+ image_model_dropdown = gr.Dropdown(
321
+ choices=model_choices,
322
+ value="yolov8m.pt", # Default for images
323
+ label="Select Model",
324
+ info="Choose speed vs. accuracy (n=fast, m=balanced, x=accurate)"
325
+ )
326
+ # Display model info
327
+ image_model_info = gr.Markdown(DetectionModel.get_model_description("yolov8m.pt"))
328
 
329
+ image_confidence = gr.Slider(
330
+ minimum=0.1, maximum=0.9, value=0.25, step=0.05,
331
+ label="Confidence Threshold",
332
+ info="Minimum confidence for displaying a detected object"
333
+ )
334
+ with gr.Accordion("Filter Classes", open=False):
335
+ gr.HTML('<div class="section-heading" style="font-size: 1rem;">Common Categories</div>')
336
+ with gr.Row():
337
+ people_btn = gr.Button("People", size="sm")
338
+ vehicles_btn = gr.Button("Vehicles", size="sm")
339
+ animals_btn = gr.Button("Animals", size="sm")
340
+ objects_btn = gr.Button("Common Objects", size="sm")
341
+ image_class_filter = gr.Dropdown(
342
+ choices=class_choices_formatted, # Use formatted choices
343
+ multiselect=True,
344
+ label="Select Classes to Display",
345
+ info="Leave empty to show all detected objects"
346
+ )
347
+
348
+ image_detect_btn = gr.Button("Analyze Image", variant="primary", elem_classes="detect-btn")
349
+
350
+ with gr.Group(elem_classes="how-to-use"):
351
+ gr.HTML('<div class="section-heading">How to Use (Image)</div>')
352
+ gr.Markdown("""
353
+ 1. Upload an image or use the camera
354
+ 2. (Optional) Adjust settings like confidence threshold or model size (n, m, x)
355
+ 3. Optionally filter to specific object classes
356
+ 4. Click the **Analyze Image** button
357
+ """)
358
+ # Image Examples
359
+ gr.Examples(
360
+ examples=[
361
+ "room_01.jpg",
362
+ "room_02.jpg",
363
+ "street_02.jpg",
364
+ "street_04.jpg"
365
+ ],
366
+ inputs=image_input,
367
+ label="Example Images"
368
+ )
369
+
370
+ # Right Column: Image Results
371
+ with gr.Column(scale=6, elem_classes="output-panel"):
372
+ with gr.Tabs(elem_classes="tabs"):
373
+ with gr.Tab("Detection Result"):
374
+ image_result_image = gr.Image(type="pil", label="Detection Result")
375
+ gr.HTML('<div class="section-heading">Detection Details</div>')
376
+ image_result_text = gr.Textbox(label=None, lines=10, elem_id="detection-details", container=False)
377
+
378
+ with gr.Tab("Scene Understanding"):
379
+ gr.HTML('<div class="section-heading">Scene Analysis</div>')
380
+ gr.HTML("""
381
+ <details class="info-details" style="margin: 5px 0 15px 0;">
382
+ <summary style="padding: 8px; background-color: #f0f7ff; border-radius: 6px; border-left: 3px solid #4299e1; font-weight: bold; cursor: pointer; color: #2b6cb0;">
383
+ 🔍 The AI Vision Scout Report: Click for important notes about this analysis
384
+ </summary>
385
+ <div style="margin-top: 8px; padding: 10px; background-color: #f8f9fa; border-radius: 6px; border: 1px solid #e2e8f0;">
386
+ <p style="font-size: 13px; color: #718096; margin: 0;">
387
+ <b>About this analysis:</b> This analysis is the model's best guess based on visible objects.
388
+ Like human scouts, it sometimes gets lost or sees things that aren't there (but don't we all?).
389
+ Consider this an educated opinion rather than absolute truth. For critical applications, always verify with human eyes! 🧐
390
+ </p>
391
+ </div>
392
+ </details>
393
+ """)
394
+
395
+ # Wrap HTML description for potential styling
396
+ image_scene_description_html = gr.HTML(label=None, elem_id="scene_analysis_description_text")
397
+
398
+ with gr.Row():
399
+ with gr.Column(scale=1):
400
+ gr.HTML('<div class="section-heading" style="font-size:1rem; text-align:left;">Possible Activities</div>')
401
+ image_activities_list = gr.Dataframe(headers=["Activity"], datatype=["str"], row_count=5, col_count=1, wrap=True)
402
+
403
+ with gr.Column(scale=1):
404
+ gr.HTML('<div class="section-heading" style="font-size:1rem; text-align:left;">Safety Concerns</div>')
405
+ image_safety_list = gr.Dataframe(headers=["Concern"], datatype=["str"], row_count=5, col_count=1, wrap=True)
406
+
407
+ gr.HTML('<div class="section-heading">Functional Zones</div>')
408
+ image_zones_json = gr.JSON(label=None, elem_classes="json-box")
409
+
410
+ gr.HTML('<div class="section-heading">Lighting Conditions</div>')
411
+ image_lighting_info = gr.JSON(label=None, elem_classes="json-box")
412
+
413
+ with gr.Tab("Statistics"):
414
+ with gr.Row():
415
+ with gr.Column(scale=3, elem_classes="plot-column"):
416
+ gr.HTML('<div class="section-heading">Object Distribution</div>')
417
+ image_plot_output = gr.Plot(label=None, elem_classes="large-plot-container")
418
+ with gr.Column(scale=2, elem_classes="stats-column"):
419
+ gr.HTML('<div class="section-heading">Detection Statistics</div>')
420
+ image_stats_json = gr.JSON(label=None, elem_classes="enhanced-json-display")
421
+
422
+ # Tab 2: Video Processing
423
+ with gr.Tab("Video Processing"):
424
+ with gr.Row(equal_height=False):
425
+ # Left Column: Video Input & Controls
426
+ with gr.Column(scale=4, elem_classes="input-panel"):
427
+ with gr.Group():
428
+ gr.HTML('<div class="section-heading">Video Input</div>')
429
+
430
+ # Add input type selection
431
+ video_input_type = gr.Radio(
432
+ ["upload", "url"],
433
+ label="Input Method",
434
+ value="upload",
435
+ info="Choose how to provide the video"
436
  )
437
 
438
+ # File upload (will be shown/hidden based on selection)
439
+ with gr.Group(elem_id="upload-video-group"):
440
+ video_input = gr.Video(
441
+ label="Upload a video file (MP4, AVI, MOV)",
442
+ sources=["upload"],
443
+ visible=True
444
+ )
445
 
446
+ # URL input (will be shown/hidden based on selection)
447
+ with gr.Group(elem_id="url-video-group"):
448
+ video_url_input = gr.Textbox(
449
+ label="Enter video URL (YouTube or direct video link)",
450
+ placeholder="https://www.youtube.com/watch?v=...",
451
+ visible=False,
452
+ elem_classes="custom-video-url-input"
453
+ )
454
+ gr.HTML("""
455
+ <div style="padding: 8px; margin-top: 5px; background-color: #fff8f8; border-radius: 4px; border-left: 3px solid #f87171; font-size: 12px;">
456
+ <p style="margin: 0; color: #4b5563;">
457
+ Note: Currently only YouTube URLs are supported. Maximum video duration is 10 minutes.
 
 
458
  </p>
459
  </div>
460
+ """)
461
+
462
+ with gr.Accordion("Video Analysis Settings", open=True):
463
+ video_model_dropdown = gr.Dropdown(
464
+ choices=model_choices,
465
+ value="yolov8n.pt", # Default 'n' for video
466
+ label="Select Model (Video)",
467
+ info="Faster models (like 'n') are recommended"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
468
  )
469
+ video_confidence = gr.Slider(
470
+ minimum=0.1, maximum=0.9, value=0.4, step=0.05,
471
+ label="Confidence Threshold (Video)"
 
 
 
 
472
  )
473
+ video_process_interval = gr.Slider(
474
+ minimum=1, maximum=60, value=10, step=1, # Allow up to 60 frame interval
475
+ label="Processing Interval (Frames)",
476
+ info="Analyze every Nth frame (higher value = faster)"
477
+ )
478
+ video_process_btn = gr.Button("Process Video", variant="primary", elem_classes="detect-btn")
479
+
480
+ with gr.Group(elem_classes="how-to-use"):
481
+ gr.HTML('<div class="section-heading">How to Use (Video)</div>')
482
+ gr.Markdown("""
483
+ 1. Choose your input method: Upload a file or enter a URL.
484
+ 2. Adjust settings if needed (using a faster model and larger interval is recommended for longer videos).
485
+ 3. Click "Process Video". **Processing can take a significant amount of time.**
486
+ 4. The annotated video and summary will appear on the right when finished.
487
+ """)
488
 
489
+ # Add video examples
490
+ gr.HTML('<div class="section-heading">Example Videos</div>')
491
+ gr.HTML("""
492
+ <div style="padding: 10px; background-color: #f0f7ff; border-radius: 6px; margin-bottom: 15px;">
493
+ <p style="font-size: 14px; color: #4A5568; margin: 0;">
494
+ Upload any video containing objects that YOLO can detect. For testing, find sample videos
495
+ <a href="https://www.pexels.com/search/videos/street/" target="_blank" style="color: #3182ce; text-decoration: underline;">here</a>.
496
+ </p>
497
+ </div>
498
+ """)
499
+
500
+ # Right Column: Video Results
501
+ with gr.Column(scale=6, elem_classes="output-panel video-result-panel"):
502
+ gr.HTML("""
503
+ <div class="section-heading">Video Result</div>
504
+ <details class="info-details" style="margin: 5px 0 15px 0;">
505
+ <summary style="padding: 8px; background-color: #f0f7ff; border-radius: 6px; border-left: 3px solid #4299e1; font-weight: bold; cursor: pointer; color: #2b6cb0;">
506
+ 🎬 Video Processing Notes
507
+ </summary>
508
+ <div style="margin-top: 8px; padding: 10px; background-color: #f8f9fa; border-radius: 6px; border: 1px solid #e2e8f0;">
509
+ <p style="font-size: 13px; color: #718096; margin: 0;">
510
+ The processed video includes bounding boxes around detected objects. For longer videos,
511
+ consider using a faster model (like YOLOv8n) and a higher frame interval to reduce processing time.
512
+ </p>
513
+ </div>
514
+ </details>
515
+ """)
516
+ video_output = gr.Video(label="Processed Video", elem_classes="video-output-container") # Output for the processed video file
517
+
518
+ gr.HTML('<div class="section-heading">Processing Summary</div>')
519
+ # 使用HTML顯示影片的摘要
520
+ video_summary_text = gr.HTML(
521
+ label=None,
522
+ elem_id="video-summary-html-output"
523
+ )
524
 
525
+ gr.HTML('<div class="section-heading">Aggregated Statistics</div>')
526
+ video_stats_json = gr.JSON(label=None, elem_classes="video-stats-display") # Display statistics
 
 
 
527
 
528
+ # Event Listeners
529
+ # Image Model Change Handler
530
+ image_model_dropdown.change(
531
+ fn=lambda model: (model, DetectionModel.get_model_description(model)),
532
+ inputs=[image_model_dropdown],
533
+ outputs=[current_image_model, image_model_info] # Update state and description
534
  )
535
 
536
+ # Image Filter Buttons
537
+ available_classes_list = get_all_classes() # Get list of (id, name)
538
+ people_classes_ids = [0]
539
+ vehicles_classes_ids = [1, 2, 3, 4, 5, 6, 7, 8]
540
+ animals_classes_ids = list(range(14, 24))
541
+ common_objects_ids = [39, 41, 42, 43, 44, 45, 56, 57, 60, 62, 63, 67, 73] # Bottle, cup, fork, knife, spoon, bowl, chair, couch, table, tv, laptop, phone, book
542
+
543
+ people_btn.click(lambda: [f"{id}: {name}" for id, name in available_classes_list if id in people_classes_ids], outputs=image_class_filter)
544
+ vehicles_btn.click(lambda: [f"{id}: {name}" for id, name in available_classes_list if id in vehicles_classes_ids], outputs=image_class_filter)
545
+ animals_btn.click(lambda: [f"{id}: {name}" for id, name in available_classes_list if id in animals_classes_ids], outputs=image_class_filter)
546
+ objects_btn.click(lambda: [f"{id}: {name}" for id, name in available_classes_list if id in common_objects_ids], outputs=image_class_filter)
547
+
548
+ video_input_type.change(
549
+ fn=lambda input_type: [
550
+ # Show/hide file upload
551
+ gr.update(visible=(input_type == "upload")),
552
+ # Show/hide URL input
553
+ gr.update(visible=(input_type == "url"))
554
+ ],
555
+ inputs=[video_input_type],
556
+ outputs=[video_input, video_url_input]
557
  )
558
 
559
+ # Image Processing Button Click
560
+ image_detect_btn.click(
561
+ fn=handle_image_upload,
562
+ inputs=[image_input, image_model_dropdown, image_confidence, image_class_filter],
563
+ outputs=[
564
+ image_result_image, image_result_text, image_stats_json, image_plot_output,
565
+ image_scene_description_html, image_activities_list, image_safety_list, image_zones_json,
566
+ image_lighting_info
567
+ ]
568
  )
569
 
570
+ video_process_btn.click(
571
+ fn=handle_video_upload,
572
+ inputs=[
573
+ video_input,
574
+ video_url_input,
575
+ video_input_type,
576
+ video_model_dropdown,
577
+ video_confidence,
578
+ video_process_interval
579
+ ],
580
+ outputs=[video_output, video_summary_text, video_stats_json]
 
 
 
581
  )
582
 
 
583
  # Footer
584
  gr.HTML("""
585
+ <div class="footer" style="padding: 25px 0; text-align: center; background: linear-gradient(to right, #f5f9fc, #e1f5fe); border-top: 1px solid #e2e8f0; margin-top: 30px;">
586
+ <div style="margin-bottom: 15px;">
587
+ <p style="font-size: 14px; color: #4A5568; margin: 5px 0;">Powered by YOLOv8, CLIP and Ultralytics • Created with Gradio</p>
588
+ </div>
589
+ <div style="display: flex; align-items: center; justify-content: center; gap: 20px; margin-top: 15px;">
590
+ <p style="font-family: 'Arial', sans-serif; font-size: 14px; font-weight: 500; letter-spacing: 2px; background: linear-gradient(90deg, #38b2ac, #4299e1); -webkit-background-clip: text; -webkit-text-fill-color: transparent; margin: 0; text-transform: uppercase; display: inline-block;">EXPLORE THE CODE →</p>
591
+ <a href="https://github.com/Eric-Chung-0511/Learning-Record/tree/main/Data%20Science%20Projects/VisionScout" target="_blank" style="text-decoration: none;">
592
+ <img src="https://img.shields.io/badge/GitHub-VisionScout-4299e1?logo=github&style=for-the-badge">
593
+ </a>
594
+ </div>
595
+ </div>
596
+ """)
 
 
597
 
598
  return demo
599
 
600
+
601
  if __name__ == "__main__":
602
+ demo_interface = create_interface()
603
 
604
+ demo_interface.launch()
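Taken together, the URL path of the new feature works like this minimal usage sketch (the URL is a hypothetical placeholder; download_video_from_url and video_processor are defined above, and yt-dlp is presumably one of the two additions to requirements.txt):

# Minimal usage sketch of the new URL pipeline
video_path, error = download_video_from_url(
    "https://www.youtube.com/watch?v=VIDEO_ID",  # hypothetical YouTube URL
    max_duration_minutes=10,
)
if error:
    print(f"Download failed: {error}")
else:
    # Same call handle_video_upload makes once it has a local file
    output_path, summary, stats = video_processor.process_video_file(
        video_path=video_path,
        model_name="yolov8n.pt",      # fast model recommended for video
        confidence_threshold=0.4,
        process_interval=10,          # analyze every 10th frame
    )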
 
clip_analyzer.py CHANGED
@@ -3,6 +3,7 @@ import clip
3
  import numpy as np
4
  from PIL import Image
5
  from typing import Dict, List, Tuple, Any, Optional, Union
 
6
  from clip_prompts import (
7
  SCENE_TYPE_PROMPTS,
8
  CULTURAL_SCENE_PROMPTS,
@@ -24,7 +25,7 @@ class CLIPAnalyzer:
24
  Initialize the CLIP analyzer.
25
 
26
  Args:
27
- model_name: CLIP Model name, "ViT-B/32"、"ViT-B/16"、"ViT-L/14"
28
  device: Use GPU if available
29
  """
30
  # Automatically select the device
 
3
  import numpy as np
4
  from PIL import Image
5
  from typing import Dict, List, Tuple, Any, Optional, Union
6
+
7
  from clip_prompts import (
8
  SCENE_TYPE_PROMPTS,
9
  CULTURAL_SCENE_PROMPTS,
 
25
  Initialize the CLIP analyzer.
26
 
27
  Args:
28
+ model_name: CLIP model name: "ViT-B/32", "ViT-B/16", or "ViT-L/14"
29
  device: Use GPU if available
30
  """
31
  # Automatically select the device
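For context, the model_name values listed in this docstring are the standard OpenAI CLIP checkpoint names, and loading one follows the usual clip.load pattern (a small illustrative snippet mirroring the analyzer's automatic device selection):

import torch
import clip

# Automatically select the device, as CLIPAnalyzer.__init__ does
device = "cuda" if torch.cuda.is_available() else "cpu"

# Any of "ViT-B/32", "ViT-B/16", "ViT-L/14" is a valid model_name
model, preprocess = clip.load("ViT-B/16", device=device)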
enhance_scene_describer.py CHANGED
@@ -126,7 +126,7 @@ class EnhancedSceneDescriber:
126
  }
127
  }
128
 
129
- # Cultural templates
130
  if "cultural_templates" not in templates:
131
  templates["cultural_templates"] = {
132
  "asian": {
@@ -164,8 +164,8 @@ class EnhancedSceneDescriber:
164
  "elevated_threshold": 0.6, # Objects mostly in middle/bottom
165
  "elevated_top_threshold": 0.3 # Few objects at top of frame
166
  }
167
-
168
-
169
  def generate_description(self,
170
  scene_type: str,
171
  detected_objects: List[Dict],
@@ -190,26 +190,23 @@ class EnhancedSceneDescriber:
190
  """
191
  # Handle unknown scene type or very low confidence
192
  if scene_type == "unknown" or confidence < 0.4:
193
- return self._generate_generic_description(detected_objects, lighting_info)
194
 
195
  # Detect viewpoint
196
- viewpoint = self._detect_viewpoint(detected_objects)
197
 
 
198
  if viewpoint == "aerial":
199
- # For intersection-related scenes, make sure the correct aerial-view intersection scene type is used
200
  if "intersection" in scene_type or self._is_intersection(detected_objects):
201
  scene_type = "aerial_view_intersection"
202
- # For commercial-area-related scenes
203
  elif any(keyword in scene_type for keyword in ["commercial", "shopping", "retail"]):
204
  scene_type = "aerial_view_commercial_area"
205
- # For plaza-related scenes
206
  elif any(keyword in scene_type for keyword in ["plaza", "square"]):
207
  scene_type = "aerial_view_plaza"
208
- # Other aerial-view scenes default to intersection
209
  else:
210
  scene_type = "aerial_view_intersection"
211
 
212
- # Detect cultural context - only for non-aerial viewpoints
213
  cultural_context = None
214
  if viewpoint != "aerial":
215
  cultural_context = self._detect_cultural_context(scene_type, detected_objects)
@@ -224,7 +221,6 @@ class EnhancedSceneDescriber:
224
 
225
  # Get base description for the scene type
226
  if viewpoint == "aerial":
227
- # Use the preset base description for aerial viewpoints
228
  if 'base_description' not in locals():
229
  base_description = "An aerial view showing the layout and movement patterns from above"
230
  elif scene_type in self.scene_types:
@@ -240,25 +236,38 @@ class EnhancedSceneDescriber:
240
  viewpoint
241
  )
242
 
243
- # Fix: improve the description based on the people count
244
- people_objs = [obj for obj in detected_objects if obj["class_id"] == 0] # person
245
  if people_objs:
246
  people_count = len(people_objs)
247
  if people_count > 5:
248
- # Use more precise wording when there are many people
249
  people_phrase = f"numerous people ({people_count})"
250
  else:
251
  people_phrase = f"{people_count} {'people' if people_count > 1 else 'person'}"
252
 
253
- # 將人數信息加入到場景詳情中
254
- if "people" not in scene_details.lower() and "pedestrian" not in scene_details.lower():
255
- scene_details += f" The scene includes {people_phrase}."
256
 
257
- # Apply cultural context if detected (只在非空中視角時應用)
258
- if cultural_context and scene_details and viewpoint != "aerial":
259
  cultural_elements = self._generate_cultural_elements(cultural_context)
260
  if cultural_elements:
261
- scene_details += f" {cultural_elements}"
 
 
 
 
 
262
 
263
  # Include lighting information if available
264
  lighting_description = ""
@@ -267,22 +276,25 @@ class EnhancedSceneDescriber:
267
  if lighting_type in self.templates.get("lighting_templates", {}):
268
  lighting_description = self.templates["lighting_templates"][lighting_type]
269
 
270
- # Apply confidence template
271
- description_template = self.templates["confidence_templates"].get(
272
- confidence_level, "{description} {details}"
273
- )
274
-
275
- # Fill the template
276
- description = description_template.format(
277
- description=base_description,
278
- details=scene_details
279
- )
280
 
281
- # Add viewpoint observation if viewpoint is not standard
282
  if viewpoint != "eye_level" and viewpoint in self.templates.get("viewpoint_templates", {}):
283
  viewpoint_template = self.templates["viewpoint_templates"][viewpoint]
284
 
285
- # 在空中視角時,確保觀察描述反映更多細節
 
 
 
 
 
 
 
 
 
 
286
  if viewpoint == "aerial":
287
  scene_elements = "the crossing patterns and pedestrian movement"
288
  else:
@@ -292,93 +304,269 @@ class EnhancedSceneDescriber:
292
  scene_elements=scene_elements
293
  )
294
 
295
- # Add viewpoint prefix if needed
296
- if not description.startswith(viewpoint_template.get("prefix", "")):
297
- description = f"{viewpoint_template.get('prefix', '')}{description}"
298
-
299
  # Add viewpoint observation if not already included
300
- if viewpoint_desc not in description:
301
- description += f" {viewpoint_desc}"
302
-
303
- # Add lighting description if available
304
- if lighting_description and lighting_description not in description:
305
- description += f" {lighting_description}"
306
 
307
  # Add information about functional zones if available
308
  if functional_zones and len(functional_zones) > 0:
309
  zones_desc = self._describe_functional_zones(functional_zones)
310
  if zones_desc:
311
- description += f" {zones_desc}"
312
 
313
- # Count the actual number of people
314
  people_count = len([obj for obj in detected_objects if obj["class_id"] == 0])
315
 
316
- # Check the description for contradictory people counts
317
  if people_count > 5:
318
- # Identify fragments that may mention a smaller people count
319
  small_people_patterns = [
320
  r"Area with \d+ people\.",
321
  r"Area with \d+ person\.",
322
  r"with \d+ people",
323
  r"with \d+ person"
324
  ]
325
- # Check each pattern and remove matches
 
326
  filtered_description = description
327
  for pattern in small_people_patterns:
328
  matches = re.findall(pattern, filtered_description)
329
  for match in matches:
330
- # Extract the people count from the match
331
  number_match = re.search(r'\d+', match)
332
  if number_match:
333
  try:
334
  people_mentioned = int(number_match.group())
335
- # If the mentioned count is less than the total, remove the whole sentence
336
  if people_mentioned < people_count:
337
- # Split the description into sentences
338
  sentences = re.split(r'(?<=[.!?])\s+', filtered_description)
339
- # Drop sentences containing the matched fragment
340
  filtered_sentences = []
341
  for sentence in sentences:
342
  if match not in sentence:
343
  filtered_sentences.append(sentence)
344
- # Reassemble the description
345
  filtered_description = " ".join(filtered_sentences)
346
  except ValueError:
347
- # Number conversion failed; keep going
348
  continue
349
 
350
- # Use the filtered description
351
  description = filtered_description
352
353
  return description
354
355
  def _is_intersection(self, detected_objects: List[Dict]) -> bool:
356
  """
357
  Determine whether the scene is an intersection by analyzing the object distribution
358
  """
359
  # Check pedestrian distribution patterns
360
  pedestrians = [obj for obj in detected_objects if obj["class_id"] == 0]
361
-
362
  if len(pedestrians) >= 8: # enough pedestrians are needed to form an intersection
363
  # Collect pedestrian positions
364
  positions = [obj.get("normalized_center", (0, 0)) for obj in pedestrians]
365
-
366
  # Analyze the x and y coordinate distributions
367
  x_coords = [pos[0] for pos in positions]
368
  y_coords = [pos[1] for pos in positions]
369
-
370
  # Compute the variance of the x and y coordinates
371
  x_variance = np.var(x_coords) if len(x_coords) > 1 else 0
372
  y_variance = np.var(y_coords) if len(y_coords) > 1 else 0
373
-
374
  # Compute the coordinate ranges
375
  x_range = max(x_coords) - min(x_coords)
376
  y_range = max(y_coords) - min(y_coords)
377
-
378
  # Large, similar ranges in both x and y suggest an intersection
379
  if x_range > 0.5 and y_range > 0.5 and 0.7 < (x_range / y_range) < 1.3:
380
  return True
381
-
382
  return False
383
 
384
  def _generate_generic_description(self, detected_objects: List[Dict], lighting_info: Optional[Dict] = None) -> str:
@@ -1165,27 +1353,27 @@ class EnhancedSceneDescriber:
1165
  Optimize object descriptions and avoid listing the same item repeatedly
1166
  """
1167
  import re
1168
-
1169
  # Handle repeated bed descriptions
1170
  if "bed in the room" in description:
1171
  description = description.replace("a bed in the room", "a bed")
1172
-
1173
  # Handle repeated item lists
1174
  # Look for patterns like "item, item, item"
1175
  object_lists = re.findall(r'with ([^\.]+?)(?:\.|\band\b)', description)
1176
-
1177
  for obj_list in object_lists:
1178
  # Count occurrences of each item
1179
  items = re.findall(r'([a-zA-Z\s]+)(?:,|\band\b|$)', obj_list)
1180
  item_counts = {}
1181
-
1182
  for item in items:
1183
  item = item.strip()
1184
  if item and item not in ["and", "with"]:
1185
  if item not in item_counts:
1186
  item_counts[item] = 0
1187
  item_counts[item] += 1
1188
-
1189
  # Generate the optimized item list
1190
  if item_counts:
1191
  new_items = []
@@ -1194,7 +1382,7 @@ class EnhancedSceneDescriber:
1194
  new_items.append(f"{count} {item}s")
1195
  else:
1196
  new_items.append(item)
1197
-
1198
  # Format the new list
1199
  if len(new_items) == 1:
1200
  new_list = new_items[0]
@@ -1202,10 +1390,10 @@ class EnhancedSceneDescriber:
1202
  new_list = f"{new_items[0]} and {new_items[1]}"
1203
  else:
1204
  new_list = ", ".join(new_items[:-1]) + f", and {new_items[-1]}"
1205
-
1206
  # Replace the original list
1207
  description = description.replace(obj_list, new_list)
1208
-
1209
  return description
1210
 
1211
  def _describe_functional_zones(self, functional_zones: Dict) -> str:
@@ -1288,7 +1476,7 @@ class EnhancedSceneDescriber:
1288
 
1289
  # Generate the final description based on the number of processed zones
1290
  final_desc = ""
1291
-
1292
  if len(processed_zones) == 1:
1293
  _, zone_info = processed_zones[0]
1294
  zone_desc = zone_info["description"]
 
126
  }
127
  }
128
 
129
+ # Cultural templates
130
  if "cultural_templates" not in templates:
131
  templates["cultural_templates"] = {
132
  "asian": {
 
164
  "elevated_threshold": 0.6, # Objects mostly in middle/bottom
165
  "elevated_top_threshold": 0.3 # Few objects at top of frame
166
  }
167
+
168
+
169
  def generate_description(self,
170
  scene_type: str,
171
  detected_objects: List[Dict],
 
190
  """
191
  # Handle unknown scene type or very low confidence
192
  if scene_type == "unknown" or confidence < 0.4:
193
+ return self._format_final_description(self._generate_generic_description(detected_objects, lighting_info))
194
 
195
  # Detect viewpoint
196
+ viewpoint = self._detect_viewpoint(detected_objects)
197
 
198
+ # Process aerial viewpoint scene types
199
  if viewpoint == "aerial":
 
200
  if "intersection" in scene_type or self._is_intersection(detected_objects):
201
  scene_type = "aerial_view_intersection"
 
202
  elif any(keyword in scene_type for keyword in ["commercial", "shopping", "retail"]):
203
  scene_type = "aerial_view_commercial_area"
 
204
  elif any(keyword in scene_type for keyword in ["plaza", "square"]):
205
  scene_type = "aerial_view_plaza"
 
206
  else:
207
  scene_type = "aerial_view_intersection"
208
 
209
+ # Detect cultural context - only for non-aerial viewpoints
210
  cultural_context = None
211
  if viewpoint != "aerial":
212
  cultural_context = self._detect_cultural_context(scene_type, detected_objects)
 
221
 
222
  # Get base description for the scene type
223
  if viewpoint == "aerial":
 
224
  if 'base_description' not in locals():
225
  base_description = "An aerial view showing the layout and movement patterns from above"
226
  elif scene_type in self.scene_types:
 
236
  viewpoint
237
  )
238
 
239
+ # Start with the base description
240
+ description = base_description
241
+
242
+ # If there's a secondary description from the scene type template, append it properly
243
+ if scene_type in self.scene_types and "secondary_description" in self.scene_types[scene_type]:
244
+ secondary_desc = self.scene_types[scene_type]["secondary_description"]
245
+ if secondary_desc:
246
+ description = self._smart_append(description, secondary_desc)
247
+
248
+ # Improve description based on people count
249
+ people_objs = [obj for obj in detected_objects if obj["class_id"] == 0] # Person class
250
  if people_objs:
251
  people_count = len(people_objs)
252
  if people_count > 5:
 
253
  people_phrase = f"numerous people ({people_count})"
254
  else:
255
  people_phrase = f"{people_count} {'people' if people_count > 1 else 'person'}"
256
 
257
+ # Add people information to the scene details if not already mentioned
258
+ if "people" not in description.lower() and "pedestrian" not in description.lower():
259
+ description = self._smart_append(description, f"The scene includes {people_phrase}")
260
 
261
+ # Apply cultural context if detected (only for non-aerial viewpoints)
262
+ if cultural_context and viewpoint != "aerial":
263
  cultural_elements = self._generate_cultural_elements(cultural_context)
264
  if cultural_elements:
265
+ description = self._smart_append(description, cultural_elements)
266
+
267
+ # Now append the detailed scene information if available
268
+ if scene_details:
269
+ # Use smart_append to ensure proper formatting between base description and details
270
+ description = self._smart_append(description, scene_details)
271
 
272
  # Include lighting information if available
273
  lighting_description = ""
 
276
  if lighting_type in self.templates.get("lighting_templates", {}):
277
  lighting_description = self.templates["lighting_templates"][lighting_type]
278
 
279
+ # Add lighting description if available
280
+ if lighting_description and lighting_description not in description:
281
+ description = self._smart_append(description, lighting_description)
 
282
 
283
+ # Process viewpoint information
284
  if viewpoint != "eye_level" and viewpoint in self.templates.get("viewpoint_templates", {}):
285
  viewpoint_template = self.templates["viewpoint_templates"][viewpoint]
286
 
287
+ # Special handling for viewpoint prefix
288
+ prefix = viewpoint_template.get('prefix', '')
289
+ if prefix and not description.startswith(prefix):
290
+ # Prefix is a phrase like "From above, " that should precede the description
291
+ if description and description[0].isupper():
292
+ # Maintain the flow by lowercasing the first letter after the prefix
293
+ description = prefix + description[0].lower() + description[1:]
294
+ else:
295
+ description = prefix + description
296
+
297
+ # Get appropriate scene elements description based on viewpoint
298
  if viewpoint == "aerial":
299
  scene_elements = "the crossing patterns and pedestrian movement"
300
  else:
 
304
  scene_elements=scene_elements
305
  )
306
 
 
 
 
 
307
  # Add viewpoint observation if not already included
308
+ if viewpoint_desc and viewpoint_desc not in description:
309
+ description = self._smart_append(description, viewpoint_desc)
 
 
 
 
310
 
311
  # Add information about functional zones if available
312
  if functional_zones and len(functional_zones) > 0:
313
  zones_desc = self._describe_functional_zones(functional_zones)
314
  if zones_desc:
315
+ description = self._smart_append(description, zones_desc)
316
 
317
+ # Calculate actual people count
318
  people_count = len([obj for obj in detected_objects if obj["class_id"] == 0])
319
 
320
+ # Check for inconsistencies in people count descriptions
321
  if people_count > 5:
322
+ # Identify fragments that might contain smaller people counts
323
  small_people_patterns = [
324
  r"Area with \d+ people\.",
325
  r"Area with \d+ person\.",
326
  r"with \d+ people",
327
  r"with \d+ person"
328
  ]
329
+
330
+ # Check and remove each pattern
331
  filtered_description = description
332
  for pattern in small_people_patterns:
333
  matches = re.findall(pattern, filtered_description)
334
  for match in matches:
335
+ # Extract the number from the match
336
  number_match = re.search(r'\d+', match)
337
  if number_match:
338
  try:
339
  people_mentioned = int(number_match.group())
340
+ # If the mentioned count is less than total, remove the entire sentence
341
  if people_mentioned < people_count:
342
+ # Split description into sentences
343
  sentences = re.split(r'(?<=[.!?])\s+', filtered_description)
344
+ # Remove sentences containing the match
345
  filtered_sentences = []
346
  for sentence in sentences:
347
  if match not in sentence:
348
  filtered_sentences.append(sentence)
349
+ # Recombine the description
350
  filtered_description = " ".join(filtered_sentences)
351
  except ValueError:
352
+ # Failed number conversion, continue processing
353
  continue
354
 
355
+ # Use the filtered description
356
  description = filtered_description
357
 
358
+ # Final formatting to ensure correct punctuation and capitalization
359
+ description = self._format_final_description(description)
360
+
361
  return description
362
 
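As a concrete illustration of the count-consistency pass above: with people_count = 12, a zone fragment such as "Area with 3 people." matches the first pattern, 3 is less than 12, so the whole containing sentence is dropped from the description, while fragments that mention 12 or more people are kept.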
363
+ def _smart_append(self, current_text: str, new_fragment: str) -> str:
364
+ """
365
+ Intelligently append a new text fragment to the current text,
366
+ handling punctuation and capitalization correctly.
367
+
368
+ Args:
369
+ current_text: The existing text to append to
370
+ new_fragment: The new text fragment to append
371
+
372
+ Returns:
373
+ str: The combined text with proper formatting
374
+ """
375
+ # Handle empty cases
376
+ if not new_fragment:
377
+ return current_text
378
+
379
+ if not current_text:
380
+ # Ensure first character is uppercase for the first fragment
381
+ return new_fragment[0].upper() + new_fragment[1:] if new_fragment else ""
382
+
383
+ # Clean up existing text
384
+ current_text = current_text.rstrip()
385
+
386
+ # Check for ending punctuation
387
+ ends_with_sentence = current_text.endswith(('.', '!', '?'))
388
+ ends_with_comma = current_text.endswith(',')
389
+
390
+ # Specifically handle the "A xxx A yyy" pattern that's causing issues
391
+ if (current_text.startswith("A ") or current_text.startswith("An ")) and \
392
+ (new_fragment.startswith("A ") or new_fragment.startswith("An ")):
393
+ return current_text + ". " + new_fragment
394
+
395
+ # Decide how to join the texts
396
+ if ends_with_sentence:
397
+ # After a sentence, start with uppercase and add proper spacing
398
+ joined_text = current_text + " " + (new_fragment[0].upper() + new_fragment[1:])
399
+ elif ends_with_comma:
400
+ # After a comma, maintain flow with lowercase unless it's a proper noun or special case
401
+ if new_fragment.startswith(('I ', 'I\'', 'A ', 'An ', 'The ')) or new_fragment[0].isupper():
402
+ joined_text = current_text + " " + new_fragment
403
+ else:
404
+ joined_text = current_text + " " + new_fragment[0].lower() + new_fragment[1:]
405
+ elif "scene is" in new_fragment.lower() or "scene includes" in new_fragment.lower():
406
+ # When adding a new sentence about the scene, use a period
407
+ joined_text = current_text + ". " + new_fragment
408
+ else:
409
+ # For other cases, decide based on the content
410
+ if self._is_related_phrases(current_text, new_fragment):
411
+ if new_fragment.startswith(('I ', 'I\'', 'A ', 'An ', 'The ')) or new_fragment[0].isupper():
412
+ joined_text = current_text + ", " + new_fragment
413
+ else:
414
+ joined_text = current_text + ", " + new_fragment[0].lower() + new_fragment[1:]
415
+ else:
416
+ # Use period for unrelated phrases
417
+ joined_text = current_text + ". " + (new_fragment[0].upper() + new_fragment[1:])
418
+
419
+ return joined_text
420
+
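A quick sketch of how these joining rules behave in practice (illustrative only; `describer` stands for any EnhancedSceneDescriber instance):

    text = describer._smart_append("A busy urban intersection", "with numerous pedestrians")
    # -> "A busy urban intersection, with numerous pedestrians"   (connecting word "with" -> comma join)
    text = describer._smart_append(text, "The scene includes 12 people")
    # -> "...numerous pedestrians. The scene includes 12 people"  ("scene includes" forces a new sentence)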
421
+ def _is_related_phrases(self, text1: str, text2: str) -> bool:
422
+ """
423
+ Determine if two phrases are related and should be connected with a comma
424
+ rather than separated with a period.
425
+
426
+ Args:
427
+ text1: The first text fragment
428
+ text2: The second text fragment to be appended
429
+
430
+ Returns:
431
+ bool: Whether the phrases appear to be related
432
+ """
433
+ # Check if either phrase starts with "A" or "An" - these are likely separate descriptions
434
+ if (text1.startswith("A ") or text1.startswith("An ")) and \
435
+ (text2.startswith("A ") or text2.startswith("An ")):
436
+ return False # These are separate descriptions, not related phrases
437
+
438
+ # Check if the second phrase starts with a connecting word
439
+ connecting_words = ["which", "where", "who", "whom", "whose", "with", "without",
440
+ "this", "these", "that", "those", "and", "or", "but"]
441
+
442
+ first_word = text2.split()[0].lower() if text2 else ""
443
+ if first_word in connecting_words:
444
+ return True
445
+
446
+ # Check if the first phrase ends with something that suggests continuity
447
+ ending_patterns = ["such as", "including", "like", "especially", "particularly",
448
+ "for example", "for instance", "namely", "specifically"]
449
+
450
+ for pattern in ending_patterns:
451
+ if text1.lower().endswith(pattern):
452
+ return True
453
+
454
+ # Check if both phrases are about the scene
455
+ if "scene" in text1.lower() and "scene" in text2.lower():
456
+ return False # Separate statements about the scene should be separate sentences
457
+
458
+ return False
459
+
460
+ def _format_final_description(self, text: str) -> str:
461
+ """
462
+ Format the final description text to ensure correct punctuation,
463
+ capitalization, and spacing.
464
+
465
+ Args:
466
+ text: The text to format
467
+
468
+ Returns:
469
+ str: The properly formatted text
470
+ """
471
+ import re
472
+
473
+ if not text:
474
+ return ""
475
+
476
+ # 1. Special handling for consecutive fragments starting with "A" (a common issue)
477
+ text = re.sub(r'(A\s[^.!?]+?)\s+(A\s)', r'\1. \2', text, flags=re.IGNORECASE)
478
+ text = re.sub(r'(An\s[^.!?]+?)\s+(An?\s)', r'\1. \2', text, flags=re.IGNORECASE)
479
+
480
+ # 2. Ensure the first letter is capitalized
481
+ text = text[0].upper() + text[1:] if text else ""
482
+
483
+ # 3. Fix spacing problems between words
484
+ text = re.sub(r'\s{2,}', ' ', text) # Collapse multiple spaces into one
485
+ text = re.sub(r'([a-z])([A-Z])', r'\1 \2', text) # Insert a space between a lowercase letter and a following uppercase letter
486
+
487
+ # 4. Fix word-concatenation problems
488
+ text = re.sub(r'([a-zA-Z])and', r'\1 and', text) # Add a space before "and"
489
+ text = re.sub(r'([a-zA-Z])with', r'\1 with', text) # Add a space before "with"
490
+ text = re.sub(r'plants(and|with|or)', r'plants \1', text) # Fix cases like "plantsand"
491
+
492
+ # 5. Fix capitalization after punctuation
493
+ text = re.sub(r'\.(\s+)([a-z])', lambda m: f'.{m.group(1)}{m.group(2).upper()}', text) # Capitalize after a period
494
+
495
+ # 6. Fix uppercase words that follow a comma
496
+ def fix_capitalization_after_comma(match):
497
+ word = match.group(2)
498
+ # Exceptions: keep capitalization for proper nouns, personal pronouns, etc.
499
+ if word in ["I", "I'm", "I've", "I'd", "I'll"]:
500
+ return match.group(0) # Keep as-is
501
+
502
+ # Keep capitalization for proper nouns such as months, weekdays, and place names
503
+ proper_nouns = ["January", "February", "March", "April", "May", "June", "July",
504
+ "August", "September", "October", "November", "December",
505
+ "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
506
+ if word in proper_nouns:
507
+ return match.group(0) # Keep as-is
508
+
509
+ # Otherwise: lowercase the first letter
510
+ return match.group(1) + word[0].lower() + word[1:]
511
+
512
+ # Match a comma followed by a space and an uppercase word
513
+ text = re.sub(r'(,\s+)([A-Z][a-zA-Z]*)', fix_capitalization_after_comma, text)
514
+
515
+
516
+ common_phrases = [
517
+ (r'Social or seating area', r'social or seating area'),
518
+ (r'Sleeping area', r'sleeping area'),
519
+ (r'Dining area', r'dining area'),
520
+ (r'Living space', r'living space')
521
+ ]
522
+
523
+ for phrase, replacement in common_phrases:
524
+ # Only adjust the term mid-sentence; keep sentence-initial capitalization
525
+ text = re.sub(r'(?<=[.!?]\s)' + phrase, replacement, text)
526
+ # Adjust the term after a comma while keeping sentence-initial capitalization
527
+ text = re.sub(r'(?<=,\s)' + phrase, replacement, text)
528
+
529
+ # 7. Ensure proper spacing around punctuation
530
+ text = re.sub(r'\s+([.,;:!?])', r'\1', text) # No space before punctuation
531
+ text = re.sub(r'([.,;:!?])([a-zA-Z0-9])', r'\1 \2', text) # Require a space after punctuation
532
+
533
+ # 8. Fix repeated punctuation
534
+ text = re.sub(r'\.{2,}', '.', text) # Collapse multiple periods into one
535
+ text = re.sub(r',{2,}', ',', text) # Collapse multiple commas into one
536
+
537
+ # 9. Ensure the text ends with punctuation
538
+ if text and not text[-1] in '.!?':
539
+ text += '.'
540
+
541
+ return text
542
+
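To make the rules above concrete, a sketch of the expected behavior: an input like "A busy street A large crowd gathers, Many vehicles wait" should come out as "A busy street. A large crowd gathers, many vehicles wait." Rule 1 splits the consecutive "A ..." fragments into separate sentences, rule 6 lowercases "Many" after the comma, and rule 9 appends the final period.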
543
  def _is_intersection(self, detected_objects: List[Dict]) -> bool:
544
  """
545
  Determine whether the scene is an intersection by analyzing the distribution of objects
546
  """
547
  # Check pedestrian distribution patterns
548
  pedestrians = [obj for obj in detected_objects if obj["class_id"] == 0]
549
+
550
  if len(pedestrians) >= 8: # Enough pedestrians are needed to form an intersection
551
  # Collect pedestrian positions
552
  positions = [obj.get("normalized_center", (0, 0)) for obj in pedestrians]
553
+
554
  # Analyze the x and y coordinate distributions
555
  x_coords = [pos[0] for pos in positions]
556
  y_coords = [pos[1] for pos in positions]
557
+
558
  # Compute the variance of the x and y coordinates
559
  x_variance = np.var(x_coords) if len(x_coords) > 1 else 0
560
  y_variance = np.var(y_coords) if len(y_coords) > 1 else 0
561
+
562
  # Compute the coordinate ranges
563
  x_range = max(x_coords) - min(x_coords)
564
  y_range = max(y_coords) - min(y_coords)
565
+
566
  # If both x and y spans are large and similar in size, the scene is likely an intersection
567
  if x_range > 0.5 and y_range > 0.5 and 0.7 < (x_range / y_range) < 1.3:
568
  return True
569
+
570
  return False
571
 
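To make these thresholds concrete: eight pedestrians whose normalized centers span x in [0.1, 0.9] and y in [0.15, 0.85] give x_range = 0.8, y_range = 0.7 and a ratio of about 1.14, which falls inside (0.7, 1.3), so the scene counts as an intersection; the same crowd strung along one sidewalk (y_range around 0.1) does not.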
572
  def _generate_generic_description(self, detected_objects: List[Dict], lighting_info: Optional[Dict] = None) -> str:
 
1353
  Optimize object descriptions to avoid listing the same item repeatedly
1354
  """
1355
  import re
1356
+
1357
  # Handle repeated bed descriptions
1358
  if "bed in the room" in description:
1359
  description = description.replace("a bed in the room", "a bed")
1360
+
1361
  # Handle repeated item lists
1362
  # Look for patterns of the form "item, item, item"
1363
  object_lists = re.findall(r'with ([^\.]+?)(?:\.|\band\b)', description)
1364
+
1365
  for obj_list in object_lists:
1366
  # Count how many times each item appears
1367
  items = re.findall(r'([a-zA-Z\s]+)(?:,|\band\b|$)', obj_list)
1368
  item_counts = {}
1369
+
1370
  for item in items:
1371
  item = item.strip()
1372
  if item and item not in ["and", "with"]:
1373
  if item not in item_counts:
1374
  item_counts[item] = 0
1375
  item_counts[item] += 1
1376
+
1377
  # Generate the optimized item list
1378
  if item_counts:
1379
  new_items = []
 
1382
  new_items.append(f"{count} {item}s")
1383
  else:
1384
  new_items.append(item)
1385
+
1386
  # Format the new list
1387
  if len(new_items) == 1:
1388
  new_list = new_items[0]
 
1390
  new_list = f"{new_items[0]} and {new_items[1]}"
1391
  else:
1392
  new_list = ", ".join(new_items[:-1]) + f", and {new_items[-1]}"
1393
+
1394
  # Replace the original list
1395
  description = description.replace(obj_list, new_list)
1396
+
1397
  return description
1398
 
1399
  def _describe_functional_zones(self, functional_zones: Dict) -> str:
 
1476
 
1477
  # Generate the final description based on the number of processed zones
1478
  final_desc = ""
1479
+
1480
  if len(processed_zones) == 1:
1481
  _, zone_info = processed_zones[0]
1482
  zone_desc = zone_info["description"]
lighting_analyzer.py CHANGED
@@ -151,11 +151,11 @@ class LightingAnalyzer:
151
 
152
  avg_saturation = np.mean(s_channel)
153
 
154
- # Sky brightness
155
  upper_half = v_channel[:height//2, :]
156
  sky_brightness = np.mean(upper_half)
157
 
158
- # Color tone analysis
159
  warm_colors = ((h_channel >= 0) & (h_channel <= 60)) | (h_channel >= 300)
160
  warm_ratio = np.sum(warm_colors) / (height * width)
161
 
@@ -186,16 +186,16 @@ class LightingAnalyzer:
186
  top_scale = scale_factor * 2 # More aggressive downsampling
187
  top_region = v_channel[:height//4:top_scale, ::top_scale]
188
  top_region_std = np.std(top_region)
189
- ceiling_uniformity = 1.0 - min(1.0, top_region_std / max(np.mean(top_region), 1e-5))
190
 
191
  # Use a simpler method to detect horizontal lines in the upper region
192
  top_gradients = np.abs(gy[:small_gray.shape[0]//4, :])
193
  horizontal_lines_strength = np.mean(top_gradients)
194
  # Normalize
195
- horizontal_line_ratio = min(1.0, horizontal_lines_strength / 40)
196
 
197
  # Minimal bright-spot detection
198
- sampled_v = v_channel[::scale_factor*2, ::scale_factor*2]
199
  light_threshold = min(220, avg_brightness + 2*brightness_std)
200
  is_bright = sampled_v > light_threshold
201
  bright_spot_count = np.sum(is_bright)
@@ -203,7 +203,7 @@ class LightingAnalyzer:
203
  # Simplified alternative to circular light-source analysis
204
  circular_light_score = 0
205
  indoor_light_score = 0
206
- light_distribution_uniformity = 0.5
207
 
208
  # Only analyze light sources when some bright spots are detected, but not too many (which may be outdoor light reflections)
209
  if 1 < bright_spot_count < 20:
@@ -227,7 +227,7 @@ class LightingAnalyzer:
227
  indoor_light_score = 0.3
228
 
229
  # Use edge-region gradients to quickly estimate boundaries
230
- edge_scale = scale_factor * 2
231
 
232
  # Sample only the edge regions of the image for analysis
233
  left_edge = small_gray[:, :small_gray.shape[1]//6]
@@ -240,15 +240,15 @@ class LightingAnalyzer:
240
  top_gradient = np.mean(np.abs(cv2.Sobel(top_edge, cv2.CV_32F, 0, 1, ksize=3)))
241
 
242
  # Normalize
243
- left_edge_density = min(1.0, left_gradient / 50.0)
244
- right_edge_density = min(1.0, right_gradient / 50.0)
245
- top_edge_density = min(1.0, top_gradient / 50.0)
246
 
247
  # Enclosed environments usually show stronger gradients at the image edges
248
  boundary_edge_score = (left_edge_density + right_edge_density + top_edge_density) / 3
249
 
250
  # Roughly estimate overall edge density
251
- edges_density = min(1.0, (np.mean(np.abs(gx)) + np.mean(np.abs(gy))) / 100.0)
252
 
253
  street_line_score = 0
254
 
@@ -319,16 +319,16 @@ class LightingAnalyzer:
319
  # 1. Blue-region (sky) features - a large blue area usually indicates outdoors
320
  if features.get("blue_ratio", 0) > 0.2:
321
  # Check for indoor indicators; clear indoor features reduce the negative influence of blue
322
- if (features.get("ceiling_uniformity", 0) > 0.5 or
323
- features.get("boundary_edge_score", 0) > 0.3 or
324
  features.get("indoor_light_score", 0) > 0.2 or
325
  features.get("bright_spot_count", 0) > 0):
326
- blue_score = -weights["blue_ratio"] * features["blue_ratio"] * 8
327
  else:
328
  blue_score = -weights["blue_ratio"] * features["blue_ratio"] * 15
329
  else:
330
  blue_score = -weights["blue_ratio"] * features["blue_ratio"] * 15
331
-
332
  indoor_score += blue_score
333
  feature_contributions["blue_ratio"] = blue_score
334
 
@@ -351,14 +351,14 @@ class LightingAnalyzer:
351
  horizontal_line_ratio = features.get("horizontal_line_ratio", 0)
352
 
353
  # Strengthen the influence of ceiling detection
354
- if ceiling_uniformity > 0.5:
355
- ceiling_weight = 3
356
- ceiling_contribution = weights.get("ceiling_features", 1.5) * ceiling_weight
357
  if horizontal_line_ratio > 0.2: # Boost further when horizontal lines are present
358
- ceiling_contribution *= 1.5
359
- elif ceiling_uniformity > 0.4:
360
- ceiling_contribution = weights.get("ceiling_features", 1.5) * 1.2
361
-
362
  indoor_score += ceiling_contribution
363
  feature_contributions["ceiling_features"] = ceiling_contribution
364
 
@@ -370,7 +370,7 @@ class LightingAnalyzer:
370
 
371
  # Strengthen detection of specific light-source types
372
  if circular_light_count >= 1: # Even a single circular light source strongly suggests indoors
373
- light_contribution = weights.get("light_features", 1.2) * 2.0
374
  elif indoor_light_score > 0.3:
375
  light_contribution = weights.get("light_features", 1.2) * 1.0
376
 
@@ -384,11 +384,11 @@ class LightingAnalyzer:
384
  edges_density = features.get("edges_density", 0)
385
 
386
  # A high boundary score suggests an enclosed (indoor) environment
387
- if boundary_edge_score > 0.3:
388
- boundary_contribution = weights.get("boundary_features", 1.2) * 2
389
- elif boundary_edge_score > 0.2:
390
- boundary_contribution = weights.get("boundary_features", 1.2) * 1.2
391
-
392
  indoor_score += boundary_contribution
393
  feature_contributions["boundary_features"] = boundary_contribution
394
 
@@ -415,7 +415,7 @@ class LightingAnalyzer:
415
  combined_uniformity = (features["brightness_uniformity"] +
416
  features.get("ceiling_uniformity", 0)) / 2
417
 
418
- if combined_uniformity > 0.5:
419
  gradient_contribution = weights["gradient_ratio"] * 0.7
420
  else:
421
  gradient_contribution = -weights["gradient_ratio"] * 0.3
@@ -430,7 +430,7 @@ class LightingAnalyzer:
430
 
431
  # Adjust the bright-spot analysis logic
432
  if circular_light_count >= 1: # Even with just one circular light source
433
- bright_spot_contribution = weights["bright_spots"] * 1.5
434
  elif bright_spot_count < 5: # Moderately relaxed threshold
435
  bright_spot_contribution = weights["bright_spots"] * 0.5
436
  elif bright_spot_count > 15: # Many bright spots more likely indicate outdoors
@@ -441,8 +441,8 @@ class LightingAnalyzer:
441
 
442
  # 8. Color tone analysis
443
  yellow_contribution = 0
444
- if features["avg_brightness"] < 150 and features["yellow_orange_ratio"] > 0.15:
445
- if features.get("indoor_light_score", 0) > 0.2:
446
  yellow_contribution = weights["color_tone"] * 0.8
447
  else:
448
  yellow_contribution = weights["color_tone"] * 0.5
@@ -452,10 +452,10 @@ class LightingAnalyzer:
452
 
453
  if features.get("blue_ratio", 0) > 0.7:
454
  # Check for indoor indicators; clear indoor features reduce the negative influence of blue
455
- if (features.get("ceiling_uniformity", 0) > 0.6 or
456
- features.get("boundary_edge_score", 0) > 0.3 or
457
  features.get("indoor_light_score", 0) > 0):
458
- blue_score = -weights["blue_ratio"] * features["blue_ratio"] * 10
459
  else:
460
  blue_score = -weights["blue_ratio"] * features["blue_ratio"] * 18
461
  else:
@@ -534,19 +534,19 @@ class LightingAnalyzer:
534
  # 1: Right angles formed by windows and walls
535
  if features.get("brightness_uniformity", 0) > 0.6 and features.get("boundary_edge_score", 0) > 0.3:
536
  bedroom_indicators += 1.5 # Increase the weight
537
-
538
  # 2: Ceiling and light sources
539
  if features.get("ceiling_uniformity", 0) > 0.5 and features.get("bright_spot_count", 0) > 0:
540
- bedroom_indicators += 2.5
541
-
542
  # 3: Well-contrasted wall colors, typical of bedrooms and living rooms
543
  if features.get("brightness_uniformity", 0) > 0.6 and features.get("avg_saturation", 0) < 100:
544
- bedroom_indicators += 1.5
545
-
546
  # Special check 4: window detection
547
  if features.get("boundary_edge_score", 0) > 0.25 and features.get("brightness_std", 0) > 40:
548
- bedroom_indicators += 1.5
549
-
550
  # If enough home indicators are met, raise the indoor score further
551
  if bedroom_indicators >= 3:
552
  # Increase the home-environment score
@@ -576,11 +576,11 @@ class LightingAnalyzer:
576
  def _determine_lighting_conditions(self, features, is_indoor):
577
  """
578
  Determine lighting conditions based on the features and the indoor/outdoor judgment.
579
-
580
  Args:
581
  features: feature dictionary
582
  is_indoor: whether the environment is indoors
583
-
584
  Returns:
585
  Dict: lighting condition analysis results
586
  """
@@ -588,37 +588,37 @@ class LightingAnalyzer:
588
  time_of_day = "unknown"
589
  confidence = 0.5
590
  diagnostics = {}
591
-
592
  avg_brightness = features["avg_brightness"]
593
  dark_pixel_ratio = features["dark_pixel_ratio"]
594
  yellow_orange_ratio = features["yellow_orange_ratio"]
595
  blue_ratio = features["blue_ratio"]
596
  gray_ratio = features["gray_ratio"]
597
-
598
  # Branch on the indoor/outdoor judgment
599
  if is_indoor:
600
  # Compute natural-light indicators for indoor residential scenes
601
  natural_window_light = 0
602
-
603
  # Check window features and light characteristics
604
- if (features.get("blue_ratio", 0) > 0.1 and
605
  features.get("sky_brightness", 0) > avg_brightness * 1.1):
606
  natural_window_light += 1
607
-
608
  # Check for an even, soft light distribution
609
- if (features.get("brightness_uniformity", 0) > 0.65 and
610
  features.get("brightness_std", 0) < 70):
611
  natural_window_light += 1
612
-
613
  # Check the warm-tone ratio
614
  if features.get("warm_ratio", 0) > 0.2:
615
  natural_window_light += 1
616
-
617
  # Home-environment indicators
618
  home_env_score = features.get("home_environment_pattern", 0)
619
  if home_env_score > 1.5:
620
  natural_window_light += 1
621
-
622
  # 1. Bright indoor environment, possibly with natural window light
623
  if avg_brightness > 130:
624
  # Detect naturally lit residential spaces - a newly added type!
@@ -645,7 +645,7 @@ class LightingAnalyzer:
645
  time_of_day = "indoor_dim"
646
  confidence = 0.65 + dark_pixel_ratio / 3
647
  diagnostics["reason"] = "Low brightness in indoor environment"
648
-
649
  # 1. Detect designer-style residences; this covers a wider variety of cases
650
  designer_residential_score = 0
651
  # Detect distinctive light fixtures
@@ -660,19 +660,19 @@ class LightingAnalyzer:
660
  # Detect home-environment features
661
  if home_env_score > 1.5:
662
  designer_residential_score += 1
663
-
664
  if designer_residential_score >= 3 and home_env_score > 1.5:
665
- time_of_day = "indoor_designer_residential"
666
  confidence = 0.85
667
  diagnostics["special_case"] = "Designer residential lighting with decorative elements"
668
-
669
  # 2. Detect restaurant/bar scenes
670
  elif avg_brightness < 150 and yellow_orange_ratio > 0.2:
671
  if features["warm_ratio"] > 0.4:
672
  time_of_day = "indoor_restaurant"
673
  confidence = 0.65 + yellow_orange_ratio / 4
674
  diagnostics["special_case"] = "Warm, yellow-orange lighting suggests restaurant/bar setting"
675
-
676
  # 3. Detect commercially lit spaces
677
  elif avg_brightness > 120 and features["bright_spot_count"] > 4:
678
  # Improve the precision of commercial-lighting classification
@@ -685,7 +685,7 @@ class LightingAnalyzer:
685
  # Structured overall lighting layout
686
  if features.get("light_distribution_uniformity", 0) > 0.6:
687
  commercial_score += 0.5
688
-
689
  if commercial_score > 0.6 and designer_residential_score < 3:
690
  time_of_day = "indoor_commercial"
691
  confidence = 0.7 + commercial_score / 5
@@ -794,18 +794,18 @@ class LightingAnalyzer:
794
  """
795
  return {
796
  "indoor_outdoor_weights": {
797
- "blue_ratio": 0.6,
798
- "brightness_uniformity": 1.2,
799
- "gradient_ratio": 0.7,
800
- "bright_spots": 0.8,
801
- "color_tone": 0.5,
802
- "sky_brightness": 0.9,
803
- "brightness_variation": 0.7,
804
- "ceiling_features": 1.5,
805
- "light_features": 1.1,
806
- "boundary_features": 2.8,
807
- "street_features": 2.0,
808
- "building_features": 1.6
809
  },
810
  "include_diagnostics": True
811
  }
 
151
 
152
  avg_saturation = np.mean(s_channel)
153
 
154
+ # Sky brightness
155
  upper_half = v_channel[:height//2, :]
156
  sky_brightness = np.mean(upper_half)
157
 
158
+ # Color tone analysis
159
  warm_colors = ((h_channel >= 0) & (h_channel <= 60)) | (h_channel >= 300)
160
  warm_ratio = np.sum(warm_colors) / (height * width)
161
 
 
186
  top_scale = scale_factor * 2 # More aggressive downsampling
187
  top_region = v_channel[:height//4:top_scale, ::top_scale]
188
  top_region_std = np.std(top_region)
189
+ ceiling_uniformity = 1.0 - min(1, top_region_std / max(np.mean(top_region), 1e-5))
190
 
191
  # Use a simpler method to detect horizontal lines in the upper region
192
  top_gradients = np.abs(gy[:small_gray.shape[0]//4, :])
193
  horizontal_lines_strength = np.mean(top_gradients)
194
  # Normalize
195
+ horizontal_line_ratio = min(1, horizontal_lines_strength / 40)
196
 
197
  # Minimal bright-spot detection
198
+ sampled_v = v_channel[::scale_factor*2, ::scale_factor*2]
199
  light_threshold = min(220, avg_brightness + 2*brightness_std)
200
  is_bright = sampled_v > light_threshold
201
  bright_spot_count = np.sum(is_bright)
 
203
  # Simplified alternative to circular light-source analysis
204
  circular_light_score = 0
205
  indoor_light_score = 0
206
+ light_distribution_uniformity = 0.5
207
 
208
  # Only analyze light sources when some bright spots are detected, but not too many (which may be outdoor light reflections)
209
  if 1 < bright_spot_count < 20:
 
227
  indoor_light_score = 0.3
228
 
229
  # Use edge-region gradients to quickly estimate boundaries
230
+ edge_scale = scale_factor * 2
231
 
232
  # Sample only the edge regions of the image for analysis
233
  left_edge = small_gray[:, :small_gray.shape[1]//6]
 
240
  top_gradient = np.mean(np.abs(cv2.Sobel(top_edge, cv2.CV_32F, 0, 1, ksize=3)))
241
 
242
  # Normalize
243
+ left_edge_density = min(1.0, left_gradient / 50)
244
+ right_edge_density = min(1.0, right_gradient / 50)
245
+ top_edge_density = min(1.0, top_gradient / 50)
246
 
247
  # Enclosed environments usually show stronger gradients at the image edges
248
  boundary_edge_score = (left_edge_density + right_edge_density + top_edge_density) / 3
249
 
250
  # Roughly estimate overall edge density
251
+ edges_density = min(1, (np.mean(np.abs(gx)) + np.mean(np.abs(gy))) / 100)
252
 
253
  street_line_score = 0
254
 
 
319
  # 1. Blue-region (sky) features - a large blue area usually indicates outdoors
320
  if features.get("blue_ratio", 0) > 0.2:
321
  # Check for indoor indicators; clear indoor features reduce the negative influence of blue
322
+ if (features.get("ceiling_uniformity", 0) > 0.5 or
323
+ features.get("boundary_edge_score", 0) > 0.3 or
324
  features.get("indoor_light_score", 0) > 0.2 or
325
  features.get("bright_spot_count", 0) > 0):
326
+ blue_score = -weights["blue_ratio"] * features["blue_ratio"] * 8
327
  else:
328
  blue_score = -weights["blue_ratio"] * features["blue_ratio"] * 15
329
  else:
330
  blue_score = -weights["blue_ratio"] * features["blue_ratio"] * 15
331
+
332
  indoor_score += blue_score
333
  feature_contributions["blue_ratio"] = blue_score
334
 
 
351
  horizontal_line_ratio = features.get("horizontal_line_ratio", 0)
352
 
353
  # Strengthen the influence of ceiling detection
354
+ if ceiling_uniformity > 0.5:
355
+ ceiling_weight = 3
356
+ ceiling_contribution = weights.get("ceiling_features", 1.5) * ceiling_weight
357
  if horizontal_line_ratio > 0.2: # Boost further when horizontal lines are present
358
+ ceiling_contribution *= 1.5
359
+ elif ceiling_uniformity > 0.4:
360
+ ceiling_contribution = weights.get("ceiling_features", 1.5) * 1.2
361
+
362
  indoor_score += ceiling_contribution
363
  feature_contributions["ceiling_features"] = ceiling_contribution
364
 
 
370
 
371
  # Strengthen detection of specific light-source types
372
  if circular_light_count >= 1: # Even a single circular light source strongly suggests indoors
373
+ light_contribution = weights.get("light_features", 1.2) * 2.0
374
  elif indoor_light_score > 0.3:
375
  light_contribution = weights.get("light_features", 1.2) * 1.0
376
 
 
384
  edges_density = features.get("edges_density", 0)
385
 
386
  # A high boundary score suggests an enclosed (indoor) environment
387
+ if boundary_edge_score > 0.3:
388
+ boundary_contribution = weights.get("boundary_features", 1.2) * 2
389
+ elif boundary_edge_score > 0.2:
390
+ boundary_contribution = weights.get("boundary_features", 1.2) * 1.2
391
+
392
  indoor_score += boundary_contribution
393
  feature_contributions["boundary_features"] = boundary_contribution
394
 
 
415
  combined_uniformity = (features["brightness_uniformity"] +
416
  features.get("ceiling_uniformity", 0)) / 2
417
 
418
+ if combined_uniformity > 0.5:
419
  gradient_contribution = weights["gradient_ratio"] * 0.7
420
  else:
421
  gradient_contribution = -weights["gradient_ratio"] * 0.3
 
430
 
431
  # Adjust the bright-spot analysis logic
432
  if circular_light_count >= 1: # Even with just one circular light source
433
+ bright_spot_contribution = weights["bright_spots"] * 1.5
434
  elif bright_spot_count < 5: # Moderately relaxed threshold
435
  bright_spot_contribution = weights["bright_spots"] * 0.5
436
  elif bright_spot_count > 15: # Many bright spots more likely indicate outdoors
 
441
 
442
  # 8. Color tone analysis
443
  yellow_contribution = 0
444
+ if features["avg_brightness"] < 150 and features["yellow_orange_ratio"] > 0.15:
445
+ if features.get("indoor_light_score", 0) > 0.2:
446
  yellow_contribution = weights["color_tone"] * 0.8
447
  else:
448
  yellow_contribution = weights["color_tone"] * 0.5
 
452
 
453
  if features.get("blue_ratio", 0) > 0.7:
454
  # Check for indoor indicators; clear indoor features reduce the negative influence of blue
455
+ if (features.get("ceiling_uniformity", 0) > 0.6 or
456
+ features.get("boundary_edge_score", 0) > 0.3 or
457
  features.get("indoor_light_score", 0) > 0):
458
+ blue_score = -weights["blue_ratio"] * features["blue_ratio"] * 10
459
  else:
460
  blue_score = -weights["blue_ratio"] * features["blue_ratio"] * 18
461
  else:
 
534
  # 1: Right angles formed by windows and walls
535
  if features.get("brightness_uniformity", 0) > 0.6 and features.get("boundary_edge_score", 0) > 0.3:
536
  bedroom_indicators += 1.5 # Increase the weight
537
+
538
  # 2: Ceiling and light sources
539
  if features.get("ceiling_uniformity", 0) > 0.5 and features.get("bright_spot_count", 0) > 0:
540
+ bedroom_indicators += 2.5
541
+
542
  # 3: Well-contrasted wall colors, typical of bedrooms and living rooms
543
  if features.get("brightness_uniformity", 0) > 0.6 and features.get("avg_saturation", 0) < 100:
544
+ bedroom_indicators += 1.5
545
+
546
  # Special check 4: window detection
547
  if features.get("boundary_edge_score", 0) > 0.25 and features.get("brightness_std", 0) > 40:
548
+ bedroom_indicators += 1.5
549
+
550
  # If enough home indicators are met, raise the indoor score further
551
  if bedroom_indicators >= 3:
552
  # Increase the home-environment score
 
576
  def _determine_lighting_conditions(self, features, is_indoor):
577
  """
578
  Determine lighting conditions based on the features and the indoor/outdoor judgment.
579
+
580
  Args:
581
  features: feature dictionary
582
  is_indoor: whether the environment is indoors
583
+
584
  Returns:
585
  Dict: lighting condition analysis results
586
  """
 
588
  time_of_day = "unknown"
589
  confidence = 0.5
590
  diagnostics = {}
591
+
592
  avg_brightness = features["avg_brightness"]
593
  dark_pixel_ratio = features["dark_pixel_ratio"]
594
  yellow_orange_ratio = features["yellow_orange_ratio"]
595
  blue_ratio = features["blue_ratio"]
596
  gray_ratio = features["gray_ratio"]
597
+
598
  # Branch on the indoor/outdoor judgment
599
  if is_indoor:
600
  # Compute natural-light indicators for indoor residential scenes
601
  natural_window_light = 0
602
+
603
  # Check window features and light characteristics
604
+ if (features.get("blue_ratio", 0) > 0.1 and
605
  features.get("sky_brightness", 0) > avg_brightness * 1.1):
606
  natural_window_light += 1
607
+
608
  # Check for an even, soft light distribution
609
+ if (features.get("brightness_uniformity", 0) > 0.65 and
610
  features.get("brightness_std", 0) < 70):
611
  natural_window_light += 1
612
+
613
  # Check the warm-tone ratio
614
  if features.get("warm_ratio", 0) > 0.2:
615
  natural_window_light += 1
616
+
617
  # Home-environment indicators
618
  home_env_score = features.get("home_environment_pattern", 0)
619
  if home_env_score > 1.5:
620
  natural_window_light += 1
621
+
622
  # 1. Bright indoor environment, possibly with natural window light
623
  if avg_brightness > 130:
624
  # Detect naturally lit residential spaces - a newly added type!
 
645
  time_of_day = "indoor_dim"
646
  confidence = 0.65 + dark_pixel_ratio / 3
647
  diagnostics["reason"] = "Low brightness in indoor environment"
648
+
649
  # 1. Detect designer-style residences; this covers a wider variety of cases
650
  designer_residential_score = 0
651
  # Detect distinctive light fixtures
 
660
  # Detect home-environment features
661
  if home_env_score > 1.5:
662
  designer_residential_score += 1
663
+
664
  if designer_residential_score >= 3 and home_env_score > 1.5:
665
+ time_of_day = "indoor_designer_residential"
666
  confidence = 0.85
667
  diagnostics["special_case"] = "Designer residential lighting with decorative elements"
668
+
669
  # 2. Detect restaurant/bar scenes
670
  elif avg_brightness < 150 and yellow_orange_ratio > 0.2:
671
  if features["warm_ratio"] > 0.4:
672
  time_of_day = "indoor_restaurant"
673
  confidence = 0.65 + yellow_orange_ratio / 4
674
  diagnostics["special_case"] = "Warm, yellow-orange lighting suggests restaurant/bar setting"
675
+
676
  # 3. Detect commercially lit spaces
677
  elif avg_brightness > 120 and features["bright_spot_count"] > 4:
678
  # Improve the precision of commercial-lighting classification
 
685
  # Structured overall lighting layout
686
  if features.get("light_distribution_uniformity", 0) > 0.6:
687
  commercial_score += 0.5
688
+
689
  if commercial_score > 0.6 and designer_residential_score < 3:
690
  time_of_day = "indoor_commercial"
691
  confidence = 0.7 + commercial_score / 5
 
794
  """
795
  return {
796
  "indoor_outdoor_weights": {
797
+ "blue_ratio": 0.6,
798
+ "brightness_uniformity": 1.2,
799
+ "gradient_ratio": 0.7,
800
+ "bright_spots": 0.8,
801
+ "color_tone": 0.5,
802
+ "sky_brightness": 0.9,
803
+ "brightness_variation": 0.7,
804
+ "ceiling_features": 1.5,
805
+ "light_features": 1.1,
806
+ "boundary_features": 2.8,
807
+ "street_features": 2,
808
+ "building_features": 1.6
809
  },
810
  "include_diagnostics": True
811
  }
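A note on how these weights are consumed, visible in the scoring code above: each feature yields a contribution of the form weight times a scaled feature value that accumulates into indoor_score, so boundary_features at 2.8 carries the most influence in the default configuration, followed by street_features at 2.0, while blue_ratio enters negatively and pulls the score toward an outdoor classification.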
requirements.txt CHANGED
@@ -7,3 +7,5 @@ numpy>=1.23.5
7
  matplotlib>=3.7.0
8
  gradio>=3.32.0
9
  git+https://github.com/openai/CLIP.git
 
 
 
7
  matplotlib>=3.7.0
8
  gradio>=3.32.0
9
  git+https://github.com/openai/CLIP.git
10
+ yt-dlp>=2023.3.4
11
+ requests>=2.28.1
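The two new dependencies back the video feature: yt-dlp resolves video URLs to downloadable streams and requests covers plain HTTP fetches. A minimal sketch of how a URL could be turned into a local file before detection (the helper name and options here are illustrative, not part of this commit):

    import yt_dlp  # provided by the yt-dlp package

    def download_video(url: str, out_path: str = "input_video.mp4") -> str:
        # Download the best available MP4 stream to a local file.
        ydl_opts = {"format": "mp4", "outtmpl": out_path}
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            ydl.download([url])
        return out_path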
scene_analyzer.py CHANGED
@@ -17,7 +17,6 @@ class SceneAnalyzer:
17
  def __init__(self, class_names: Dict[int, str] = None):
18
  """
19
  Initialize the scene analyzer with optional class name mappings.
20
-
21
  Args:
22
  class_names: Dictionary mapping class IDs to class names (optional)
23
  """
@@ -49,14 +48,12 @@ class SceneAnalyzer:
49
  functional_zones=None):
50
  """
51
  Generate a scene description.
52
-
53
  Args:
54
  scene_type: the identified scene type
55
  detected_objects: list of detected objects
56
  confidence: scene classification confidence
57
  lighting_info: lighting condition information (optional)
58
  functional_zones: functional zone information (optional)
59
-
60
  Returns:
61
  str: the generated scene description
62
  """
@@ -101,13 +98,11 @@ class SceneAnalyzer:
101
  def analyze(self, detection_result: Any, lighting_info: Optional[Dict] = None, class_confidence_threshold: float = 0.35, scene_confidence_threshold: float = 0.6) -> Dict:
102
  """
103
  Analyze detection results to determine scene type and provide understanding.
104
-
105
  Args:
106
  detection_result: Detection result from YOLOv8
107
  lighting_info: Optional lighting condition analysis results
108
  class_confidence_threshold: Minimum confidence to consider an object
109
  scene_confidence_threshold: Minimum confidence to determine a scene
110
-
111
  Returns:
112
  Dictionary with scene analysis results
113
  """
@@ -141,7 +136,7 @@ class SceneAnalyzer:
141
  if not detected_objects:
142
  return {
143
  "scene_type": "unknown",
144
- "confidence": 0.0,
145
  "description": "No objects with sufficient confidence detected.",
146
  "objects_present": [],
147
  "object_count": 0,
@@ -265,10 +260,8 @@ class SceneAnalyzer:
265
  def _compute_scene_scores(self, detected_objects: List[Dict]) -> Dict[str, float]:
266
  """
267
  Compute confidence scores for each scene type based on detected objects.
268
-
269
  Args:
270
  detected_objects: List of detected objects
271
-
272
  Returns:
273
  Dictionary mapping scene types to confidence scores
274
  """
@@ -308,7 +301,7 @@ class SceneAnalyzer:
308
  optional_score = optional_ratio * 0.3 # 30% of score from optional objects
309
 
310
  # Bonus for having multiple instances of key objects
311
- multiple_bonus = 0.0
312
  for class_id in required_present:
313
  if class_counts.get(class_id, 0) > 1:
314
  multiple_bonus += 0.05 # 5% bonus per additional key object type
@@ -330,10 +323,8 @@ class SceneAnalyzer:
330
  def _determine_scene_type(self, scene_scores: Dict[str, float]) -> Tuple[str, float]:
331
  """
332
  Determine the most likely scene type based on scores.
333
-
334
  Args:
335
  scene_scores: Dictionary mapping scene types to confidence scores
336
-
337
  Returns:
338
  Tuple of (best_scene_type, confidence)
339
  """
@@ -350,11 +341,9 @@ class SceneAnalyzer:
350
  def _fuse_scene_scores(self, yolo_scene_scores: Dict[str, float], clip_scene_scores: Dict[str, float]) -> Dict[str, float]:
351
  """
352
  Fuse scene scores from YOLO object detection and CLIP analysis.
353
-
354
  Args:
355
  yolo_scene_scores: scene scores based on YOLO object detection
356
  clip_scene_scores: scene scores based on CLIP analysis
357
-
358
  Returns:
359
  Dict: the fused scene scores
360
  """
 
17
  def __init__(self, class_names: Dict[int, str] = None):
18
  """
19
  Initialize the scene analyzer with optional class name mappings.
 
20
  Args:
21
  class_names: Dictionary mapping class IDs to class names (optional)
22
  """
 
48
  functional_zones=None):
49
  """
50
  Generate a scene description.
 
51
  Args:
52
  scene_type: the identified scene type
53
  detected_objects: list of detected objects
54
  confidence: scene classification confidence
55
  lighting_info: lighting condition information (optional)
56
  functional_zones: functional zone information (optional)
 
57
  Returns:
58
  str: the generated scene description
59
  """
 
98
  def analyze(self, detection_result: Any, lighting_info: Optional[Dict] = None, class_confidence_threshold: float = 0.35, scene_confidence_threshold: float = 0.6) -> Dict:
99
  """
100
  Analyze detection results to determine scene type and provide understanding.
 
101
  Args:
102
  detection_result: Detection result from YOLOv8
103
  lighting_info: Optional lighting condition analysis results
104
  class_confidence_threshold: Minimum confidence to consider an object
105
  scene_confidence_threshold: Minimum confidence to determine a scene
 
106
  Returns:
107
  Dictionary with scene analysis results
108
  """
 
136
  if not detected_objects:
137
  return {
138
  "scene_type": "unknown",
139
+ "confidence": 0,
140
  "description": "No objects with sufficient confidence detected.",
141
  "objects_present": [],
142
  "object_count": 0,
 
260
  def _compute_scene_scores(self, detected_objects: List[Dict]) -> Dict[str, float]:
261
  """
262
  Compute confidence scores for each scene type based on detected objects.
 
263
  Args:
264
  detected_objects: List of detected objects
 
265
  Returns:
266
  Dictionary mapping scene types to confidence scores
267
  """
 
301
  optional_score = optional_ratio * 0.3 # 30% of score from optional objects
302
 
303
  # Bonus for having multiple instances of key objects
304
+ multiple_bonus = 0
305
  for class_id in required_present:
306
  if class_counts.get(class_id, 0) > 1:
307
  multiple_bonus += 0.05 # 5% bonus per additional key object type
 
323
  def _determine_scene_type(self, scene_scores: Dict[str, float]) -> Tuple[str, float]:
324
  """
325
  Determine the most likely scene type based on scores.
 
326
  Args:
327
  scene_scores: Dictionary mapping scene types to confidence scores
 
328
  Returns:
329
  Tuple of (best_scene_type, confidence)
330
  """
 
341
  def _fuse_scene_scores(self, yolo_scene_scores: Dict[str, float], clip_scene_scores: Dict[str, float]) -> Dict[str, float]:
342
  """
343
  融合基於 YOLO 物體檢測和 CLIP 分析的場景分數。
 
344
  Args:
345
  yolo_scene_scores: 基於 YOLO 物體檢測的場景分數
346
  clip_scene_scores: 基於 CLIP 分析的場景分數
 
347
  Returns:
348
  Dict: 融合後的場景分數
349
  """
style.py CHANGED
@@ -268,6 +268,40 @@ class Style:
268
  padding: 0 !important;
269
  }
270
 
271
  /* Result container styles */
272
  .result-container {
273
  width: 100% !important;
@@ -356,6 +390,111 @@ class Style:
356
  box-sizing: border-box !important;
357
  }
358
 
359
  /* Responsive adjustments */
360
  @media (max-width: 768px) {
361
  .app-title {
@@ -375,5 +514,6 @@ class Style:
375
  min-height: 150px !important;
376
  }
377
  }
 
378
  """
379
  return css
 
268
  padding: 0 !important;
269
  }
270
 
271
+ /* Scene analysis description area styles */
272
+ .scene-description-box {
273
+ background-color: #f8f9fa !important;
274
+ border: 1px solid #e2e8f0 !important;
275
+ border-radius: 8px !important;
276
+ padding: 15px !important;
277
+ margin: 10px 0 20px 0 !important;
278
+ box-shadow: 0 1px 3px rgba(0,0,0,0.05) !important;
279
+ font-family: Arial, sans-serif !important;
280
+ line-height: 1.7 !important;
281
+ color: #2D3748 !important;
282
+ font-size: 16px !important;
283
+ width: 100% !important;
284
+ box-sizing: border-box !important;
285
+ }
286
+
287
+ #scene_analysis_description_text {
288
+ background-color: #f0f0f0 !important; /* Light gray background */
289
+ padding: 15px !important; /* Padding so the text has some space from the border */
290
+ border-radius: 8px !important; /* Rounded corners */
291
+ margin: 10px 0 20px 0 !important; /* Spacing from other elements, especially above and below */
292
+ display: block !important;
293
+ width: 100% !important;
294
+ box-sizing: border-box !important;
295
+ }
296
+
297
+ #scene_analysis_description_text p {
298
+ margin: 0 !important;
299
+ color: #2D3748 !important; /* 確保文字顏色 */
300
+ font-family: Arial, sans-serif !important;
301
+ font-size: 16px !important; /* 你可以調整文字大小 */
302
+ line-height: 1.7 !important;
303
+ }
304
+
305
  /* Result container styles */
306
  .result-container {
307
  width: 100% !important;
 
390
  box-sizing: border-box !important;
391
  }
392
 
393
+ /* Video summary HTML container and content styles */
394
+ #video-summary-html-output {
395
+ width: 100% !important;
396
+ box-sizing: border-box !important;
397
+ padding: 0 !important;
398
+ margin: 0 !important;
399
+ }
400
+
401
+ .video-summary-content-wrapper {
402
+ width: 100% !important;
403
+ padding: 16px !important;
404
+ line-height: 1.8 !important;
405
+ white-space: pre-wrap !important;
406
+ word-wrap: break-word !important;
407
+ border-radius: 8px !important;
408
+ min-height: 250px !important;
409
+ max-height: 600px !important;
410
+ overflow-y: auto !important;
411
+ border: 1px solid #e2e8f0 !important;
412
+ background-color: white !important;
413
+ display: block !important;
414
+ font-family: 'Arial', sans-serif !important;
415
+ font-size: 14px !important;
416
+ margin: 0 !important;
417
+ }
418
+
419
+ .video-summary-content-wrapper pre {
420
+ white-space: pre-wrap !important;
421
+ word-wrap: break-word !important;
422
+ margin: 0 !important;
423
+ padding: 0 !important;
424
+ font-family: 'Arial', sans-serif !important;
425
+ font-size: 14px !important;
426
+ line-height: 1.8 !important;
427
+ color: #2D3748 !important;
428
+ }
429
+
430
+ /* Video result panel styles */
431
+ .video-result-panel {
432
+ padding: 1rem !important;
433
+ background: white !important;
434
+ border-radius: 10px !important;
435
+ box-shadow: 0 2px 8px rgba(0, 0, 0, 0.08) !important;
436
+ }
437
+
438
+ .video-output-container {
439
+ width: 100% !important;
440
+ margin-bottom: 1.5rem !important;
441
+ border-radius: 8px !important;
442
+ overflow: hidden !important;
443
+ box-shadow: 0 1px 3px rgba(0, 0, 0, 0.1) !important;
444
+ }
445
+
446
+ /* Enhanced video statistics display */
447
+ .video-stats-display {
448
+ background: white !important;
449
+ border-radius: 8px !important;
450
+ padding: 1rem !important;
451
+ box-shadow: inset 0 0 6px rgba(0, 0, 0, 0.1) !important;
452
+ width: 100% !important;
453
+ min-height: 200px !important;
454
+ max-height: 400px !important;
455
+ overflow-y: auto !important;
456
+ font-family: monospace !important;
457
+ box-sizing: border-box !important;
458
+ color: #2D3748 !important;
459
+ }
460
+
461
+ .custom-video-url-input {
462
+ width: 100% !important;
463
+ }
464
+
465
+ .custom-video-url-input textarea {
466
+ width: 100% !important;
467
+ min-height: 120px !important;
468
+ padding: 15px !important;
469
+ font-size: 16px !important;
470
+ line-height: 1.6 !important;
471
+ background-color: #F7FAFC !important;
472
+ box-shadow: 0 1px 3px rgba(0, 0, 0, 0.1) !important;
473
+ border: 2px solid #CBD5E0 !important;
474
+ border-radius: 8px !important;
475
+ }
476
+
477
+ .custom-video-url-input textarea:focus {
478
+ border-color: #4299E1 !important;
479
+ box-shadow: 0 0 0 3px rgba(66, 153, 225, 0.2) !important;
480
+ }
481
+
482
+ /* Input container at 100% width */
483
+ .custom-video-url-input > div {
484
+ width: 100% !important;
485
+ max-width: 100% !important;
486
+ }
487
+
488
+ /* Animation for a more interactive feel */
489
+ @keyframes fadeIn {
490
+ from { opacity: 0; }
491
+ to { opacity: 1; }
492
+ }
493
+
494
+ .video-result-panel > * {
495
+ animation: fadeIn 0.5s ease-in-out;
496
+ }
497
+
498
  /* Responsive adjustments */
499
  @media (max-width: 768px) {
500
  .app-title {
 
514
  min-height: 150px !important;
515
  }
516
  }
517
+
518
  """
519
  return css
video_processor.py ADDED
@@ -0,0 +1,346 @@
1
+ import cv2
2
+ import os
3
+ import tempfile
4
+ import uuid
5
+ from PIL import Image
6
+ import numpy as np
7
+ from typing import Dict, List, Tuple, Any, Optional
8
+ import time
9
+ from collections import defaultdict
10
+
11
+ from image_processor import ImageProcessor
12
+ from evaluation_metrics import EvaluationMetrics
13
+ from scene_analyzer import SceneAnalyzer
14
+ from detection_model import DetectionModel
15
+
16
+ class VideoProcessor:
17
+ """
18
+ Handles the processing of video files, including object detection
19
+ and scene analysis on selected frames.
20
+ """
21
+ def __init__(self, image_processor: ImageProcessor):
22
+ """
23
+ Initializes the VideoProcessor.
24
+
25
+ Args:
26
+ image_processor (ImageProcessor): An initialized ImageProcessor instance.
27
+ """
28
+ self.image_processor = image_processor
29
+
30
+ def process_video_file(self,
31
+ video_path: str,
32
+ model_name: str,
33
+ confidence_threshold: float,
34
+ process_interval: int = 5,
35
+ scene_desc_interval_sec: int = 3) -> Tuple[Optional[str], str, Dict]:
36
+ """
37
+ Processes an uploaded video file, performs detection and periodic scene analysis,
38
+ and returns the path to the annotated output video file along with a summary.
39
+
40
+ Args:
41
+ video_path (str): Path to the input video file.
42
+ model_name (str): Name of the YOLO model to use.
43
+ confidence_threshold (float): Confidence threshold for object detection.
44
+ process_interval (int): Process every Nth frame. Defaults to 5.
45
+ scene_desc_interval_sec (int): Update scene description every N seconds. Defaults to 3.
46
+
47
+ Returns:
48
+ Tuple[Optional[str], str, Dict]: (Path to output video or None, Summary text, Statistics dictionary)
49
+ """
50
+ if not video_path or not os.path.exists(video_path):
51
+ print(f"Error: Video file not found at {video_path}")
52
+ return None, "Error: Video file not found.", {}
53
+
54
+ print(f"Starting video processing for: {video_path}")
55
+ start_time = time.time()
56
+
57
+ cap = cv2.VideoCapture(video_path)
58
+ if not cap.isOpened():
59
+ print(f"Error: Could not open video file {video_path}")
60
+ return None, "Error opening video file.", {}
61
+
62
+ # Get video properties
63
+ fps = cap.get(cv2.CAP_PROP_FPS)
64
+ if fps <= 0: # Handle case where fps is not available or invalid
65
+ fps = 30 # Assume a default fps
66
+ print(f"Warning: Could not get valid FPS for video. Assuming {fps} FPS.")
67
+ width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
68
+ height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
69
+ total_frames_video = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
70
+ print(f"Video properties: {width}x{height} @ {fps:.2f} FPS, Total Frames: {total_frames_video}")
71
+
72
+ # Calculate description update interval in frames
73
+ description_update_interval_frames = int(fps * scene_desc_interval_sec)
74
+ if description_update_interval_frames < 1:
75
+ description_update_interval_frames = int(fps) # Update at least once per second if interval is too short
76
+
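For example, at 30 FPS with the default scene_desc_interval_sec of 3, the scene description refreshes every 90 frames; with process_interval = 5 that works out to every 18th processed frame.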
77
+ object_trackers = {} # Maps object IDs to tracked objects
78
+ last_detected_objects = {} # Objects detected in the previous processed frame
79
+ next_object_id = 0 # Next available object ID
80
+ tracking_threshold = 0.6 # IoU threshold for treating detections as the same object
81
+ object_colors = {} # Fixed color assigned to each tracked object
82
+
83
+ # Setup Output Video
84
+ output_filename = f"processed_{uuid.uuid4().hex}_{os.path.basename(video_path)}"
85
+ temp_dir = tempfile.gettempdir() # Use system's temp directory
86
+ output_path = os.path.join(temp_dir, output_filename)
87
+ # Ensure the output path has a compatible extension (like .mp4)
88
+ if not output_path.lower().endswith(('.mp4', '.avi', '.mov')):
89
+ output_path += ".mp4"
90
+
91
+ # Use 'mp4v' for MP4, common and well-supported
92
+ fourcc = cv2.VideoWriter_fourcc(*'mp4v')
93
+ out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
94
+ if not out.isOpened():
95
+ print(f"Error: Could not open VideoWriter for path: {output_path}")
96
+ cap.release()
97
+ return None, f"Error creating output video file at {output_path}.", {}
98
+ print(f"Output video will be saved to: {output_path}")
99
+
100
+ frame_count = 0
101
+ processed_frame_count = 0
102
+ all_stats = [] # Store stats for each processed frame
103
+ summary_lines = []
104
+ last_description = "Analyzing scene..." # Initial description
105
+ frame_since_last_desc = description_update_interval_frames # Trigger analysis on first processed frame
106
+
107
+ try:
108
+ while True:
109
+ ret, frame = cap.read()
110
+ if not ret:
111
+ break # End of video
112
+
113
+ frame_count += 1
114
+ frame_since_last_desc += 1
115
+ current_frame_annotated = False # Flag if this frame was processed and annotated
116
+
117
+ # Process frame based on interval
118
+ if frame_count % process_interval == 0:
119
+ processed_frame_count += 1
120
+ print(f"Processing frame {frame_count}...")
121
+ current_frame_annotated = True
122
+
123
+ # Use ImageProcessor for single-frame tasks
124
+ # 1. Convert frame format BGR -> RGB -> PIL
125
+ try:
126
+ frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
127
+ pil_image = Image.fromarray(frame_rgb)
128
+ except Exception as e:
129
+ print(f"Error converting frame {frame_count}: {e}")
130
+ continue # Skip this frame
131
+
132
+ # 2. Get appropriate model instance
133
+ # Confidence is passed from UI, model_name too
134
+ model_instance = self.image_processor.get_model_instance(model_name, confidence_threshold)
135
+ if not model_instance or not model_instance.is_model_loaded:
136
+ print(f"Error: Model {model_name} not loaded. Skipping frame {frame_count}.")
137
+ # Draw basic frame without annotation
138
+ cv2.putText(frame, f"Scene: {last_description[:80]}...", (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 0, 0), 3, cv2.LINE_AA)
139
+ cv2.putText(frame, f"Scene: {last_description[:80]}...", (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 255, 255), 2, cv2.LINE_AA)
140
+ out.write(frame)
141
+ continue
142
+
143
+
144
+ # 3. Perform detection
145
+ detection_result = model_instance.detect(pil_image) # Use PIL image
146
+
147
+ current_description_for_frame = last_description # Default to last known description
148
+ scene_analysis_result = None
149
+ stats = {}
150
+
151
+ if detection_result and hasattr(detection_result, 'boxes') and len(detection_result.boxes) > 0:
152
+ # Ensure SceneAnalyzer is ready within ImageProcessor
153
+ if not hasattr(self.image_processor, 'scene_analyzer') or self.image_processor.scene_analyzer is None:
154
+ print("Initializing SceneAnalyzer...")
155
+ # Pass class names from the current detection result
156
+ self.image_processor.scene_analyzer = SceneAnalyzer(class_names=detection_result.names)
157
+ elif self.image_processor.scene_analyzer.class_names is None:
158
+ # Update class names if they were missing
159
+ self.image_processor.scene_analyzer.class_names = detection_result.names
160
+ if hasattr(self.image_processor.scene_analyzer, 'spatial_analyzer'):
161
+ self.image_processor.scene_analyzer.spatial_analyzer.class_names = detection_result.names
162
+
163
+
164
+ # 4. Perform Scene Analysis (periodically)
165
+ if frame_since_last_desc >= description_update_interval_frames:
166
+ print(f"Analyzing scene at frame {frame_count} (threshold: {description_update_interval_frames} frames)...")
167
+ # Pass lighting_info=None for now, as it's disabled for performance
168
+ scene_analysis_result = self.image_processor.analyze_scene(detection_result, lighting_info=None)
169
+ current_description_for_frame = scene_analysis_result.get("description", last_description)
170
+ last_description = current_description_for_frame # Cache the new description
171
+ frame_since_last_desc = 0 # Reset counter
172
+
173
+ # 5. Calculate Statistics for this frame
174
+ stats = EvaluationMetrics.calculate_basic_stats(detection_result)
175
+ stats['frame_number'] = frame_count # Add frame number to stats
176
+ all_stats.append(stats)
177
+
178
+ # 6. Draw annotations
179
+ names = detection_result.names
180
+ boxes = detection_result.boxes.xyxy.cpu().numpy()
181
+ classes = detection_result.boxes.cls.cpu().numpy().astype(int)
182
+ confs = detection_result.boxes.conf.cpu().numpy()
183
+
184
+ def calculate_iou(box1, box2):
185
+ """Calculate Intersection IOU value"""
186
+ x1_1, y1_1, x2_1, y2_1 = box1
187
+ x1_2, y1_2, x2_2, y2_2 = box2
188
+
189
+ xi1 = max(x1_1, x1_2)
190
+ yi1 = max(y1_1, y1_2)
191
+ xi2 = min(x2_1, x2_2)
192
+ yi2 = min(y2_1, y2_2)
193
+
194
+ inter_area = max(0, xi2 - xi1) * max(0, yi2 - yi1)
195
+ box1_area = (x2_1 - x1_1) * (y2_1 - y1_1)
196
+ box2_area = (x2_2 - x1_2) * (y2_2 - y1_2)
197
+
198
+ union_area = box1_area + box2_area - inter_area
199
+
200
+ return inter_area / union_area if union_area > 0 else 0
201
+
202
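A quick worked example of the helper above: two 10x10 boxes offset by 5 pixels overlap in a 5x5 region, so IoU = 25 / (100 + 100 - 25) ≈ 0.143.

iou = calculate_iou((0, 0, 10, 10), (5, 5, 15, 15))
print(f"{iou:.3f}")  # 0.143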
+                    # Process all detections in the current frame
+                    current_detected_objects = {}
+
+                    for box, cls_id, conf in zip(boxes, classes, confs):
+                        x1, y1, x2, y2 = map(int, box)
+
+                        # Find the best-matching object among those tracked in the previous frame
+                        best_match_id = None
+                        best_match_iou = 0
+
+                        for obj_id, (old_box, old_cls_id, _) in last_detected_objects.items():
+                            if old_cls_id == cls_id:  # Only compare objects of the same class
+                                iou = calculate_iou(box, old_box)
+                                if iou > tracking_threshold and iou > best_match_iou:
+                                    best_match_id = obj_id
+                                    best_match_iou = iou
+
+                        # Reuse the existing ID if a match was found; otherwise assign a new one
+                        if best_match_id is not None:
+                            obj_id = best_match_id
+                        else:
+                            obj_id = next_object_id
+                            next_object_id += 1
+                            # Assign the new object a fixed, high-visibility color (avoiding white)
+                            # Note: the frame is BGR, so the tuples below are (blue, green, red)
+                            bright_colors = [
+                                (0, 0, 255),    # red
+                                (0, 255, 0),    # green
+                                (255, 0, 0),    # blue
+                                (0, 255, 255),  # yellow
+                                (255, 0, 255),  # magenta
+                                (255, 128, 0),  # azure
+                                (128, 0, 255),  # pink
+                            ]
+                            object_colors[obj_id] = bright_colors[obj_id % len(bright_colors)]
+
+                        # Update tracking info for this frame
+                        current_detected_objects[obj_id] = (box, cls_id, conf)
+
+                        color = object_colors.get(obj_id, (0, 255, 0))  # Default to green
+                        label = f"{names.get(cls_id, 'Unknown')}-{obj_id}: {conf:.2f}"
+
+                        # Smooth the bounding box: for a known object, blend with its position in the previous frame
+                        if obj_id in last_detected_objects:
+                            old_box, _, _ = last_detected_objects[obj_id]
+                            old_x1, old_y1, old_x2, old_y2 = map(int, old_box)
+                            # Smoothing coefficients
+                            alpha = 0.7  # Weight of the current position
+                            beta = 0.3   # Weight of the previous position
+
+                            x1 = int(alpha * x1 + beta * old_x1)
+                            y1 = int(alpha * y1 + beta * old_y1)
+                            x2 = int(alpha * x2 + beta * old_x2)
+                            y2 = int(alpha * y2 + beta * old_y2)
+
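This is a simple linear blend with alpha + beta = 1: if the current x1 is 100 and the previous frame's was 90, the drawn coordinate becomes int(0.7 * 100 + 0.3 * 90) = 97, which damps frame-to-frame jitter in the drawn boxes.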
+                        # Draw the box and label
+                        cv2.rectangle(frame, (x1, y1), (x2, y2), color, 2)
+                        # Draw a filled background sized to the text, then the label on top
+                        (w, h), _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 2)
+                        cv2.rectangle(frame, (x1, y1 - h - 10), (x1 + w, y1 - 10), color, -1)
+                        cv2.putText(frame, label, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1, cv2.LINE_AA)
+
+                    # Carry the tracking info forward to the next frame
+                    last_detected_objects = current_detected_objects.copy()
+
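The greedy per-box match above is cheap, but when several boxes of the same class overlap heavily it can hand the same tracked ID to more than one detection. If that ever matters, a globally optimal assignment over the IoU matrix is the usual alternative. A hedged sketch reusing calculate_iou from above (scipy is not among this commit's dependencies):

import numpy as np
from scipy.optimize import linear_sum_assignment

def match_detections(new_boxes, old_boxes, iou_threshold=0.3):
    """Return {new_index: old_index} pairs that maximize total IoU."""
    iou = np.array([[calculate_iou(n, o) for o in old_boxes] for n in new_boxes])
    rows, cols = linear_sum_assignment(-iou)  # maximize IoU by minimizing its negative
    return {r: c for r, c in zip(rows, cols) if iou[r, c] > iou_threshold}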
+                    # Draw the current scene description on the annotated frame
+                    cv2.putText(frame, f"Scene: {current_description_for_frame[:80]}...", (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 0, 0), 3, cv2.LINE_AA)  # Black outline
+                    cv2.putText(frame, f"Scene: {current_description_for_frame[:80]}...", (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 255, 255), 2, cv2.LINE_AA)  # White text
+                    current_frame_annotated = True  # (assumed reset to False at the top of each loop iteration)
+
+                # If this frame wasn't processed, fall back to the last known description
+                if not current_frame_annotated:
+                    cv2.putText(frame, f"Scene: {last_description[:80]}...", (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 0, 0), 3, cv2.LINE_AA)
+                    cv2.putText(frame, f"Scene: {last_description[:80]}...", (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 255, 255), 2, cv2.LINE_AA)
+
+                # Write the frame (annotated or original) to the output video
+                out.write(frame)
+
+        except Exception as e:
+            print(f"Error during video processing loop for {video_path}: {e}")
+            import traceback
+            traceback.print_exc()
+            summary_lines.append(f"An error occurred during processing: {e}")
+        finally:
+            # Release resources
+            cap.release()
+            out.release()
+            print(f"Video processing finished. Resources released. Output path: {output_path}")
+            if not os.path.exists(output_path) or os.path.getsize(output_path) == 0:
+                print(f"Error: Output video file was not created or is empty at {output_path}")
+                summary_lines.append("Error: Failed to create output video.")
+                output_path = None
+
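As an aside, the release logic could also be wrapped in a small context manager so the capture cannot leak even on early returns; a minimal sketch (managed_capture is a hypothetical helper, not part of this commit):

from contextlib import contextmanager

@contextmanager
def managed_capture(path):
    cap = cv2.VideoCapture(path)
    try:
        yield cap
    finally:
        cap.release()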
+        end_time = time.time()
+        processing_time = end_time - start_time
+        summary_lines.insert(0, f"Finished processing in {processing_time:.2f} seconds.")
+        summary_lines.insert(1, f"Processed {processed_frame_count} frames out of {frame_count} (interval: {process_interval} frames).")
+        summary_lines.insert(2, f"Scene description updated approximately every {scene_desc_interval_sec} seconds.")
+
+        # Generate aggregate statistics
+        aggregated_stats = {
+            "total_frames_read": frame_count,
+            "total_frames_processed": processed_frame_count,
+            "avg_objects_per_processed_frame": 0,  # Calculated below
+            "cumulative_detections": {},           # Total times each class was detected
+            "max_concurrent_detections": {}        # Max count of each class in a single processed frame
+        }
+        object_cumulative_counts = {}
+        object_max_concurrent_counts = {}  # Maximum count observed for each object class
+        total_detected_in_processed = 0
+
+        # Iterate through the stats collected from each processed frame
+        for frame_stats in all_stats:
+            total_objects_in_frame = frame_stats.get("total_objects", 0)
+            total_detected_in_processed += total_objects_in_frame
+
+            # Iterate through the object classes detected in this frame
+            for obj_name, obj_data in frame_stats.get("class_statistics", {}).items():
+                count_in_frame = obj_data.get("count", 0)
+
+                # Cumulative count across all processed frames
+                object_cumulative_counts[obj_name] = object_cumulative_counts.get(obj_name, 0) + count_in_frame
+
+                # Maximum concurrent count in any single processed frame
+                object_max_concurrent_counts[obj_name] = max(object_max_concurrent_counts.get(obj_name, 0), count_in_frame)
+
+        # Add the sorted results to the final dictionary
+        aggregated_stats["cumulative_detections"] = dict(sorted(object_cumulative_counts.items(), key=lambda item: item[1], reverse=True))
+        aggregated_stats["max_concurrent_detections"] = dict(sorted(object_max_concurrent_counts.items(), key=lambda item: item[1], reverse=True))
+
+        # Calculate the average number of objects per processed frame
+        if processed_frame_count > 0:
+            aggregated_stats["avg_objects_per_processed_frame"] = round(total_detected_in_processed / processed_frame_count, 2)
+
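To make the two aggregates concrete: if frame A detects {"car": 3} and frame B detects {"car": 5}, then cumulative_detections records car: 8, max_concurrent_detections records car: 5, and avg_objects_per_processed_frame comes out to (3 + 5) / 2 = 4.0.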
+        summary_text = "\n".join(summary_lines)
+        print("Generated Summary:\n", summary_text)
+        print("Aggregated Stats:\n", aggregated_stats)
+
+        # Return the potentially updated output_path (set to None above if the file failed to materialize)
+        return output_path, summary_text, aggregated_stats
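Tying it together, a hedged usage sketch of this method; the constructor argument and the method name process_video are assumptions inferred from this excerpt, not confirmed signatures:

processor = VideoProcessor(image_processor)
output_path, summary_text, aggregated_stats = processor.process_video(
    video_path="input.mp4",
    model_name="yolov8n.pt",        # illustrative values
    confidence_threshold=0.25,
    process_interval=5,             # run detection on every 5th frame
    scene_desc_interval_sec=3.0,    # refresh the description about every 3 seconds
)
print(summary_text)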