Spaces: Running on Zero
Add new feature "Video Processing" and fix formatting issues
- app.py +514 -360
- clip_analyzer.py +2 -1
- enhance_scene_describer.py +257 -69
- lighting_analyzer.py +71 -71
- requirements.txt +2 -0
- scene_analyzer.py +2 -13
- style.py +140 -0
- video_processor.py +346 -0
app.py
CHANGED
@@ -3,6 +3,10 @@ import numpy as np
3    import matplotlib.pyplot as plt
4    import gradio as gr
5    from typing import Dict, List, Any, Optional, Tuple
6    import spaces
7
8    from detection_model import DetectionModel
@@ -10,441 +14,591 @@ from color_mapper import ColorMapper
10   from evaluation_metrics import EvaluationMetrics
11   from style import Style
12   from image_processor import ImageProcessor
13
14 - # Initialize
15   image_processor = ImageProcessor()
16
17   def get_all_classes():
18 - """
19-49 - … (hard-coded COCO class tuple list)
50 - (74, 'clock'), (75, 'vase'), (76, 'scissors'), (77, 'teddy bear'), (78, 'hair drier'),
51 - (79, 'toothbrush')
52 - ]
53
54   @spaces.GPU
55 - def …
56 - """
57-58 - …
59 - Args:
60 -     image: Input image
61 -     model_name: Name of the model to use
62 -     confidence_threshold: Confidence threshold for detection
63 -     filter_classes: Optional list of classes to filter results
64 -
65 - Returns:
66 -     Tuple of results including lighting conditions
67 - """
68   try:
69-71 - …
70   if filter_classes:
72   for class_str in filter_classes:
73   try:
74-80 - … (class_id parsing with a bare except)
81   result_image, result_text, stats = image_processor.process_image(
82       image,
83       model_name,
84       confidence_threshold,
85 -     …
86   )
87
88 - # Format
89   formatted_stats = image_processor.format_json_for_display(stats)
90
91-92 - …
93   fig, ax = plt.subplots(figsize=(8, 6))
94 - ax.text(0.5, 0.5, "No detection data …",
95 -     ha='center', va='center', fontsize=14, fontfamily='Arial')
96 - ax.set_xlim(0, 1)
97 - ax.set_ylim(0, 1)
98   ax.axis('off')
99   plot_figure = fig
100 - else:
101 -     # Prepare visualization data
102 -     available_classes = dict(get_all_classes())
103 -     viz_data = image_processor.prepare_visualization_data(stats, available_classes)
104 -
105 -     # Create plot
106 -     plot_figure = EvaluationMetrics.create_enhanced_stats_plot(viz_data)
107
108   # Extract scene analysis info
109   scene_analysis = stats.get("scene_analysis", {})
110-119 - … (scene_desc HTML assembly)
120 - </div>
121 - """
122 -
123 - # Extract lighting conditions
124 - lighting_conditions = scene_analysis.get("lighting_conditions",
125 -     {"time_of_day": "unknown", "confidence": 0.0})
126 -
127 - # Prepare the activities list
128 - activities = scene_analysis.get("possible_activities", [])
129 - if not activities:
130 -     activities_data = [["No activities detected"]]
131   else:
132-136 - … (safety concerns extraction)
137 -     safety_data = [["No safety concerns detected"]]
138   else:
139 -     safety_data = [[concern] for concern in …
140
141 - # Functional zones
142   zones = scene_analysis.get("functional_zones", {})
143
144 - return result_image, result_text, formatted_stats, plot_figure, …
145
146   except Exception as e:
147 - …
148   import traceback
149   error_msg = f"Error processing image: {str(e)}\n{traceback.format_exc()}"
150-151 - …
152 - # Create a simple error figure
153 - fig, ax = plt.subplots(figsize=(8, 6))
154 - ax.text(0.5, 0.5, f"Error: {str(e)}",
155 -     ha='center', va='center', fontsize=14, fontfamily='Arial', color='red')
156 - ax.set_xlim(0, 1)
157 - ax.set_ylim(0, 1)
158   ax.axis('off')
159
160 - # Return valid default values
161 - return None, error_msg, "{}", fig, "Error processing image", [["No activities"]], [["No safety concerns"]], {}, {"time_of_day": "unknown", "confidence": 0}
162
163   def create_interface():
164 - """
165   css = Style.get_css()
166 -
167 - # Get available model information
168   available_models = DetectionModel.get_available_models()
169   model_choices = [model["model_file"] for model in available_models]
170-171 - …
172 - # Available class filter options
173 - available_classes = get_all_classes()
174 - class_choices = [f"{id}: {name}" for id, name in available_classes]
175
176 - # Create the Gradio Blocks interface
177   with gr.Blocks(css=css, theme=gr.themes.Soft(primary_hue="teal", secondary_hue="blue")) as demo:
178 - …
179   with gr.Group(elem_classes="app-header"):
180   gr.HTML("""
181   <div style="text-align: center; width: 100%; padding: 2rem 0 3rem 0; background: linear-gradient(135deg, #f0f9ff, #e1f5fe);">
182   <h1 style="font-size: 3.5rem; margin-bottom: 0.5rem; background: linear-gradient(90deg, #38b2ac, #4299e1); -webkit-background-clip: text; -webkit-text-fill-color: transparent; font-weight: bold; font-family: 'Arial', sans-serif;">VisionScout</h1>
183-189 - … (old subtitle and divider markup)
190   <div style="display: flex; justify-content: center; gap: 25px; margin-top: 1.5rem;">
191-201 - … (old badge markup, including "🌐 Scene Understanding" and "📊 Visual Analysis")
202 - <div style="margin-top: 20px; …">
203 -     <p style="…">
204 -         <span style="margin-right: 5px;">📱</span> iPhone users: HEIC images are not supported.
205 -         <a href="https://cloudconvert.com/heic-to-jpg" target="_blank" style="color: #3182ce; text-decoration: underline;">Convert HEIC to JPG here</a> before uploading.
206 -     </p>
207   </div>
208   </div>
209   """)
210
212-257 - … (old single-page input panel: model dropdown, confidence slider, class filter)
258   )
260-266 - … (old "How to Use" block)
267 - 1. Upload an image or use the camera
268 - 2. (Optional) Adjust settings like confidence threshold or model size (n, m, x)
269 - 3. Optionally filter to specific object classes
270 - 4. Click "Detect Objects" button
271 -
272 - The model will identify objects in your image and display them with bounding boxes.
273 -
274 - **Note:** Detection quality depends on image clarity and model settings.
275 - """)
276 -
277 - # Right side - results display area
278 - with gr.Column(scale=6, elem_classes="output-panel"):
279 -     with gr.Tabs(elem_classes="tabs"):
280 -         with gr.Tab("Detection Result"):
281 -             result_image = gr.Image(type="pil", label="Detection Result")
282 -
283 -             # details summary
284 -             with gr.Group(elem_classes="result-details-box"):
285 -                 gr.HTML('<div class="section-heading">Detection Details</div>')
286 -                 # Textbox settings for a wider display
287 -                 result_text = gr.Textbox(
288 -                     label=None,
289 -                     lines=15,
290 -                     max_lines=20,
291 -                     elem_classes="wide-result-text",
292 -                     elem_id="detection-details",
293 -                     container=False,
294 -                     scale=2,
295 -                     min_width=600
296 -                 )
297
298-309 - … (old Scene Understanding tab and "AI Vision Scout Report" notice markup)
310 - Like human scouts, it sometimes gets lost or sees things that aren't there (but don't we all?).
311 - Consider this an educated opinion rather than absolute truth. For critical applications, always verify with human eyes! 🧐
312   </p>
313   </div>
314-321 - …
322 - )
323 -
324 - with gr.Row():
325 -     with gr.Column(scale=2):
326 -         activities_list = gr.Dataframe(
327 -             headers=["Activities"],
328 -             datatype=["str"],
329 -             col_count=1,
330 -             row_count=5,
331 -             elem_classes="full-width-element"
332 -         )
333 -
334 -     with gr.Column(scale=2):
335 -         safety_list = gr.Dataframe(
336 -             headers=["Safety Concerns"],
337 -             datatype=["str"],
338 -             col_count=1,
339 -             row_count=5,
340 -             elem_classes="full-width-element"
341 -         )
342 -
343 - gr.HTML('<div class="section-heading">Functional Zones</div>')
344 - zones_json = gr.JSON(label=None, elem_classes="json-box")
345 -
346 - gr.HTML('<div class="section-heading">Lighting Conditions</div>')
347 - lighting_info = gr.JSON(label=None, elem_classes="json-box")
348 -
349 - with gr.Tab("Statistics"):
350 -     with gr.Row():
351 -         with gr.Column(scale=3, elem_classes="plot-column"):
352 -             gr.HTML('<div class="section-heading">Object Distribution</div>')
353 -             plot_output = gr.Plot(
354 -                 label=None,
355 -                 elem_classes="large-plot-container"
356               )
357-359 - …
360 -             gr.HTML('<div class="section-heading">Detection Statistics</div>')
361 -             stats_json = gr.JSON(
362 -                 label=None,  # remove label
363 -                 elem_classes="enhanced-json-display"
364               )
365
366-390 - … (old event wiring for the filter buttons and detect button)
391 -     lambda: [f"{id}: {name}" for id, name in available_classes if id in people_classes],
392 -     outputs=class_filter
393 - )
394
395-397 - …
398   )
400-402 - …
403   )
405-407 - …
408   )
410-420 - … (old gr.Examples block)
421 -     outputs=None,
422 -     fn=None,
423 -     cache_examples=False,
424   )
425
427   # Footer
428   gr.HTML("""
429-441 - … (old footer markup)
442 - """)
443
444   return demo
445
446   if __name__ == "__main__":
447-450 - … demo.launch()
3 |
import matplotlib.pyplot as plt
|
4 |
import gradio as gr
|
5 |
from typing import Dict, List, Any, Optional, Tuple
|
6 |
+
import cv2
|
7 |
+
from PIL import Image
|
8 |
+
import tempfile
|
9 |
+
import uuid
|
10 |
import spaces
|
11 |
|
12 |
from detection_model import DetectionModel
|
|
|
14 |
from evaluation_metrics import EvaluationMetrics
|
15 |
from style import Style
|
16 |
from image_processor import ImageProcessor
|
17 |
+
from video_processor import VideoProcessor
|
18 |
|
19 |
+
# Initialize Processors
|
20 |
image_processor = ImageProcessor()
|
21 |
+
video_processor = VideoProcessor(image_processor)
|
22 |
|
23 |
+
# Helper Function
|
24 |
def get_all_classes():
|
25 |
+
"""Gets all available COCO classes."""
|
26 |
+
# Try to get from a loaded model first
|
27 |
+
if image_processor and image_processor.model_instances:
|
28 |
+
for model_instance in image_processor.model_instances.values():
|
29 |
+
if model_instance and model_instance.is_model_loaded:
|
30 |
+
try:
|
31 |
+
# Ensure class_names is a dict {id: name}
|
32 |
+
if isinstance(model_instance.class_names, dict):
|
33 |
+
return sorted([(int(idx), name) for idx, name in model_instance.class_names.items()])
|
34 |
+
except Exception as e:
|
35 |
+
print(f"Error getting class names from model: {e}")
|
36 |
+
|
37 |
+
# Fallback to standard COCO (ensure keys are ints)
|
38 |
+
default_classes = {
|
39 |
+
0: 'person', 1: 'bicycle', 2: 'car', 3: 'motorcycle', 4: 'airplane', 5: 'bus',
|
40 |
+
6: 'train', 7: 'truck', 8: 'boat', 9: 'traffic light', 10: 'fire hydrant',
|
41 |
+
11: 'stop sign', 12: 'parking meter', 13: 'bench', 14: 'bird', 15: 'cat',
|
42 |
+
16: 'dog', 17: 'horse', 18: 'sheep', 19: 'cow', 20: 'elephant', 21: 'bear',
|
43 |
+
22: 'zebra', 23: 'giraffe', 24: 'backpack', 25: 'umbrella', 26: 'handbag',
|
44 |
+
27: 'tie', 28: 'suitcase', 29: 'frisbee', 30: 'skis', 31: 'snowboard',
|
45 |
+
32: 'sports ball', 33: 'kite', 34: 'baseball bat', 35: 'baseball glove',
|
46 |
+
36: 'skateboard', 37: 'surfboard', 38: 'tennis racket', 39: 'bottle',
|
47 |
+
40: 'wine glass', 41: 'cup', 42: 'fork', 43: 'knife', 44: 'spoon', 45: 'bowl',
|
48 |
+
46: 'banana', 47: 'apple', 48: 'sandwich', 49: 'orange', 50: 'broccoli',
|
49 |
+
51: 'carrot', 52: 'hot dog', 53: 'pizza', 54: 'donut', 55: 'cake', 56: 'chair',
|
50 |
+
57: 'couch', 58: 'potted plant', 59: 'bed', 60: 'dining table', 61: 'toilet',
|
51 |
+
62: 'tv', 63: 'laptop', 64: 'mouse', 65: 'remote', 66: 'keyboard',
|
52 |
+
67: 'cell phone', 68: 'microwave', 69: 'oven', 70: 'toaster', 71: 'sink',
|
53 |
+
72: 'refrigerator', 73: 'book', 74: 'clock', 75: 'vase', 76: 'scissors',
|
54 |
+
77: 'teddy bear', 78: 'hair drier', 79: 'toothbrush'
|
55 |
+
}
|
56 |
+
return sorted(default_classes.items())
|
|
|
57 |
|
58 |
@spaces.GPU
|
59 |
+
def handle_image_upload(image, model_name, confidence_threshold, filter_classes=None):
|
60 |
+
"""Processes a single uploaded image."""
|
61 |
+
print(f"Processing image with model: {model_name}, confidence: {confidence_threshold}")
|
|
62 |
try:
|
63 |
+
class_ids_to_filter = None
|
64 |
if filter_classes:
|
65 |
+
class_ids_to_filter = []
|
66 |
+
available_classes_dict = dict(get_all_classes())
|
67 |
+
name_to_id = {name: id for id, name in available_classes_dict.items()}
|
68 |
for class_str in filter_classes:
|
69 |
+
class_name_or_id = class_str.split(":")[0].strip()
|
70 |
+
class_id = -1
|
71 |
try:
|
72 |
+
class_id = int(class_name_or_id)
|
73 |
+
if class_id not in available_classes_dict:
|
74 |
+
class_id = -1
|
75 |
+
except ValueError:
|
76 |
+
if class_name_or_id in name_to_id:
|
77 |
+
class_id = name_to_id[class_name_or_id]
|
78 |
+
elif class_str in name_to_id: # Check full string "id: name"
|
79 |
+
class_id = name_to_id[class_str]
|
80 |
+
|
81 |
+
if class_id != -1:
|
82 |
+
class_ids_to_filter.append(class_id)
|
83 |
+
else:
|
84 |
+
print(f"Warning: Could not parse class filter: {class_str}")
|
85 |
+
print(f"Filtering image results for class IDs: {class_ids_to_filter}")
|
86 |
+
|
87 |
+
# Call the existing image processing logic
|
88 |
result_image, result_text, stats = image_processor.process_image(
|
89 |
image,
|
90 |
model_name,
|
91 |
confidence_threshold,
|
92 |
+
class_ids_to_filter
|
93 |
)
|
94 |
|
95 |
+
# Format stats for JSON display
|
96 |
formatted_stats = image_processor.format_json_for_display(stats)
|
97 |
|
98 |
+
# Prepare visualization data for the plot
|
99 |
+
plot_figure = None
|
100 |
+
if stats and "class_statistics" in stats and stats["class_statistics"]:
|
101 |
+
available_classes_dict = dict(get_all_classes())
|
102 |
+
viz_data = image_processor.prepare_visualization_data(stats, available_classes_dict)
|
103 |
+
if "error" not in viz_data:
|
104 |
+
plot_figure = EvaluationMetrics.create_enhanced_stats_plot(viz_data)
|
105 |
+
else:
|
106 |
+
fig, ax = plt.subplots(figsize=(8, 6))
|
107 |
+
ax.text(0.5, 0.5, viz_data["error"], ha='center', va='center', fontsize=12)
|
108 |
+
ax.axis('off')
|
109 |
+
plot_figure = fig
|
110 |
+
else:
|
111 |
fig, ax = plt.subplots(figsize=(8, 6))
|
112 |
+
ax.text(0.5, 0.5, "No detection data for plot", ha='center', va='center', fontsize=12)
|
|
|
|
|
|
|
113 |
ax.axis('off')
|
114 |
plot_figure = fig
|
|
115 |
|
116 |
# Extract scene analysis info
|
117 |
scene_analysis = stats.get("scene_analysis", {})
|
118 |
+
scene_desc = scene_analysis.get("description", "Scene analysis requires detected objects.")
|
119 |
+
# Ensure scene_desc is a string before adding HTML
|
120 |
+
if not isinstance(scene_desc, str):
|
121 |
+
scene_desc = str(scene_desc)
|
122 |
+
scene_desc_html = f"<div style='padding:10px; font-family:Arial, sans-serif; line-height:1.7;'>{scene_desc}</div>"
|
123 |
+
|
124 |
+
# Prepare activities list
|
125 |
+
activities_list = scene_analysis.get("possible_activities", [])
|
126 |
+
if not activities_list:
|
127 |
+
activities_list_data = [["No specific activities inferred"]] # Data for Dataframe
|
|
128 |
else:
|
129 |
+
activities_list_data = [[activity] for activity in activities_list]
|
130 |
|
131 |
+
# Prepare safety concerns list
|
132 |
+
safety_concerns_list = scene_analysis.get("safety_concerns", [])
|
133 |
+
if not safety_concerns_list:
|
134 |
+
safety_data = [["No safety concerns detected"]] # Data for Dataframe
|
135 |
else:
|
136 |
+
safety_data = [[concern] for concern in safety_concerns_list]
|
137 |
|
|
|
138 |
zones = scene_analysis.get("functional_zones", {})
|
139 |
+
lighting = scene_analysis.get("lighting_conditions", {"time_of_day": "unknown", "confidence": 0})
|
140 |
|
141 |
+
return (result_image, result_text, formatted_stats, plot_figure,
|
142 |
+
scene_desc_html, activities_list_data, safety_data, zones, lighting)
|
143 |
|
144 |
except Exception as e:
|
145 |
+
print(f"Error in handle_image_upload: {e}")
|
146 |
import traceback
|
147 |
error_msg = f"Error processing image: {str(e)}\n{traceback.format_exc()}"
|
148 |
+
fig, ax = plt.subplots()
|
149 |
+
ax.text(0.5, 0.5, "Processing Error", color="red", ha="center", va="center")
|
|
|
150 |
ax.axis('off')
|
151 |
+
# Ensure return structure matches outputs even on error
|
152 |
+
return (None, error_msg, {}, fig, f"<div>Error: {str(e)}</div>",
|
153 |
+
[["Error"]], [["Error"]], {}, {"time_of_day": "error", "confidence": 0})
|
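The filter-parsing loop above accepts a bare numeric ID, a bare class name, or the formatted "id: name" string produced by the dropdown. A minimal standalone sketch of that mapping (the helper name parse_class_filter is illustrative, not part of the app):

    def parse_class_filter(filter_strings, available_classes):
        """Map dropdown entries such as '0: person' (or bare IDs/names) to class IDs."""
        name_to_id = {name: idx for idx, name in available_classes.items()}
        ids = []
        for raw in filter_strings or []:
            token = raw.split(":")[0].strip()
            try:
                idx = int(token)              # numeric ID, e.g. "0"
                if idx in available_classes:
                    ids.append(idx)
            except ValueError:
                if token in name_to_id:       # plain class name, e.g. "person"
                    ids.append(name_to_id[token])
        return ids

    # parse_class_filter(["0: person", "2: car"], {0: "person", 2: "car"})  ->  [0, 2]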
154 |
+
|
155 |
+
def download_video_from_url(video_url, max_duration_minutes=10):
|
156 |
+
"""
|
157 |
+
Downloads a video from a YouTube URL and returns the local path to the downloaded file.
|
158 |
+
|
159 |
+
Args:
|
160 |
+
video_url (str): URL of the YouTube video to download
|
161 |
+
max_duration_minutes (int): Maximum allowed video duration in minutes
|
162 |
+
|
163 |
+
Returns:
|
164 |
+
tuple: (Path to the downloaded video file or None, Error message or None)
|
165 |
+
"""
|
166 |
+
try:
|
167 |
+
# Create a temporary directory to store the video
|
168 |
+
temp_dir = tempfile.gettempdir()
|
169 |
+
output_filename = f"downloaded_{uuid.uuid4().hex}.mp4"
|
170 |
+
output_path = os.path.join(temp_dir, output_filename)
|
171 |
+
|
172 |
+
# Check if it's a YouTube URL
|
173 |
+
if "youtube.com" in video_url or "youtu.be" in video_url:
|
174 |
+
# Import yt-dlp here to avoid dependency if not needed
|
175 |
+
import yt_dlp
|
176 |
+
|
177 |
+
# Setup yt-dlp options
|
178 |
+
ydl_opts = {
|
179 |
+
'format': 'best[ext=mp4]/best', # Best quality MP4 or best available format
|
180 |
+
'outtmpl': output_path,
|
181 |
+
'noplaylist': True,
|
182 |
+
'quiet': False, # Set to True to reduce output
|
183 |
+
'no_warnings': False,
|
184 |
+
}
|
185 |
+
|
186 |
+
# First extract info to check duration
|
187 |
+
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
|
188 |
+
print(f"Extracting info from YouTube URL: {video_url}")
|
189 |
+
info_dict = ydl.extract_info(video_url, download=False)
|
190 |
+
|
191 |
+
# Check if video exists
|
192 |
+
if not info_dict:
|
193 |
+
return None, "Could not retrieve video information. Please check the URL."
|
194 |
+
|
195 |
+
video_title = info_dict.get('title', 'Unknown Title')
|
196 |
+
duration = info_dict.get('duration', 0)
|
197 |
+
|
198 |
+
print(f"Video title: {video_title}")
|
199 |
+
print(f"Video duration: {duration} seconds")
|
200 |
+
|
201 |
+
# Check video duration
|
202 |
+
if duration > max_duration_minutes * 60:
|
203 |
+
return None, f"Video is too long ({duration} seconds). Maximum duration is {max_duration_minutes} minutes."
|
204 |
+
|
205 |
+
# Download the video
|
206 |
+
print(f"Downloading YouTube video: {video_title}")
|
207 |
+
ydl.download([video_url])
|
208 |
+
|
209 |
+
# Verify the file exists and has content
|
210 |
+
if not os.path.exists(output_path) or os.path.getsize(output_path) == 0:
|
211 |
+
return None, "Download failed: Empty or missing file."
|
212 |
+
|
213 |
+
print(f"Successfully downloaded video to: {output_path}")
|
214 |
+
return output_path, None
|
215 |
+
else:
|
216 |
+
return None, "Only YouTube URLs are supported at this time. Please enter a valid YouTube URL."
|
217 |
+
|
218 |
+
except Exception as e:
|
219 |
+
import traceback
|
220 |
+
error_details = traceback.format_exc()
|
221 |
+
print(f"Error downloading video: {e}\n{error_details}")
|
222 |
+
return None, f"Error downloading video: {str(e)}"
|
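The duration guard above depends on probing the video metadata before committing to a download. A condensed sketch of that probe step, assuming yt-dlp is installed (probe_youtube_duration is an illustrative name):

    import yt_dlp

    def probe_youtube_duration(url: str):
        """Return (title, duration_in_seconds) without downloading the video."""
        opts = {"quiet": True, "noplaylist": True}
        with yt_dlp.YoutubeDL(opts) as ydl:
            info = ydl.extract_info(url, download=False)
        return info.get("title", "Unknown Title"), info.get("duration", 0)

    # title, seconds = probe_youtube_duration("https://www.youtube.com/watch?v=...")
    # if seconds > 10 * 60: reject the request before calling ydl.download([url])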
223 |
+
|
224 |
+
|
225 |
+
@spaces.GPU
|
226 |
+
def handle_video_upload(video_input, video_url, input_type, model_name, confidence_threshold, process_interval):
|
227 |
+
"""Handles video upload or URL input and calls the VideoProcessor."""
|
228 |
+
|
229 |
+
print(f"Received video request: input_type={input_type}")
|
230 |
+
video_path = None
|
231 |
+
|
232 |
+
# Handle based on input type
|
233 |
+
if input_type == "upload" and video_input:
|
234 |
+
print(f"Processing uploaded video file")
|
235 |
+
video_path = video_input
|
236 |
+
elif input_type == "url" and video_url:
|
237 |
+
print(f"Processing video from URL: {video_url}")
|
238 |
+
# Download video from URL
|
239 |
+
video_path, error_message = download_video_from_url(video_url)
|
240 |
+
if error_message:
|
241 |
+
error_html = f"<div class='video-summary-content-wrapper'><pre>{error_message}</pre></div>"
|
242 |
+
return None, error_html, {"error": error_message}
|
243 |
+
else:
|
244 |
+
print("No valid video input provided.")
|
245 |
+
return None, "<div class='video-summary-content-wrapper'><pre>Please upload a video file or provide a valid video URL.</pre></div>", {}
|
246 |
+
|
247 |
+
print(f"Starting video processing with: model={model_name}, confidence={confidence_threshold}, interval={process_interval}")
|
248 |
+
try:
|
249 |
+
# Call the VideoProcessor method
|
250 |
+
output_video_path, summary_text, stats_dict = video_processor.process_video_file(
|
251 |
+
video_path=video_path,
|
252 |
+
model_name=model_name,
|
253 |
+
confidence_threshold=confidence_threshold,
|
254 |
+
process_interval=int(process_interval) # Ensure interval is int
|
255 |
+
)
|
256 |
+
print(f"Video processing function returned: path={output_video_path}, summary length={len(summary_text)}")
|
257 |
+
|
258 |
+
# Wrap processing summary in HTML tags for consistent styling with scene understanding page
|
259 |
+
summary_html = f"<div class='video-summary-content-wrapper'><pre>{summary_text}</pre></div>"
|
260 |
+
|
261 |
+
# Format statistics for better display
|
262 |
+
formatted_stats = {}
|
263 |
+
if stats_dict and isinstance(stats_dict, dict):
|
264 |
+
formatted_stats = stats_dict
|
265 |
+
|
266 |
+
return output_video_path, summary_html, formatted_stats
|
267 |
+
|
268 |
+
except Exception as e:
|
269 |
+
print(f"Error in handle_video_upload: {e}")
|
270 |
+
import traceback
|
271 |
+
error_msg = f"Error processing video: {str(e)}\n{traceback.format_exc()}"
|
272 |
+
error_html = f"<div class='video-summary-content-wrapper'><pre>{error_msg}</pre></div>"
|
273 |
+
return None, error_html, {"error": str(e)}
|
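process_interval controls how many frames are skipped between detections. The underlying sampling pattern, shown in isolation with OpenCV (a sketch under that assumption, not the actual VideoProcessor code):

    import cv2

    def iter_sampled_frames(video_path: str, interval: int = 10):
        """Yield (frame_index, frame) for every `interval`-th frame of a video file."""
        cap = cv2.VideoCapture(video_path)
        index = 0
        while True:
            ok, frame = cap.read()
            if not ok:
                break
            if index % interval == 0:
                yield index, frame  # only these frames would be passed to the detector
            index += 1
        cap.release()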
274 |
|
|
|
|
|
275 |
|
276 |
+
# Create Gradio Interface
|
277 |
def create_interface():
|
278 |
+
"""Creates the Gradio interface with Tabs."""
|
279 |
css = Style.get_css()
|
|
|
|
|
280 |
available_models = DetectionModel.get_available_models()
|
281 |
model_choices = [model["model_file"] for model in available_models]
|
282 |
+
class_choices_formatted = [f"{id}: {name}" for id, name in get_all_classes()] # Use formatted choices
|
|
|
283 |
|
|
|
284 |
with gr.Blocks(css=css, theme=gr.themes.Soft(primary_hue="teal", secondary_hue="blue")) as demo:
|
285 |
+
|
286 |
+
# Header
|
287 |
with gr.Group(elem_classes="app-header"):
|
288 |
gr.HTML("""
|
289 |
<div style="text-align: center; width: 100%; padding: 2rem 0 3rem 0; background: linear-gradient(135deg, #f0f9ff, #e1f5fe);">
|
290 |
<h1 style="font-size: 3.5rem; margin-bottom: 0.5rem; background: linear-gradient(90deg, #38b2ac, #4299e1); -webkit-background-clip: text; -webkit-text-fill-color: transparent; font-weight: bold; font-family: 'Arial', sans-serif;">VisionScout</h1>
|
291 |
+
<h2 style="color: #4A5568; font-size: 1.2rem; font-weight: 400; margin-top: 0.5rem; margin-bottom: 1.5rem; font-family: 'Arial', sans-serif;">Object Detection and Scene Understanding</h2>
|
292 |
+
<div style="display: flex; justify-content: center; gap: 10px; margin: 0.5rem 0;"><div style="height: 3px; width: 80px; background: linear-gradient(90deg, #38b2ac, #4299e1);"></div></div>
|
|
|
293 |
<div style="display: flex; justify-content: center; gap: 25px; margin-top: 1.5rem;">
|
294 |
+
<div style="padding: 8px 15px; border-radius: 20px; background: rgba(66, 153, 225, 0.15); color: #2b6cb0; font-weight: 500; font-size: 0.9rem;"><span style="margin-right: 6px;">🖼️</span> Image Analysis</div>
|
295 |
+
<div style="padding: 8px 15px; border-radius: 20px; background: rgba(56, 178, 172, 0.15); color: #2b6cb0; font-weight: 500; font-size: 0.9rem;"><span style="margin-right: 6px;">🎬</span> Video Analysis</div>
|
|
|
296 |
</div>
|
297 |
+
<div style="margin-top: 20px; padding: 10px 15px; background-color: rgba(255, 248, 230, 0.9); border-left: 3px solid #f6ad55; border-radius: 6px; max-width: 600px; margin-left: auto; margin-right: auto; text-align: left;">
|
298 |
+
<p style="margin: 0; font-size: 0.9rem; color: #805ad5; font-weight: 500;">
|
299 |
+
<span style="margin-right: 5px;">📱</span> iPhone users: HEIC images may not be supported.
|
300 |
+
<a href="https://cloudconvert.com/heic-to-jpg" target="_blank" style="color: #3182ce; text-decoration: underline;">Convert HEIC to JPG</a> before uploading if needed.
|
301 |
+
</p>
|
302 |
+
</div>
|
303 |
</div>
|
304 |
""")
|
305 |
|
306 |
+
# Main Content with Tabs
|
307 |
+
with gr.Tabs(elem_classes="tabs"):
|
308 |
+
|
309 |
+
# Tab 1: Image Processing
|
310 |
+
with gr.Tab("Image Processing"):
|
311 |
+
current_image_model = gr.State("yolov8m.pt") # State for image model selection
|
312 |
+
with gr.Row(equal_height=False): # Allow columns to have different heights
|
313 |
+
# Left Column: Image Input & Controls
|
314 |
+
with gr.Column(scale=4, elem_classes="input-panel"):
|
315 |
+
with gr.Group():
|
316 |
+
gr.HTML('<div class="section-heading">Upload Image</div>')
|
317 |
+
image_input = gr.Image(type="pil", label="Upload an image", elem_classes="upload-box")
|
318 |
+
|
319 |
+
with gr.Accordion("Image Analysis Settings", open=False):
|
320 |
+
image_model_dropdown = gr.Dropdown(
|
321 |
+
choices=model_choices,
|
322 |
+
value="yolov8m.pt", # Default for images
|
323 |
+
label="Select Model",
|
324 |
+
info="Choose speed vs. accuracy (n=fast, m=balanced, x=accurate)"
|
325 |
+
)
|
326 |
+
# Display model info
|
327 |
+
image_model_info = gr.Markdown(DetectionModel.get_model_description("yolov8m.pt"))
|
328 |
|
329 |
+
image_confidence = gr.Slider(
|
330 |
+
minimum=0.1, maximum=0.9, value=0.25, step=0.05,
|
331 |
+
label="Confidence Threshold",
|
332 |
+
info="Minimum confidence for displaying a detected object"
|
333 |
+
)
|
334 |
+
with gr.Accordion("Filter Classes", open=False):
|
335 |
+
gr.HTML('<div class="section-heading" style="font-size: 1rem;">Common Categories</div>')
|
336 |
+
with gr.Row():
|
337 |
+
people_btn = gr.Button("People", size="sm")
|
338 |
+
vehicles_btn = gr.Button("Vehicles", size="sm")
|
339 |
+
animals_btn = gr.Button("Animals", size="sm")
|
340 |
+
objects_btn = gr.Button("Common Objects", size="sm")
|
341 |
+
image_class_filter = gr.Dropdown(
|
342 |
+
choices=class_choices_formatted, # Use formatted choices
|
343 |
+
multiselect=True,
|
344 |
+
label="Select Classes to Display",
|
345 |
+
info="Leave empty to show all detected objects"
|
346 |
+
)
|
347 |
+
|
348 |
+
image_detect_btn = gr.Button("Analyze Image", variant="primary", elem_classes="detect-btn")
|
349 |
+
|
350 |
+
with gr.Group(elem_classes="how-to-use"):
|
351 |
+
gr.HTML('<div class="section-heading">How to Use (Image)</div>')
|
352 |
+
gr.Markdown("""
|
353 |
+
1. Upload an image or use the camera
|
354 |
+
2. (Optional) Adjust settings like confidence threshold or model size (n, m, x)
|
355 |
+
3. Optionally filter to specific object classes
|
356 |
+
4. Click the **Analyze Image** button
|
357 |
+
""")
|
358 |
+
# Image Examples
|
359 |
+
gr.Examples(
|
360 |
+
examples=[
|
361 |
+
"room_01.jpg",
|
362 |
+
"room_02.jpg",
|
363 |
+
"street_02.jpg",
|
364 |
+
"street_04.jpg"
|
365 |
+
],
|
366 |
+
inputs=image_input,
|
367 |
+
label="Example Images"
|
368 |
+
)
|
369 |
+
|
370 |
+
# Right Column: Image Results
|
371 |
+
with gr.Column(scale=6, elem_classes="output-panel"):
|
372 |
+
with gr.Tabs(elem_classes="tabs"):
|
373 |
+
with gr.Tab("Detection Result"):
|
374 |
+
image_result_image = gr.Image(type="pil", label="Detection Result")
|
375 |
+
gr.HTML('<div class="section-heading">Detection Details</div>')
|
376 |
+
image_result_text = gr.Textbox(label=None, lines=10, elem_id="detection-details", container=False)
|
377 |
+
|
378 |
+
with gr.Tab("Scene Understanding"):
|
379 |
+
gr.HTML('<div class="section-heading">Scene Analysis</div>')
|
380 |
+
gr.HTML("""
|
381 |
+
<details class="info-details" style="margin: 5px 0 15px 0;">
|
382 |
+
<summary style="padding: 8px; background-color: #f0f7ff; border-radius: 6px; border-left: 3px solid #4299e1; font-weight: bold; cursor: pointer; color: #2b6cb0;">
|
383 |
+
🔍 The AI Vision Scout Report: Click for important notes about this analysis
|
384 |
+
</summary>
|
385 |
+
<div style="margin-top: 8px; padding: 10px; background-color: #f8f9fa; border-radius: 6px; border: 1px solid #e2e8f0;">
|
386 |
+
<p style="font-size: 13px; color: #718096; margin: 0;">
|
387 |
+
<b>About this analysis:</b> This analysis is the model's best guess based on visible objects.
|
388 |
+
Like human scouts, it sometimes gets lost or sees things that aren't there (but don't we all?).
|
389 |
+
Consider this an educated opinion rather than absolute truth. For critical applications, always verify with human eyes! 🧐
|
390 |
+
</p>
|
391 |
+
</div>
|
392 |
+
</details>
|
393 |
+
""")
|
394 |
+
|
395 |
+
# Wrap HTML description for potential styling
|
396 |
+
image_scene_description_html = gr.HTML(label=None, elem_id="scene_analysis_description_text")
|
397 |
+
|
398 |
+
with gr.Row():
|
399 |
+
with gr.Column(scale=1):
|
400 |
+
gr.HTML('<div class="section-heading" style="font-size:1rem; text-align:left;">Possible Activities</div>')
|
401 |
+
image_activities_list = gr.Dataframe(headers=["Activity"], datatype=["str"], row_count=5, col_count=1, wrap=True)
|
402 |
+
|
403 |
+
with gr.Column(scale=1):
|
404 |
+
gr.HTML('<div class="section-heading" style="font-size:1rem; text-align:left;">Safety Concerns</div>')
|
405 |
+
image_safety_list = gr.Dataframe(headers=["Concern"], datatype=["str"], row_count=5, col_count=1, wrap=True)
|
406 |
+
|
407 |
+
gr.HTML('<div class="section-heading">Functional Zones</div>')
|
408 |
+
image_zones_json = gr.JSON(label=None, elem_classes="json-box")
|
409 |
+
|
410 |
+
gr.HTML('<div class="section-heading">Lighting Conditions</div>')
|
411 |
+
image_lighting_info = gr.JSON(label=None, elem_classes="json-box")
|
412 |
+
|
413 |
+
with gr.Tab("Statistics"):
|
414 |
+
with gr.Row():
|
415 |
+
with gr.Column(scale=3, elem_classes="plot-column"):
|
416 |
+
gr.HTML('<div class="section-heading">Object Distribution</div>')
|
417 |
+
image_plot_output = gr.Plot(label=None, elem_classes="large-plot-container")
|
418 |
+
with gr.Column(scale=2, elem_classes="stats-column"):
|
419 |
+
gr.HTML('<div class="section-heading">Detection Statistics</div>')
|
420 |
+
image_stats_json = gr.JSON(label=None, elem_classes="enhanced-json-display")
|
421 |
+
|
422 |
+
# Tab 2: Video Processing
|
423 |
+
with gr.Tab("Video Processing"):
|
424 |
+
with gr.Row(equal_height=False):
|
425 |
+
# Left Column: Video Input & Controls
|
426 |
+
with gr.Column(scale=4, elem_classes="input-panel"):
|
427 |
+
with gr.Group():
|
428 |
+
gr.HTML('<div class="section-heading">Video Input</div>')
|
429 |
+
|
430 |
+
# Add input type selection
|
431 |
+
video_input_type = gr.Radio(
|
432 |
+
["upload", "url"],
|
433 |
+
label="Input Method",
|
434 |
+
value="upload",
|
435 |
+
info="Choose how to provide the video"
|
436 |
)
|
437 |
|
438 |
+
# File upload (will be shown/hidden based on selection)
|
439 |
+
with gr.Group(elem_id="upload-video-group"):
|
440 |
+
video_input = gr.Video(
|
441 |
+
label="Upload a video file (MP4, AVI, MOV)",
|
442 |
+
sources=["upload"],
|
443 |
+
visible=True
|
444 |
+
)
|
|
|
445 |
|
446 |
+
# URL input (will be shown/hidden based on selection)
|
447 |
+
with gr.Group(elem_id="url-video-group"):
|
448 |
+
video_url_input = gr.Textbox(
|
449 |
+
label="Enter video URL (YouTube or direct video link)",
|
450 |
+
placeholder="https://www.youtube.com/watch?v=...",
|
451 |
+
visible=False,
|
452 |
+
elem_classes="custom-video-url-input"
|
453 |
+
)
|
454 |
+
gr.HTML("""
|
455 |
+
<div style="padding: 8px; margin-top: 5px; background-color: #fff8f8; border-radius: 4px; border-left: 3px solid #f87171; font-size: 12px;">
|
456 |
+
<p style="margin: 0; color: #4b5563;">
|
457 |
+
Note: Currently only YouTube URLs are supported. Maximum video duration is 10 minutes.
|
|
|
|
|
458 |
</p>
|
459 |
</div>
|
460 |
+
""")
|
461 |
+
|
462 |
+
with gr.Accordion("Video Analysis Settings", open=True):
|
463 |
+
video_model_dropdown = gr.Dropdown(
|
464 |
+
choices=model_choices,
|
465 |
+
value="yolov8n.pt", # Default 'n' for video
|
466 |
+
label="Select Model (Video)",
|
467 |
+
info="Faster models (like 'n') are recommended"
|
|
|
468 |
)
|
469 |
+
video_confidence = gr.Slider(
|
470 |
+
minimum=0.1, maximum=0.9, value=0.4, step=0.05,
|
471 |
+
label="Confidence Threshold (Video)"
|
|
|
472 |
)
|
473 |
+
video_process_interval = gr.Slider(
|
474 |
+
minimum=1, maximum=60, value=10, step=1, # Allow up to 60 frame interval
|
475 |
+
label="Processing Interval (Frames)",
|
476 |
+
info="Analyze every Nth frame (higher value = faster)"
|
477 |
+
)
|
478 |
+
video_process_btn = gr.Button("Process Video", variant="primary", elem_classes="detect-btn")
|
479 |
+
|
480 |
+
with gr.Group(elem_classes="how-to-use"):
|
481 |
+
gr.HTML('<div class="section-heading">How to Use (Video)</div>')
|
482 |
+
gr.Markdown("""
|
483 |
+
1. Choose your input method: Upload a file or enter a URL.
|
484 |
+
2. Adjust settings if needed (using a faster model and larger interval is recommended for longer videos).
|
485 |
+
3. Click "Process Video". **Processing can take a significant amount of time.**
|
486 |
+
4. The annotated video and summary will appear on the right when finished.
|
487 |
+
""")
|
488 |
|
489 |
+
# Add video examples
|
490 |
+
gr.HTML('<div class="section-heading">Example Videos</div>')
|
491 |
+
gr.HTML("""
|
492 |
+
<div style="padding: 10px; background-color: #f0f7ff; border-radius: 6px; margin-bottom: 15px;">
|
493 |
+
<p style="font-size: 14px; color: #4A5568; margin: 0;">
|
494 |
+
Upload any video containing objects that YOLO can detect. For testing, find sample videos
|
495 |
+
<a href="https://www.pexels.com/search/videos/street/" target="_blank" style="color: #3182ce; text-decoration: underline;">here</a>.
|
496 |
+
</p>
|
497 |
+
</div>
|
498 |
+
""")
|
499 |
+
|
500 |
+
# Right Column: Video Results
|
501 |
+
with gr.Column(scale=6, elem_classes="output-panel video-result-panel"):
|
502 |
+
gr.HTML("""
|
503 |
+
<div class="section-heading">Video Result</div>
|
504 |
+
<details class="info-details" style="margin: 5px 0 15px 0;">
|
505 |
+
<summary style="padding: 8px; background-color: #f0f7ff; border-radius: 6px; border-left: 3px solid #4299e1; font-weight: bold; cursor: pointer; color: #2b6cb0;">
|
506 |
+
🎬 Video Processing Notes
|
507 |
+
</summary>
|
508 |
+
<div style="margin-top: 8px; padding: 10px; background-color: #f8f9fa; border-radius: 6px; border: 1px solid #e2e8f0;">
|
509 |
+
<p style="font-size: 13px; color: #718096; margin: 0;">
|
510 |
+
The processed video includes bounding boxes around detected objects. For longer videos,
|
511 |
+
consider using a faster model (like YOLOv8n) and a higher frame interval to reduce processing time.
|
512 |
+
</p>
|
513 |
+
</div>
|
514 |
+
</details>
|
515 |
+
""")
|
516 |
+
video_output = gr.Video(label="Processed Video", elem_classes="video-output-container") # Output for the processed video file
|
517 |
+
|
518 |
+
gr.HTML('<div class="section-heading">Processing Summary</div>')
|
519 |
+
# Display the video summary as HTML
|
520 |
+
video_summary_text = gr.HTML(
|
521 |
+
label=None,
|
522 |
+
elem_id="video-summary-html-output"
|
523 |
+
)
|
524 |
|
525 |
+
gr.HTML('<div class="section-heading">Aggregated Statistics</div>')
|
526 |
+
video_stats_json = gr.JSON(label=None, elem_classes="video-stats-display") # Display statistics
|
|
|
527 |
|
528 |
+
# Event Listeners
|
529 |
+
# Image Model Change Handler
|
530 |
+
image_model_dropdown.change(
|
531 |
+
fn=lambda model: (model, DetectionModel.get_model_description(model)),
|
532 |
+
inputs=[image_model_dropdown],
|
533 |
+
outputs=[current_image_model, image_model_info] # Update state and description
|
534 |
)
|
535 |
|
536 |
+
# Image Filter Buttons
|
537 |
+
available_classes_list = get_all_classes() # Get list of (id, name)
|
538 |
+
people_classes_ids = [0]
|
539 |
+
vehicles_classes_ids = [1, 2, 3, 4, 5, 6, 7, 8]
|
540 |
+
animals_classes_ids = list(range(14, 24))
|
541 |
+
common_objects_ids = [39, 41, 42, 43, 44, 45, 56, 57, 60, 62, 63, 67, 73] # Bottle, cup, fork, knife, spoon, bowl, chair, couch, table, tv, laptop, phone, book
|
542 |
+
|
543 |
+
people_btn.click(lambda: [f"{id}: {name}" for id, name in available_classes_list if id in people_classes_ids], outputs=image_class_filter)
|
544 |
+
vehicles_btn.click(lambda: [f"{id}: {name}" for id, name in available_classes_list if id in vehicles_classes_ids], outputs=image_class_filter)
|
545 |
+
animals_btn.click(lambda: [f"{id}: {name}" for id, name in available_classes_list if id in animals_classes_ids], outputs=image_class_filter)
|
546 |
+
objects_btn.click(lambda: [f"{id}: {name}" for id, name in available_classes_list if id in common_objects_ids], outputs=image_class_filter)
|
547 |
+
|
548 |
+
video_input_type.change(
|
549 |
+
fn=lambda input_type: [
|
550 |
+
# Show/hide file upload
|
551 |
+
gr.update(visible=(input_type == "upload")),
|
552 |
+
# Show/hide URL input
|
553 |
+
gr.update(visible=(input_type == "url"))
|
554 |
+
],
|
555 |
+
inputs=[video_input_type],
|
556 |
+
outputs=[video_input, video_url_input]
|
557 |
)
|
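The Radio handler above swaps which input widget is visible. The same gr.update pattern, reduced to a self-contained example with illustrative component names:

    import gradio as gr

    with gr.Blocks() as demo:
        mode = gr.Radio(["upload", "url"], value="upload", label="Input Method")
        file_box = gr.Video(visible=True)
        url_box = gr.Textbox(label="Video URL", visible=False)

        # Returning one gr.update(visible=...) per output flips which widget is shown.
        mode.change(
            fn=lambda m: [gr.update(visible=(m == "upload")), gr.update(visible=(m == "url"))],
            inputs=[mode],
            outputs=[file_box, url_box],
        )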
558 |
|
559 |
+
# Image Processing Button Click
|
560 |
+
image_detect_btn.click(
|
561 |
+
fn=handle_image_upload,
|
562 |
+
inputs=[image_input, image_model_dropdown, image_confidence, image_class_filter],
|
563 |
+
outputs=[
|
564 |
+
image_result_image, image_result_text, image_stats_json, image_plot_output,
|
565 |
+
image_scene_description_html, image_activities_list, image_safety_list, image_zones_json,
|
566 |
+
image_lighting_info
|
567 |
+
]
|
568 |
)
|
569 |
|
570 |
+
video_process_btn.click(
|
571 |
+
fn=handle_video_upload,
|
572 |
+
inputs=[
|
573 |
+
video_input,
|
574 |
+
video_url_input,
|
575 |
+
video_input_type,
|
576 |
+
video_model_dropdown,
|
577 |
+
video_confidence,
|
578 |
+
video_process_interval
|
579 |
+
],
|
580 |
+
outputs=[video_output, video_summary_text, video_stats_json]
|
|
|
581 |
)
|
582 |
|
|
|
583 |
# Footer
|
584 |
gr.HTML("""
|
585 |
+
<div class="footer" style="padding: 25px 0; text-align: center; background: linear-gradient(to right, #f5f9fc, #e1f5fe); border-top: 1px solid #e2e8f0; margin-top: 30px;">
|
586 |
+
<div style="margin-bottom: 15px;">
|
587 |
+
<p style="font-size: 14px; color: #4A5568; margin: 5px 0;">Powered by YOLOv8, CLIP and Ultralytics • Created with Gradio</p>
|
588 |
+
</div>
|
589 |
+
<div style="display: flex; align-items: center; justify-content: center; gap: 20px; margin-top: 15px;">
|
590 |
+
<p style="font-family: 'Arial', sans-serif; font-size: 14px; font-weight: 500; letter-spacing: 2px; background: linear-gradient(90deg, #38b2ac, #4299e1); -webkit-background-clip: text; -webkit-text-fill-color: transparent; margin: 0; text-transform: uppercase; display: inline-block;">EXPLORE THE CODE →</p>
|
591 |
+
<a href="https://github.com/Eric-Chung-0511/Learning-Record/tree/main/Data%20Science%20Projects/VisionScout" target="_blank" style="text-decoration: none;">
|
592 |
+
<img src="https://img.shields.io/badge/GitHub-VisionScout-4299e1?logo=github&style=for-the-badge">
|
593 |
+
</a>
|
594 |
+
</div>
|
595 |
+
</div>
|
596 |
+
""")
|
|
|
|
|
597 |
|
598 |
return demo
|
599 |
|
600 |
+
|
601 |
if __name__ == "__main__":
|
602 |
+
demo_interface = create_interface()
|
603 |
|
604 |
+
demo_interface.launch()
|
|
clip_analyzer.py
CHANGED
@@ -3,6 +3,7 @@ import clip
|
|
3 |
import numpy as np
|
4 |
from PIL import Image
|
5 |
from typing import Dict, List, Tuple, Any, Optional, Union
|
|
|
6 |
from clip_prompts import (
|
7 |
SCENE_TYPE_PROMPTS,
|
8 |
CULTURAL_SCENE_PROMPTS,
|
@@ -24,7 +25,7 @@ class CLIPAnalyzer:
|
|
24 |
Initialize the CLIP analyzer.
|
25 |
|
26 |
Args:
|
27 |
-
model_name: CLIP Model name, "ViT-B/32"、"ViT-B/16"、"ViT-L/14"
|
28 |
device: Use GPU if available
|
29 |
"""
|
30 |
# Automatically select the device
|
|
|
3 |
import numpy as np
|
4 |
from PIL import Image
|
5 |
from typing import Dict, List, Tuple, Any, Optional, Union
|
6 |
+
|
7 |
from clip_prompts import (
|
8 |
SCENE_TYPE_PROMPTS,
|
9 |
CULTURAL_SCENE_PROMPTS,
|
|
|
25 |
Initialize the CLIP analyzer.
|
26 |
|
27 |
Args:
|
28 |
+
model_name: CLIP Model name, "ViT-B/32"、"ViT-B/16"、"ViT-L/14"
|
29 |
device: Use GPU if available
|
30 |
"""
|
31 |
# Automatically select the device
|
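For reference, loading one of the listed backbones follows the standard openai/CLIP pattern; a minimal sketch of the device selection and model load (not the analyzer's full initialization):

    import torch
    import clip

    device = "cuda" if torch.cuda.is_available() else "cpu"   # use GPU if available
    model, preprocess = clip.load("ViT-B/16", device=device)  # any of the listed model names works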
enhance_scene_describer.py
CHANGED
@@ -126,7 +126,7 @@ class EnhancedSceneDescriber:
|
|
126 |
}
|
127 |
}
|
128 |
|
129 |
-
# Cultural templates
|
130 |
if "cultural_templates" not in templates:
|
131 |
templates["cultural_templates"] = {
|
132 |
"asian": {
|
@@ -164,8 +164,8 @@ class EnhancedSceneDescriber:
|
|
164 |
"elevated_threshold": 0.6, # Objects mostly in middle/bottom
|
165 |
"elevated_top_threshold": 0.3 # Few objects at top of frame
|
166 |
}
|
167 |
-
|
168 |
-
|
169 |
def generate_description(self,
|
170 |
scene_type: str,
|
171 |
detected_objects: List[Dict],
|
@@ -190,26 +190,23 @@ class EnhancedSceneDescriber:
|
|
190 |
"""
|
191 |
# Handle unknown scene type or very low confidence
|
192 |
if scene_type == "unknown" or confidence < 0.4:
|
193 |
-
return self._generate_generic_description(detected_objects, lighting_info)
|
194 |
|
195 |
# Detect viewpoint
|
196 |
-
viewpoint = self._detect_viewpoint(detected_objects)
|
197 |
|
|
|
198 |
if viewpoint == "aerial":
|
199 |
-
# For intersection-related scenes, make sure the correct aerial-view intersection scene type is used
|
200 |
if "intersection" in scene_type or self._is_intersection(detected_objects):
|
201 |
scene_type = "aerial_view_intersection"
|
202 |
-
# For commercial-district-related scenes
|
203 |
elif any(keyword in scene_type for keyword in ["commercial", "shopping", "retail"]):
|
204 |
scene_type = "aerial_view_commercial_area"
|
205 |
-
# For plaza-related scenes
|
206 |
elif any(keyword in scene_type for keyword in ["plaza", "square"]):
|
207 |
scene_type = "aerial_view_plaza"
|
208 |
-
# Other aerial-view scenes default to the intersection type
|
209 |
else:
|
210 |
scene_type = "aerial_view_intersection"
|
211 |
|
212 |
-
# Detect cultural context -
|
213 |
cultural_context = None
|
214 |
if viewpoint != "aerial":
|
215 |
cultural_context = self._detect_cultural_context(scene_type, detected_objects)
|
@@ -224,7 +221,6 @@ class EnhancedSceneDescriber:
|
|
224 |
|
225 |
# Get base description for the scene type
|
226 |
if viewpoint == "aerial":
|
227 |
-
# Use the preset base description for aerial viewpoints
|
228 |
if 'base_description' not in locals():
|
229 |
base_description = "An aerial view showing the layout and movement patterns from above"
|
230 |
elif scene_type in self.scene_types:
|
@@ -240,25 +236,38 @@ class EnhancedSceneDescriber:
|
|
240 |
viewpoint
|
241 |
)
|
242 |
|
243 |
-
#
|
244 |
-
|
|
|
245 |
if people_objs:
|
246 |
people_count = len(people_objs)
|
247 |
if people_count > 5:
|
248 |
-
# Use more precise wording when there are many people
|
249 |
people_phrase = f"numerous people ({people_count})"
|
250 |
else:
|
251 |
people_phrase = f"{people_count} {'people' if people_count > 1 else 'person'}"
|
252 |
|
253 |
-
#
|
254 |
-
if "people" not in
|
255 |
-
|
256 |
|
257 |
-
# Apply cultural context if detected (
|
258 |
-
if cultural_context and
|
259 |
cultural_elements = self._generate_cultural_elements(cultural_context)
|
260 |
if cultural_elements:
|
261 |
-
|
|
|
262 |
|
263 |
# Include lighting information if available
|
264 |
lighting_description = ""
|
@@ -267,22 +276,25 @@ class EnhancedSceneDescriber:
|
|
267 |
if lighting_type in self.templates.get("lighting_templates", {}):
|
268 |
lighting_description = self.templates["lighting_templates"][lighting_type]
|
269 |
|
270 |
-
#
|
271 |
-
|
272 |
-
|
273 |
-
)
|
274 |
-
|
275 |
-
# Fill the template
|
276 |
-
description = description_template.format(
|
277 |
-
description=base_description,
|
278 |
-
details=scene_details
|
279 |
-
)
|
280 |
|
281 |
-
#
|
282 |
if viewpoint != "eye_level" and viewpoint in self.templates.get("viewpoint_templates", {}):
|
283 |
viewpoint_template = self.templates["viewpoint_templates"][viewpoint]
|
284 |
|
285 |
-
#
|
|
|
286 |
if viewpoint == "aerial":
|
287 |
scene_elements = "the crossing patterns and pedestrian movement"
|
288 |
else:
|
@@ -292,93 +304,269 @@ class EnhancedSceneDescriber:
|
|
292 |
scene_elements=scene_elements
|
293 |
)
|
294 |
|
295 |
-
# Add viewpoint prefix if needed
|
296 |
-
if not description.startswith(viewpoint_template.get("prefix", "")):
|
297 |
-
description = f"{viewpoint_template.get('prefix', '')}{description}"
|
298 |
-
|
299 |
# Add viewpoint observation if not already included
|
300 |
-
if viewpoint_desc not in description:
|
301 |
-
description
|
302 |
-
|
303 |
-
# Add lighting description if available
|
304 |
-
if lighting_description and lighting_description not in description:
|
305 |
-
description += f" {lighting_description}"
|
306 |
|
307 |
# Add information about functional zones if available
|
308 |
if functional_zones and len(functional_zones) > 0:
|
309 |
zones_desc = self._describe_functional_zones(functional_zones)
|
310 |
if zones_desc:
|
311 |
-
description
|
312 |
|
313 |
-
#
|
314 |
people_count = len([obj for obj in detected_objects if obj["class_id"] == 0])
|
315 |
|
316 |
-
#
|
317 |
if people_count > 5:
|
318 |
-
#
|
319 |
small_people_patterns = [
|
320 |
r"Area with \d+ people\.",
|
321 |
r"Area with \d+ person\.",
|
322 |
r"with \d+ people",
|
323 |
r"with \d+ person"
|
324 |
]
|
325 |
-
|
|
|
326 |
filtered_description = description
|
327 |
for pattern in small_people_patterns:
|
328 |
matches = re.findall(pattern, filtered_description)
|
329 |
for match in matches:
|
330 |
-
#
|
331 |
number_match = re.search(r'\d+', match)
|
332 |
if number_match:
|
333 |
try:
|
334 |
people_mentioned = int(number_match.group())
|
335 |
-
#
|
336 |
if people_mentioned < people_count:
|
337 |
-
#
|
338 |
sentences = re.split(r'(?<=[.!?])\s+', filtered_description)
|
339 |
-
#
|
340 |
filtered_sentences = []
|
341 |
for sentence in sentences:
|
342 |
if match not in sentence:
|
343 |
filtered_sentences.append(sentence)
|
344 |
-
#
|
345 |
filtered_description = " ".join(filtered_sentences)
|
346 |
except ValueError:
|
347 |
-
#
|
348 |
continue
|
349 |
|
350 |
-
#
|
351 |
description = filtered_description
|
352 |
|
|
|
|
|
|
|
353 |
return description
|
354 |
|
|
|
355 |
def _is_intersection(self, detected_objects: List[Dict]) -> bool:
|
356 |
"""
|
357 |
Determine whether the scene looks like an intersection by analyzing the object distribution
|
358 |
"""
|
359 |
# Check the pedestrian distribution pattern
|
360 |
pedestrians = [obj for obj in detected_objects if obj["class_id"] == 0]
|
361 |
-
|
362 |
if len(pedestrians) >= 8:  # enough pedestrians are needed to form an intersection
|
363 |
# Collect pedestrian positions
|
364 |
positions = [obj.get("normalized_center", (0, 0)) for obj in pedestrians]
|
365 |
-
|
366 |
# Analyze the distribution of x and y coordinates
|
367 |
x_coords = [pos[0] for pos in positions]
|
368 |
y_coords = [pos[1] for pos in positions]
|
369 |
-
|
370 |
# Compute the variance of the x and y coordinates
|
371 |
x_variance = np.var(x_coords) if len(x_coords) > 1 else 0
|
372 |
y_variance = np.var(y_coords) if len(y_coords) > 1 else 0
|
373 |
-
|
374 |
# Compute the coordinate ranges
|
375 |
x_range = max(x_coords) - min(x_coords)
|
376 |
y_range = max(y_coords) - min(y_coords)
|
377 |
-
|
378 |
# If both the x and y spans are large and of similar size, the scene is likely an intersection
|
379 |
if x_range > 0.5 and y_range > 0.5 and 0.7 < (x_range / y_range) < 1.3:
|
380 |
return True
|
381 |
-
|
382 |
return False
|
383 |
|
384 |
def _generate_generic_description(self, detected_objects: List[Dict], lighting_info: Optional[Dict] = None) -> str:
|
@@ -1165,27 +1353,27 @@ class EnhancedSceneDescriber:
|
|
1165 |
Optimize the object description to avoid listing the same item repeatedly
|
1166 |
"""
|
1167 |
import re
|
1168 |
-
|
1169 |
# Handle repeated descriptions of beds
|
1170 |
if "bed in the room" in description:
|
1171 |
description = description.replace("a bed in the room", "a bed")
|
1172 |
-
|
1173 |
# Handle repeated item lists
|
1174 |
# Look for patterns of the form "item, item, item"
|
1175 |
object_lists = re.findall(r'with ([^\.]+?)(?:\.|\band\b)', description)
|
1176 |
-
|
1177 |
for obj_list in object_lists:
|
1178 |
# 計算每個物品出現次數
|
1179 |
items = re.findall(r'([a-zA-Z\s]+)(?:,|\band\b|$)', obj_list)
|
1180 |
item_counts = {}
|
1181 |
-
|
1182 |
for item in items:
|
1183 |
item = item.strip()
|
1184 |
if item and item not in ["and", "with"]:
|
1185 |
if item not in item_counts:
|
1186 |
item_counts[item] = 0
|
1187 |
item_counts[item] += 1
|
1188 |
-
|
1189 |
# Generate the optimized item list
|
1190 |
if item_counts:
|
1191 |
new_items = []
|
@@ -1194,7 +1382,7 @@ class EnhancedSceneDescriber:
|
|
1194 |
new_items.append(f"{count} {item}s")
|
1195 |
else:
|
1196 |
new_items.append(item)
|
1197 |
-
|
1198 |
# Format the new list
|
1199 |
if len(new_items) == 1:
|
1200 |
new_list = new_items[0]
|
@@ -1202,10 +1390,10 @@ class EnhancedSceneDescriber:
|
|
1202 |
new_list = f"{new_items[0]} and {new_items[1]}"
|
1203 |
else:
|
1204 |
new_list = ", ".join(new_items[:-1]) + f", and {new_items[-1]}"
|
1205 |
-
|
1206 |
# Replace the original list
|
1207 |
description = description.replace(obj_list, new_list)
|
1208 |
-
|
1209 |
return description
|
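The intent of the list clean-up above, expressed as a standalone example (collapse_item_list is a hypothetical helper, not the class's own method):

    import re
    from collections import Counter

    def collapse_item_list(obj_list: str) -> str:
        """'chair, chair, tv' -> '2 chairs and tv' (simplified version of the logic above)."""
        items = [i.strip() for i in re.split(r",|\band\b", obj_list) if i.strip()]
        parts = [f"{n} {item}s" if n > 1 else item for item, n in Counter(items).items()]
        if not parts:
            return ""
        if len(parts) == 1:
            return parts[0]
        if len(parts) == 2:
            return f"{parts[0]} and {parts[1]}"
        return ", ".join(parts[:-1]) + f", and {parts[-1]}"

    # collapse_item_list("chair, chair, tv")  ->  "2 chairs and tv"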
1210 |
|
1211 |
def _describe_functional_zones(self, functional_zones: Dict) -> str:
|
@@ -1288,7 +1476,7 @@ class EnhancedSceneDescriber:
|
|
1288 |
|
1289 |
# Generate the final description based on the number of processed zones
|
1290 |
final_desc = ""
|
1291 |
-
|
1292 |
if len(processed_zones) == 1:
|
1293 |
_, zone_info = processed_zones[0]
|
1294 |
zone_desc = zone_info["description"]
|
|
|
126 |
}
|
127 |
}
|
128 |
|
129 |
+
# Cultural templates
|
130 |
if "cultural_templates" not in templates:
|
131 |
templates["cultural_templates"] = {
|
132 |
"asian": {
|
|
|
164 |
"elevated_threshold": 0.6, # Objects mostly in middle/bottom
|
165 |
"elevated_top_threshold": 0.3 # Few objects at top of frame
|
166 |
}
|
167 |
+
|
168 |
+
|
169 |
def generate_description(self,
|
170 |
scene_type: str,
|
171 |
detected_objects: List[Dict],
|
|
|
190 |
"""
|
191 |
# Handle unknown scene type or very low confidence
|
192 |
if scene_type == "unknown" or confidence < 0.4:
|
193 |
+
return self._format_final_description(self._generate_generic_description(detected_objects, lighting_info))
|
194 |
|
195 |
# Detect viewpoint
|
196 |
+
viewpoint = self._detect_viewpoint(detected_objects)
|
197 |
|
198 |
+
# Process aerial viewpoint scene types
|
199 |
if viewpoint == "aerial":
|
|
|
200 |
if "intersection" in scene_type or self._is_intersection(detected_objects):
|
201 |
scene_type = "aerial_view_intersection"
|
|
|
202 |
elif any(keyword in scene_type for keyword in ["commercial", "shopping", "retail"]):
|
203 |
scene_type = "aerial_view_commercial_area"
|
|
|
204 |
elif any(keyword in scene_type for keyword in ["plaza", "square"]):
|
205 |
scene_type = "aerial_view_plaza"
|
|
|
206 |
else:
|
207 |
scene_type = "aerial_view_intersection"
|
208 |
|
209 |
+
# Detect cultural context - only for non-aerial viewpoints
|
210 |
cultural_context = None
|
211 |
if viewpoint != "aerial":
|
212 |
cultural_context = self._detect_cultural_context(scene_type, detected_objects)
|
|
|
221 |
|
222 |
# Get base description for the scene type
|
223 |
if viewpoint == "aerial":
|
|
|
224 |
if 'base_description' not in locals():
|
225 |
base_description = "An aerial view showing the layout and movement patterns from above"
|
226 |
elif scene_type in self.scene_types:
|
|
|
236 |
viewpoint
|
237 |
)
|
238 |
|
239 |
+
# Start with the base description
|
240 |
+
description = base_description
|
241 |
+
|
242 |
+
# If there's a secondary description from the scene type template, append it properly
|
243 |
+
if scene_type in self.scene_types and "secondary_description" in self.scene_types[scene_type]:
|
244 |
+
secondary_desc = self.scene_types[scene_type]["secondary_description"]
|
245 |
+
if secondary_desc:
|
246 |
+
description = self._smart_append(description, secondary_desc)
|
247 |
+
|
248 |
+
# Improve description based on people count
|
249 |
+
people_objs = [obj for obj in detected_objects if obj["class_id"] == 0] # Person class
|
250 |
if people_objs:
|
251 |
people_count = len(people_objs)
|
252 |
if people_count > 5:
|
|
|
253 |
people_phrase = f"numerous people ({people_count})"
|
254 |
else:
|
255 |
people_phrase = f"{people_count} {'people' if people_count > 1 else 'person'}"
|
256 |
|
257 |
+
# Add people information to the scene details if not already mentioned
|
258 |
+
if "people" not in description.lower() and "pedestrian" not in description.lower():
|
259 |
+
description = self._smart_append(description, f"The scene includes {people_phrase}")
|
260 |
|
261 |
+
# Apply cultural context if detected (only for non-aerial viewpoints)
|
262 |
+
if cultural_context and viewpoint != "aerial":
|
263 |
cultural_elements = self._generate_cultural_elements(cultural_context)
|
264 |
if cultural_elements:
|
265 |
+
description = self._smart_append(description, cultural_elements)
|
266 |
+
|
267 |
+
# Now append the detailed scene information if available
|
268 |
+
if scene_details:
|
269 |
+
# Use smart_append to ensure proper formatting between base description and details
|
270 |
+
description = self._smart_append(description, scene_details)
|
271 |
|
272 |
# Include lighting information if available
|
273 |
lighting_description = ""
|
|
|
276 |
if lighting_type in self.templates.get("lighting_templates", {}):
|
277 |
lighting_description = self.templates["lighting_templates"][lighting_type]
|
278 |
|
279 |
+
# Add lighting description if available
|
280 |
+
if lighting_description and lighting_description not in description:
|
281 |
+
description = self._smart_append(description, lighting_description)
|
282 |
|
283 |
+
# Process viewpoint information
|
284 |
if viewpoint != "eye_level" and viewpoint in self.templates.get("viewpoint_templates", {}):
|
285 |
viewpoint_template = self.templates["viewpoint_templates"][viewpoint]
|
286 |
|
287 |
+
# Special handling for viewpoint prefix
|
288 |
+
prefix = viewpoint_template.get('prefix', '')
|
289 |
+
if prefix and not description.startswith(prefix):
|
290 |
+
# Prefix is a phrase like "From above, " that should precede the description
|
291 |
+
if description and description[0].isupper():
|
292 |
+
# Maintain the flow by lowercasing the first letter after the prefix
|
293 |
+
description = prefix + description[0].lower() + description[1:]
|
294 |
+
else:
|
295 |
+
description = prefix + description
|
296 |
+
|
297 |
+
# Get appropriate scene elements description based on viewpoint
|
298 |
if viewpoint == "aerial":
|
299 |
scene_elements = "the crossing patterns and pedestrian movement"
|
300 |
else:
|
|
|
304 |
scene_elements=scene_elements
|
305 |
)
|
306 |
|
307 |
# Add viewpoint observation if not already included
|
308 |
+
if viewpoint_desc and viewpoint_desc not in description:
|
309 |
+
description = self._smart_append(description, viewpoint_desc)
|
310 |
|
311 |
# Add information about functional zones if available
|
312 |
if functional_zones and len(functional_zones) > 0:
|
313 |
zones_desc = self._describe_functional_zones(functional_zones)
|
314 |
if zones_desc:
|
315 |
+
description = self._smart_append(description, zones_desc)
|
316 |
|
317 |
+
# Calculate actual people count
|
318 |
people_count = len([obj for obj in detected_objects if obj["class_id"] == 0])
|
319 |
|
320 |
+
# Check for inconsistencies in people count descriptions
|
321 |
if people_count > 5:
|
322 |
+
# Identify fragments that might contain smaller people counts
|
323 |
small_people_patterns = [
|
324 |
r"Area with \d+ people\.",
|
325 |
r"Area with \d+ person\.",
|
326 |
r"with \d+ people",
|
327 |
r"with \d+ person"
|
328 |
]
|
329 |
+
|
330 |
+
# Check and remove each pattern
|
331 |
filtered_description = description
|
332 |
for pattern in small_people_patterns:
|
333 |
matches = re.findall(pattern, filtered_description)
|
334 |
for match in matches:
|
335 |
+
# Extract the number from the match
|
336 |
number_match = re.search(r'\d+', match)
|
337 |
if number_match:
|
338 |
try:
|
339 |
people_mentioned = int(number_match.group())
|
340 |
+
# If the mentioned count is less than total, remove the entire sentence
|
341 |
if people_mentioned < people_count:
|
342 |
+
# Split description into sentences
|
343 |
sentences = re.split(r'(?<=[.!?])\s+', filtered_description)
|
344 |
+
# Remove sentences containing the match
|
345 |
filtered_sentences = []
|
346 |
for sentence in sentences:
|
347 |
if match not in sentence:
|
348 |
filtered_sentences.append(sentence)
|
349 |
+
# Recombine the description
|
350 |
filtered_description = " ".join(filtered_sentences)
|
351 |
except ValueError:
|
352 |
+
# Failed number conversion, continue processing
|
353 |
continue
|
354 |
|
355 |
+
# Use the filtered description
|
356 |
description = filtered_description
|
357 |
|
358 |
+
# Final formatting to ensure correct punctuation and capitalization
|
359 |
+
description = self._format_final_description(description)
|
360 |
+
|
361 |
return description
|
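A self-contained sketch of the people-count consistency filter used in generate_description above: sentences that mention a smaller count than the detected total are dropped. The sample text and helper name are made up for illustration.

import re

def drop_undercount_sentences(description: str, people_count: int) -> str:
    # Remove sentences that mention fewer people than were actually detected.
    sentences = re.split(r'(?<=[.!?])\s+', description)
    kept = []
    for sentence in sentences:
        m = re.search(r'with (\d+) (?:people|person)', sentence)
        if m and int(m.group(1)) < people_count:
            continue  # contradicts the global count, skip this sentence
        kept.append(sentence)
    return " ".join(kept)

text = "A busy plaza. Area with 2 people. Pedestrians move between the stalls."
print(drop_undercount_sentences(text, 9))
# -> "A busy plaza. Pedestrians move between the stalls."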
362 |
|
363 |
+
def _smart_append(self, current_text: str, new_fragment: str) -> str:
|
364 |
+
"""
|
365 |
+
Intelligently append a new text fragment to the current text,
|
366 |
+
handling punctuation and capitalization correctly.
|
367 |
+
|
368 |
+
Args:
|
369 |
+
current_text: The existing text to append to
|
370 |
+
new_fragment: The new text fragment to append
|
371 |
+
|
372 |
+
Returns:
|
373 |
+
str: The combined text with proper formatting
|
374 |
+
"""
|
375 |
+
# Handle empty cases
|
376 |
+
if not new_fragment:
|
377 |
+
return current_text
|
378 |
+
|
379 |
+
if not current_text:
|
380 |
+
# Ensure first character is uppercase for the first fragment
|
381 |
+
return new_fragment[0].upper() + new_fragment[1:] if new_fragment else ""
|
382 |
+
|
383 |
+
# Clean up existing text
|
384 |
+
current_text = current_text.rstrip()
|
385 |
+
|
386 |
+
# Check for ending punctuation
|
387 |
+
ends_with_sentence = current_text.endswith(('.', '!', '?'))
|
388 |
+
ends_with_comma = current_text.endswith(',')
|
389 |
+
|
390 |
+
# Specifically handle the "A xxx A yyy" pattern that's causing issues
|
391 |
+
if (current_text.startswith("A ") or current_text.startswith("An ")) and \
|
392 |
+
(new_fragment.startswith("A ") or new_fragment.startswith("An ")):
|
393 |
+
return current_text + ". " + new_fragment
|
394 |
+
|
395 |
+
# Decide how to join the texts
|
396 |
+
if ends_with_sentence:
|
397 |
+
# After a sentence, start with uppercase and add proper spacing
|
398 |
+
joined_text = current_text + " " + (new_fragment[0].upper() + new_fragment[1:])
|
399 |
+
elif ends_with_comma:
|
400 |
+
# After a comma, maintain flow with lowercase unless it's a proper noun or special case
|
401 |
+
if new_fragment.startswith(('I ', 'I\'', 'A ', 'An ', 'The ')) or new_fragment[0].isupper():
|
402 |
+
joined_text = current_text + " " + new_fragment
|
403 |
+
else:
|
404 |
+
joined_text = current_text + " " + new_fragment[0].lower() + new_fragment[1:]
|
405 |
+
elif "scene is" in new_fragment.lower() or "scene includes" in new_fragment.lower():
|
406 |
+
# When adding a new sentence about the scene, use a period
|
407 |
+
joined_text = current_text + ". " + new_fragment
|
408 |
+
else:
|
409 |
+
# For other cases, decide based on the content
|
410 |
+
if self._is_related_phrases(current_text, new_fragment):
|
411 |
+
if new_fragment.startswith(('I ', 'I\'', 'A ', 'An ', 'The ')) or new_fragment[0].isupper():
|
412 |
+
joined_text = current_text + ", " + new_fragment
|
413 |
+
else:
|
414 |
+
joined_text = current_text + ", " + new_fragment[0].lower() + new_fragment[1:]
|
415 |
+
else:
|
416 |
+
# Use period for unrelated phrases
|
417 |
+
joined_text = current_text + ". " + (new_fragment[0].upper() + new_fragment[1:])
|
418 |
+
|
419 |
+
return joined_text
|
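The joining rules can be exercised in isolation; below is a reduced stand-in covering three of the branches above (sentence end, trailing comma, and back-to-back "A ..."/"An ..." fragments), not the full method:

def smart_append(current: str, fragment: str) -> str:
    # Reduced version of the joining logic: pick ". ", " " after a comma, or a new sentence.
    if not fragment:
        return current
    if not current:
        return fragment[0].upper() + fragment[1:]
    current = current.rstrip()
    starts_article = lambda s: s.startswith(("A ", "An "))
    if starts_article(current) and starts_article(fragment):
        return current + ". " + fragment                           # two standalone descriptions
    if current.endswith((".", "!", "?")):
        return current + " " + fragment[0].upper() + fragment[1:]  # new sentence
    if current.endswith(","):
        return current + " " + fragment[0].lower() + fragment[1:]  # continue the clause
    return current + ". " + fragment[0].upper() + fragment[1:]

print(smart_append("A busy street corner", "A group of pedestrians waits to cross"))
# -> "A busy street corner. A group of pedestrians waits to cross"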
420 |
+
|
421 |
+
def _is_related_phrases(self, text1: str, text2: str) -> bool:
|
422 |
+
"""
|
423 |
+
Determine if two phrases are related and should be connected with a comma
|
424 |
+
rather than separated with a period.
|
425 |
+
|
426 |
+
Args:
|
427 |
+
text1: The first text fragment
|
428 |
+
text2: The second text fragment to be appended
|
429 |
+
|
430 |
+
Returns:
|
431 |
+
bool: Whether the phrases appear to be related
|
432 |
+
"""
|
433 |
+
# Check if either phrase starts with "A" or "An" - these are likely separate descriptions
|
434 |
+
if (text1.startswith("A ") or text1.startswith("An ")) and \
|
435 |
+
(text2.startswith("A ") or text2.startswith("An ")):
|
436 |
+
return False # These are separate descriptions, not related phrases
|
437 |
+
|
438 |
+
# Check if the second phrase starts with a connecting word
|
439 |
+
connecting_words = ["which", "where", "who", "whom", "whose", "with", "without",
|
440 |
+
"this", "these", "that", "those", "and", "or", "but"]
|
441 |
+
|
442 |
+
first_word = text2.split()[0].lower() if text2 else ""
|
443 |
+
if first_word in connecting_words:
|
444 |
+
return True
|
445 |
+
|
446 |
+
# Check if the first phrase ends with something that suggests continuity
|
447 |
+
ending_patterns = ["such as", "including", "like", "especially", "particularly",
|
448 |
+
"for example", "for instance", "namely", "specifically"]
|
449 |
+
|
450 |
+
for pattern in ending_patterns:
|
451 |
+
if text1.lower().endswith(pattern):
|
452 |
+
return True
|
453 |
+
|
454 |
+
# Check if both phrases are about the scene
|
455 |
+
if "scene" in text1.lower() and "scene" in text2.lower():
|
456 |
+
return False # Separate statements about the scene should be separate sentences
|
457 |
+
|
458 |
+
return False
|
459 |
+
|
460 |
+
def _format_final_description(self, text: str) -> str:
|
461 |
+
"""
|
462 |
+
Format the final description text to ensure correct punctuation,
|
463 |
+
capitalization, and spacing.
|
464 |
+
|
465 |
+
Args:
|
466 |
+
text: The text to format
|
467 |
+
|
468 |
+
Returns:
|
469 |
+
str: The properly formatted text
|
470 |
+
"""
|
471 |
+
import re
|
472 |
+
|
473 |
+
if not text:
|
474 |
+
return ""
|
475 |
+
|
476 |
+
# 1. 特別處理連續以"A"開頭的片段 (這是一個常見問題)
|
477 |
+
text = re.sub(r'(A\s[^.!?]+?)\s+(A\s)', r'\1. \2', text, flags=re.IGNORECASE)
|
478 |
+
text = re.sub(r'(An\s[^.!?]+?)\s+(An?\s)', r'\1. \2', text, flags=re.IGNORECASE)
|
479 |
+
|
480 |
+
# 2. 確保第一個字母大寫
|
481 |
+
text = text[0].upper() + text[1:] if text else ""
|
482 |
+
|
483 |
+
# 3. 修正詞之間的空格問題
|
484 |
+
text = re.sub(r'\s{2,}', ' ', text) # 多個空格改為一個
|
485 |
+
text = re.sub(r'([a-z])([A-Z])', r'\1 \2', text) # 小寫後大寫間加空格
|
486 |
+
|
487 |
+
# 4. 修正詞連接問題
|
488 |
+
text = re.sub(r'([a-zA-Z])and', r'\1 and', text) # "xxx"和"and"間加空格
|
489 |
+
text = re.sub(r'([a-zA-Z])with', r'\1 with', text) # "xxx"和"with"間加空格
|
490 |
+
text = re.sub(r'plants(and|with|or)', r'plants \1', text) # 修正"plantsand"這類問題
|
491 |
+
|
492 |
+
# 5. 修正標點符號後的大小寫問題
|
493 |
+
text = re.sub(r'\.(\s+)([a-z])', lambda m: f'.{m.group(1)}{m.group(2).upper()}', text) # 句號後大寫
|
494 |
+
|
495 |
+
# 6. 修正逗號後接大寫單詞的問題
|
496 |
+
def fix_capitalization_after_comma(match):
|
497 |
+
word = match.group(2)
|
498 |
+
# 例外情況:保留專有名詞、人稱代詞等的大寫
|
499 |
+
if word in ["I", "I'm", "I've", "I'd", "I'll"]:
|
500 |
+
return match.group(0) # 保持原樣
|
501 |
+
|
502 |
+
# 保留月份、星期、地名等專有名詞的大寫
|
503 |
+
proper_nouns = ["January", "February", "March", "April", "May", "June", "July",
|
504 |
+
"August", "September", "October", "November", "December",
|
505 |
+
"Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
|
506 |
+
if word in proper_nouns:
|
507 |
+
return match.group(0) # 保持原樣
|
508 |
+
|
509 |
+
# 其他情況:將首字母改為小寫
|
510 |
+
return match.group(1) + word[0].lower() + word[1:]
|
511 |
+
|
512 |
+
# 匹配逗號後接空格再接大寫單詞的模式
|
513 |
+
text = re.sub(r'(,\s+)([A-Z][a-zA-Z]*)', fix_capitalization_after_comma, text)
|
514 |
+
|
515 |
+
|
516 |
+
common_phrases = [
|
517 |
+
(r'Social or seating area', r'social or seating area'),
|
518 |
+
(r'Sleeping area', r'sleeping area'),
|
519 |
+
(r'Dining area', r'dining area'),
|
520 |
+
(r'Living space', r'living space')
|
521 |
+
]
|
522 |
+
|
523 |
+
for phrase, replacement in common_phrases:
|
524 |
+
# 只修改句中的術語,保留句首的大寫
|
525 |
+
text = re.sub(r'(?<=[.!?]\s)' + phrase, replacement, text)
|
526 |
+
# 修改句中的術語,但保留句首的大寫
|
527 |
+
text = re.sub(r'(?<=,\s)' + phrase, replacement, text)
|
528 |
+
|
529 |
+
# 7. 確保標點符號後有空格
|
530 |
+
text = re.sub(r'\s+([.,;:!?])', r'\1', text) # 標點符號前不要空格
|
531 |
+
text = re.sub(r'([.,;:!?])([a-zA-Z0-9])', r'\1 \2', text) # 標點符號後要有空格
|
532 |
+
|
533 |
+
# 8. 修正重複標點符號
|
534 |
+
text = re.sub(r'\.{2,}', '.', text) # 多個句號變一個
|
535 |
+
text = re.sub(r',{2,}', ',', text) # 多個逗號變一個
|
536 |
+
|
537 |
+
# 9. 確保文本以標點結束
|
538 |
+
if text and not text[-1] in '.!?':
|
539 |
+
text += '.'
|
540 |
+
|
541 |
+
return text
|
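Two of the clean-up passes above in isolation, applied to a made-up example of the kind of glitch being repaired:

import re

text = "A modern living room A sofa sits near the window with plantsand books"
# Split consecutive "A ..." fragments into separate sentences.
text = re.sub(r'(A\s[^.!?]+?)\s+(A\s)', r'\1. \2', text)
# Re-insert the space lost between a word and "and".
text = re.sub(r'([a-zA-Z])and', r'\1 and', text)
print(text)
# -> "A modern living room. A sofa sits near the window with plants and books"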
542 |
+
|
543 |
def _is_intersection(self, detected_objects: List[Dict]) -> bool:
|
544 |
"""
|
545 |
通過分析物體分佈來判斷場景是否為十字路口
|
546 |
"""
|
547 |
# 檢查行人分佈模式
|
548 |
pedestrians = [obj for obj in detected_objects if obj["class_id"] == 0]
|
549 |
+
|
550 |
if len(pedestrians) >= 8: # 需要足夠的行人來形成十字路口
|
551 |
# 抓取行人位置
|
552 |
positions = [obj.get("normalized_center", (0, 0)) for obj in pedestrians]
|
553 |
+
|
554 |
# 分析 x 和 y 坐標分佈
|
555 |
x_coords = [pos[0] for pos in positions]
|
556 |
y_coords = [pos[1] for pos in positions]
|
557 |
+
|
558 |
# 計算 x 和 y 坐標的變異數
|
559 |
x_variance = np.var(x_coords) if len(x_coords) > 1 else 0
|
560 |
y_variance = np.var(y_coords) if len(y_coords) > 1 else 0
|
561 |
+
|
562 |
# 計算範圍
|
563 |
x_range = max(x_coords) - min(x_coords)
|
564 |
y_range = max(y_coords) - min(y_coords)
|
565 |
+
|
566 |
# 如果 x 和 y 方向都有較大範圍且範圍相似,那就有可能是十字路口
|
567 |
if x_range > 0.5 and y_range > 0.5 and 0.7 < (x_range / y_range) < 1.3:
|
568 |
return True
|
569 |
+
|
570 |
return False
|
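The intersection heuristic reduces to "enough pedestrians, spread widely and roughly equally along both axes"; a small numeric sketch with invented normalized centers:

def looks_like_intersection(centers, min_people=8):
    # centers: list of (x, y) positions normalized to [0, 1]
    if len(centers) < min_people:
        return False
    xs = [c[0] for c in centers]
    ys = [c[1] for c in centers]
    x_range, y_range = max(xs) - min(xs), max(ys) - min(ys)
    # wide spread on both axes, and the two spreads are of similar size
    return x_range > 0.5 and y_range > 0.5 and 0.7 < (x_range / y_range) < 1.3

centers = [(0.1, 0.5), (0.9, 0.5), (0.5, 0.1), (0.5, 0.9),
           (0.3, 0.5), (0.7, 0.5), (0.5, 0.3), (0.5, 0.7)]
print(looks_like_intersection(centers))  # -> True (cross-shaped spread)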
571 |
|
572 |
def _generate_generic_description(self, detected_objects: List[Dict], lighting_info: Optional[Dict] = None) -> str:
|
|
|
1353 |
優化物品描述,避免重複列舉相同物品
|
1354 |
"""
|
1355 |
import re
|
1356 |
+
|
1357 |
# 處理床鋪重複描述
|
1358 |
if "bed in the room" in description:
|
1359 |
description = description.replace("a bed in the room", "a bed")
|
1360 |
+
|
1361 |
# 處理重複的物品列表
|
1362 |
# 尋找格式如 "item, item, item" 的模式
|
1363 |
object_lists = re.findall(r'with ([^\.]+?)(?:\.|\band\b)', description)
|
1364 |
+
|
1365 |
for obj_list in object_lists:
|
1366 |
# 計算每個物品出現次數
|
1367 |
items = re.findall(r'([a-zA-Z\s]+)(?:,|\band\b|$)', obj_list)
|
1368 |
item_counts = {}
|
1369 |
+
|
1370 |
for item in items:
|
1371 |
item = item.strip()
|
1372 |
if item and item not in ["and", "with"]:
|
1373 |
if item not in item_counts:
|
1374 |
item_counts[item] = 0
|
1375 |
item_counts[item] += 1
|
1376 |
+
|
1377 |
# 生成優化後的物品列表
|
1378 |
if item_counts:
|
1379 |
new_items = []
|
|
|
1382 |
new_items.append(f"{count} {item}s")
|
1383 |
else:
|
1384 |
new_items.append(item)
|
1385 |
+
|
1386 |
# 格式化新列表
|
1387 |
if len(new_items) == 1:
|
1388 |
new_list = new_items[0]
|
|
|
1390 |
new_list = f"{new_items[0]} and {new_items[1]}"
|
1391 |
else:
|
1392 |
new_list = ", ".join(new_items[:-1]) + f", and {new_items[-1]}"
|
1393 |
+
|
1394 |
# 替換原始列表
|
1395 |
description = description.replace(obj_list, new_list)
|
1396 |
+
|
1397 |
return description
|
1398 |
|
1399 |
def _describe_functional_zones(self, functional_zones: Dict) -> str:
|
|
|
1476 |
|
1477 |
# 根據處理後的區域數量生成最終描述
|
1478 |
final_desc = ""
|
1479 |
+
|
1480 |
if len(processed_zones) == 1:
|
1481 |
_, zone_info = processed_zones[0]
|
1482 |
zone_desc = zone_info["description"]
|
lighting_analyzer.py
CHANGED
@@ -151,11 +151,11 @@ class LightingAnalyzer:
|
|
151 |
|
152 |
avg_saturation = np.mean(s_channel)
|
153 |
|
154 |
-
# 天空亮度
|
155 |
upper_half = v_channel[:height//2, :]
|
156 |
sky_brightness = np.mean(upper_half)
|
157 |
|
158 |
-
# 色調分析
|
159 |
warm_colors = ((h_channel >= 0) & (h_channel <= 60)) | (h_channel >= 300)
|
160 |
warm_ratio = np.sum(warm_colors) / (height * width)
|
161 |
|
@@ -186,16 +186,16 @@ class LightingAnalyzer:
|
|
186 |
top_scale = scale_factor * 2 # 更積極的下採樣
|
187 |
top_region = v_channel[:height//4:top_scale, ::top_scale]
|
188 |
top_region_std = np.std(top_region)
|
189 |
-
ceiling_uniformity = 1.0 - min(1, top_region_std / max(np.mean(top_region), 1e-5))
|
190 |
|
191 |
# 使用更簡單的方法檢測上部水平線
|
192 |
top_gradients = np.abs(gy[:small_gray.shape[0]//4, :])
|
193 |
horizontal_lines_strength = np.mean(top_gradients)
|
194 |
# 標準化
|
195 |
-
horizontal_line_ratio = min(1, horizontal_lines_strength / 40)
|
196 |
|
197 |
# 極簡的亮點檢測
|
198 |
-
sampled_v = v_channel[::scale_factor*2, ::scale_factor*2]
|
199 |
light_threshold = min(220, avg_brightness + 2*brightness_std)
|
200 |
is_bright = sampled_v > light_threshold
|
201 |
bright_spot_count = np.sum(is_bright)
|
@@ -203,7 +203,7 @@ class LightingAnalyzer:
|
|
203 |
# 圓形光源分析的簡化替代方法
|
204 |
circular_light_score = 0
|
205 |
indoor_light_score = 0
|
206 |
-
light_distribution_uniformity = 0.5
|
207 |
|
208 |
# 只有當檢測到亮點,且不是大量亮點時(可能是室外光反射)才進行光源分析
|
209 |
if 1 < bright_spot_count < 20:
|
@@ -227,7 +227,7 @@ class LightingAnalyzer:
|
|
227 |
indoor_light_score = 0.3
|
228 |
|
229 |
# 使用邊緣區域梯度來快速估計邊界
|
230 |
-
edge_scale = scale_factor * 2
|
231 |
|
232 |
# 只採樣圖像邊緣部分進行分析
|
233 |
left_edge = small_gray[:, :small_gray.shape[1]//6]
|
@@ -240,15 +240,15 @@ class LightingAnalyzer:
|
|
240 |
top_gradient = np.mean(np.abs(cv2.Sobel(top_edge, cv2.CV_32F, 0, 1, ksize=3)))
|
241 |
|
242 |
# 標準化
|
243 |
-
left_edge_density = min(1.0, left_gradient / 50)
|
244 |
-
right_edge_density = min(1.0, right_gradient / 50)
|
245 |
-
top_edge_density = min(1.0, top_gradient / 50)
|
246 |
|
247 |
# 封閉環境通常在圖像邊緣有較強的梯度
|
248 |
boundary_edge_score = (left_edge_density + right_edge_density + top_edge_density) / 3
|
249 |
|
250 |
# 簡單估計整體邊緣密度
|
251 |
-
edges_density = min(1, (np.mean(np.abs(gx)) + np.mean(np.abs(gy))) / 100)
|
252 |
|
253 |
street_line_score = 0
|
254 |
|
@@ -319,16 +319,16 @@ class LightingAnalyzer:
|
|
319 |
# 1. 藍色區域(天空)特徵 - 藍色區域多通常表示室外
|
320 |
if features.get("blue_ratio", 0) > 0.2:
|
321 |
# 檢查是否有室內指標,如果有明顯的室內特徵,則減少藍色的負面影響
|
322 |
-
if (features.get("ceiling_uniformity", 0) > 0.5 or
|
323 |
-
features.get("boundary_edge_score", 0) > 0.3 or
|
324 |
features.get("indoor_light_score", 0) > 0.2 or
|
325 |
features.get("bright_spot_count", 0) > 0):
|
326 |
-
blue_score = -weights["blue_ratio"] * features["blue_ratio"] * 8
|
327 |
else:
|
328 |
blue_score = -weights["blue_ratio"] * features["blue_ratio"] * 15
|
329 |
else:
|
330 |
blue_score = -weights["blue_ratio"] * features["blue_ratio"] * 15
|
331 |
-
|
332 |
indoor_score += blue_score
|
333 |
feature_contributions["blue_ratio"] = blue_score
|
334 |
|
@@ -351,14 +351,14 @@ class LightingAnalyzer:
|
|
351 |
horizontal_line_ratio = features.get("horizontal_line_ratio", 0)
|
352 |
|
353 |
# 增強天花板檢測的影響
|
354 |
-
if ceiling_uniformity > 0.5:
|
355 |
-
ceiling_weight = 3
|
356 |
-
ceiling_contribution = weights.get("ceiling_features", 1.5) * ceiling_weight
|
357 |
if horizontal_line_ratio > 0.2: # 如果有水平線條,進一步增強
|
358 |
-
ceiling_contribution *= 1.5
|
359 |
-
elif ceiling_uniformity > 0.4:
|
360 |
-
ceiling_contribution = weights.get("ceiling_features", 1.5) * 1.2
|
361 |
-
|
362 |
indoor_score += ceiling_contribution
|
363 |
feature_contributions["ceiling_features"] = ceiling_contribution
|
364 |
|
@@ -370,7 +370,7 @@ class LightingAnalyzer:
|
|
370 |
|
371 |
# 加強對特定類型光源的檢測
|
372 |
if circular_light_count >= 1: # 即便只有一個圓形光源也很可能是室內
|
373 |
-
light_contribution = weights.get("light_features", 1.2) * 2.0
|
374 |
elif indoor_light_score > 0.3:
|
375 |
light_contribution = weights.get("light_features", 1.2) * 1.0
|
376 |
|
@@ -384,11 +384,11 @@ class LightingAnalyzer:
|
|
384 |
edges_density = features.get("edges_density", 0)
|
385 |
|
386 |
# 高邊界評分暗示封閉環境(室內)
|
387 |
-
if boundary_edge_score > 0.3:
|
388 |
-
boundary_contribution = weights.get("boundary_features", 1.2) * 2
|
389 |
-
elif boundary_edge_score > 0.2:
|
390 |
-
boundary_contribution = weights.get("boundary_features", 1.2) * 1.2
|
391 |
-
|
392 |
indoor_score += boundary_contribution
|
393 |
feature_contributions["boundary_features"] = boundary_contribution
|
394 |
|
@@ -415,7 +415,7 @@ class LightingAnalyzer:
|
|
415 |
combined_uniformity = (features["brightness_uniformity"] +
|
416 |
features.get("ceiling_uniformity", 0)) / 2
|
417 |
|
418 |
-
if combined_uniformity > 0.5:
|
419 |
gradient_contribution = weights["gradient_ratio"] * 0.7
|
420 |
else:
|
421 |
gradient_contribution = -weights["gradient_ratio"] * 0.3
|
@@ -430,7 +430,7 @@ class LightingAnalyzer:
|
|
430 |
|
431 |
# 調整亮點分析邏輯
|
432 |
if circular_light_count >= 1: # 即使只有一個圓形光源
|
433 |
-
bright_spot_contribution = weights["bright_spots"] * 1.5
|
434 |
elif bright_spot_count < 5: # 適當放寬閾值
|
435 |
bright_spot_contribution = weights["bright_spots"] * 0.5
|
436 |
elif bright_spot_count > 15: # 大量亮點比較有可能為室外
|
@@ -441,8 +441,8 @@ class LightingAnalyzer:
|
|
441 |
|
442 |
# 8. 色調分析
|
443 |
yellow_contribution = 0
|
444 |
-
if features["avg_brightness"] < 150 and features["yellow_orange_ratio"] > 0.15:
|
445 |
-
if features.get("indoor_light_score", 0) > 0.2:
|
446 |
yellow_contribution = weights["color_tone"] * 0.8
|
447 |
else:
|
448 |
yellow_contribution = weights["color_tone"] * 0.5
|
@@ -452,10 +452,10 @@ class LightingAnalyzer:
|
|
452 |
|
453 |
if features.get("blue_ratio", 0) > 0.7:
|
454 |
# 檢查是否有室內指標,如果有明顯的室內特徵,則減少藍色的負面影響
|
455 |
-
if (features.get("ceiling_uniformity", 0) > 0.6 or
|
456 |
-
features.get("boundary_edge_score", 0) > 0.3 or
|
457 |
features.get("indoor_light_score", 0) > 0):
|
458 |
-
blue_score = -weights["blue_ratio"] * features["blue_ratio"] * 10
|
459 |
else:
|
460 |
blue_score = -weights["blue_ratio"] * features["blue_ratio"] * 18
|
461 |
else:
|
@@ -534,19 +534,19 @@ class LightingAnalyzer:
|
|
534 |
# 1: 窗戶和牆壁形成的直角
|
535 |
if features.get("brightness_uniformity", 0) > 0.6 and features.get("boundary_edge_score", 0) > 0.3:
|
536 |
bedroom_indicators += 1.5 # 增加權重
|
537 |
-
|
538 |
# 2: 天花板和光源
|
539 |
if features.get("ceiling_uniformity", 0) > 0.5 and features.get("bright_spot_count", 0) > 0:
|
540 |
-
bedroom_indicators += 2.5
|
541 |
-
|
542 |
# 3: 良好對比度的牆壁顏色,適合臥房還有客廳
|
543 |
if features.get("brightness_uniformity", 0) > 0.6 and features.get("avg_saturation", 0) < 100:
|
544 |
-
bedroom_indicators += 1.5
|
545 |
-
|
546 |
# 特殊的檢測 4: 檢測窗戶
|
547 |
if features.get("boundary_edge_score", 0) > 0.25 and features.get("brightness_std", 0) > 40:
|
548 |
-
bedroom_indicators += 1.5
|
549 |
-
|
550 |
# 如果滿足足夠的家居指標,提高多點室內判斷分數
|
551 |
if bedroom_indicators >= 3:
|
552 |
# 增加家居環境評分
|
@@ -576,11 +576,11 @@ class LightingAnalyzer:
|
|
576 |
def _determine_lighting_conditions(self, features, is_indoor):
|
577 |
"""
|
578 |
基於特徵和室內/室外判斷確定光照條件。
|
579 |
-
|
580 |
Args:
|
581 |
features: 特徵字典
|
582 |
is_indoor: 是否是室內環境
|
583 |
-
|
584 |
Returns:
|
585 |
Dict: 光照條件分析結果
|
586 |
"""
|
@@ -588,37 +588,37 @@ class LightingAnalyzer:
|
|
588 |
time_of_day = "unknown"
|
589 |
confidence = 0.5
|
590 |
diagnostics = {}
|
591 |
-
|
592 |
avg_brightness = features["avg_brightness"]
|
593 |
dark_pixel_ratio = features["dark_pixel_ratio"]
|
594 |
yellow_orange_ratio = features["yellow_orange_ratio"]
|
595 |
blue_ratio = features["blue_ratio"]
|
596 |
gray_ratio = features["gray_ratio"]
|
597 |
-
|
598 |
# 基於室內/室外分別判斷
|
599 |
if is_indoor:
|
600 |
# 計算室內住宅自然光指標
|
601 |
natural_window_light = 0
|
602 |
-
|
603 |
# 檢查窗戶特徵和光線特性
|
604 |
-
if (features.get("blue_ratio", 0) > 0.1 and
|
605 |
features.get("sky_brightness", 0) > avg_brightness * 1.1):
|
606 |
natural_window_light += 1
|
607 |
-
|
608 |
# 檢查均勻柔和的光線分布
|
609 |
-
if (features.get("brightness_uniformity", 0) > 0.65 and
|
610 |
features.get("brightness_std", 0) < 70):
|
611 |
natural_window_light += 1
|
612 |
-
|
613 |
# 檢查暖色調比例
|
614 |
if features.get("warm_ratio", 0) > 0.2:
|
615 |
natural_window_light += 1
|
616 |
-
|
617 |
# 家居環境指標
|
618 |
home_env_score = features.get("home_environment_pattern", 0)
|
619 |
if home_env_score > 1.5:
|
620 |
natural_window_light += 1
|
621 |
-
|
622 |
# 1. 室內明亮環境,可能有窗戶自然光
|
623 |
if avg_brightness > 130:
|
624 |
# 檢測自然光住宅空間 - 新增類型!
|
@@ -645,7 +645,7 @@ class LightingAnalyzer:
|
|
645 |
time_of_day = "indoor_dim"
|
646 |
confidence = 0.65 + dark_pixel_ratio / 3
|
647 |
diagnostics["reason"] = "Low brightness in indoor environment"
|
648 |
-
|
649 |
# 1. 檢測設計師風格住宅,可以偵測到比較多種類的狀況
|
650 |
designer_residential_score = 0
|
651 |
# 檢測特色燈具
|
@@ -660,19 +660,19 @@ class LightingAnalyzer:
|
|
660 |
# 檢測家居環境特徵
|
661 |
if home_env_score > 1.5:
|
662 |
designer_residential_score += 1
|
663 |
-
|
664 |
if designer_residential_score >= 3 and home_env_score > 1.5:
|
665 |
-
time_of_day = "indoor_designer_residential"
|
666 |
confidence = 0.85
|
667 |
diagnostics["special_case"] = "Designer residential lighting with decorative elements"
|
668 |
-
|
669 |
# 2. 檢測餐廳/酒吧場景
|
670 |
elif avg_brightness < 150 and yellow_orange_ratio > 0.2:
|
671 |
if features["warm_ratio"] > 0.4:
|
672 |
time_of_day = "indoor_restaurant"
|
673 |
confidence = 0.65 + yellow_orange_ratio / 4
|
674 |
diagnostics["special_case"] = "Warm, yellow-orange lighting suggests restaurant/bar setting"
|
675 |
-
|
676 |
# 3. 檢測商業照明空間
|
677 |
elif avg_brightness > 120 and features["bright_spot_count"] > 4:
|
678 |
# 增加商業照明判別的精確度
|
@@ -685,7 +685,7 @@ class LightingAnalyzer:
|
|
685 |
# 整體照明結構化布局
|
686 |
if features.get("light_distribution_uniformity", 0) > 0.6:
|
687 |
commercial_score += 0.5
|
688 |
-
|
689 |
if commercial_score > 0.6 and designer_residential_score < 3:
|
690 |
time_of_day = "indoor_commercial"
|
691 |
confidence = 0.7 + commercial_score / 5
|
@@ -794,18 +794,18 @@ class LightingAnalyzer:
|
|
794 |
"""
|
795 |
return {
|
796 |
"indoor_outdoor_weights": {
|
797 |
-
"blue_ratio": 0.6,
|
798 |
-
"brightness_uniformity": 1.2,
|
799 |
-
"gradient_ratio": 0.7,
|
800 |
-
"bright_spots": 0.8,
|
801 |
-
"color_tone": 0.5,
|
802 |
-
"sky_brightness": 0.9,
|
803 |
-
"brightness_variation": 0.7,
|
804 |
-
"ceiling_features": 1.5,
|
805 |
-
"light_features": 1.1,
|
806 |
-
"boundary_features": 2.8,
|
807 |
-
"street_features": 2
|
808 |
-
"building_features": 1.6
|
809 |
},
|
810 |
"include_diagnostics": True
|
811 |
}
|
|
|
151 |
|
152 |
avg_saturation = np.mean(s_channel)
|
153 |
|
154 |
+
# 天空亮度
|
155 |
upper_half = v_channel[:height//2, :]
|
156 |
sky_brightness = np.mean(upper_half)
|
157 |
|
158 |
+
# 色調分析
|
159 |
warm_colors = ((h_channel >= 0) & (h_channel <= 60)) | (h_channel >= 300)
|
160 |
warm_ratio = np.sum(warm_colors) / (height * width)
|
161 |
|
|
|
186 |
top_scale = scale_factor * 2 # 更積極的下採樣
|
187 |
top_region = v_channel[:height//4:top_scale, ::top_scale]
|
188 |
top_region_std = np.std(top_region)
|
189 |
+
ceiling_uniformity = 1.0 - min(1, top_region_std / max(np.mean(top_region), 1e-5))
|
190 |
|
191 |
# 使用更簡單的方法檢測上部水平線
|
192 |
top_gradients = np.abs(gy[:small_gray.shape[0]//4, :])
|
193 |
horizontal_lines_strength = np.mean(top_gradients)
|
194 |
# 標準化
|
195 |
+
horizontal_line_ratio = min(1, horizontal_lines_strength / 40)
|
196 |
|
197 |
# 極簡的亮點檢測
|
198 |
+
sampled_v = v_channel[::scale_factor*2, ::scale_factor*2]
|
199 |
light_threshold = min(220, avg_brightness + 2*brightness_std)
|
200 |
is_bright = sampled_v > light_threshold
|
201 |
bright_spot_count = np.sum(is_bright)
|
|
|
203 |
# 圓形光源分析的簡化替代方法
|
204 |
circular_light_score = 0
|
205 |
indoor_light_score = 0
|
206 |
+
light_distribution_uniformity = 0.5
|
207 |
|
208 |
# 只有當檢測到亮點,且不是大量亮點時(可能是室外光反射)才進行光源分析
|
209 |
if 1 < bright_spot_count < 20:
|
|
|
227 |
indoor_light_score = 0.3
|
228 |
|
229 |
# 使用邊緣區域梯度來快速估計邊界
|
230 |
+
edge_scale = scale_factor * 2
|
231 |
|
232 |
# 只採樣圖像邊緣部分進行分析
|
233 |
left_edge = small_gray[:, :small_gray.shape[1]//6]
|
|
|
240 |
top_gradient = np.mean(np.abs(cv2.Sobel(top_edge, cv2.CV_32F, 0, 1, ksize=3)))
|
241 |
|
242 |
# 標準化
|
243 |
+
left_edge_density = min(1.0, left_gradient / 50)
|
244 |
+
right_edge_density = min(1.0, right_gradient / 50)
|
245 |
+
top_edge_density = min(1.0, top_gradient / 50)
|
246 |
|
247 |
# 封閉環境通常在圖像邊緣有較強的梯度
|
248 |
boundary_edge_score = (left_edge_density + right_edge_density + top_edge_density) / 3
|
249 |
|
250 |
# 簡單估計整體邊緣密度
|
251 |
+
edges_density = min(1, (np.mean(np.abs(gx)) + np.mean(np.abs(gy))) / 100)
|
252 |
|
253 |
street_line_score = 0
|
254 |
|
|
|
319 |
# 1. 藍色區域(天空)特徵 - 藍色區域多通常表示室外
|
320 |
if features.get("blue_ratio", 0) > 0.2:
|
321 |
# 檢查是否有室內指標,如果有明顯的室內特徵,則減少藍色的負面影響
|
322 |
+
if (features.get("ceiling_uniformity", 0) > 0.5 or
|
323 |
+
features.get("boundary_edge_score", 0) > 0.3 or
|
324 |
features.get("indoor_light_score", 0) > 0.2 or
|
325 |
features.get("bright_spot_count", 0) > 0):
|
326 |
+
blue_score = -weights["blue_ratio"] * features["blue_ratio"] * 8
|
327 |
else:
|
328 |
blue_score = -weights["blue_ratio"] * features["blue_ratio"] * 15
|
329 |
else:
|
330 |
blue_score = -weights["blue_ratio"] * features["blue_ratio"] * 15
|
331 |
+
|
332 |
indoor_score += blue_score
|
333 |
feature_contributions["blue_ratio"] = blue_score
|
334 |
|
|
|
351 |
horizontal_line_ratio = features.get("horizontal_line_ratio", 0)
|
352 |
|
353 |
# 增強天花板檢測的影響
|
354 |
+
if ceiling_uniformity > 0.5:
|
355 |
+
ceiling_weight = 3
|
356 |
+
ceiling_contribution = weights.get("ceiling_features", 1.5) * ceiling_weight
|
357 |
if horizontal_line_ratio > 0.2: # 如果有水平線條,進一步增強
|
358 |
+
ceiling_contribution *= 1.5
|
359 |
+
elif ceiling_uniformity > 0.4:
|
360 |
+
ceiling_contribution = weights.get("ceiling_features", 1.5) * 1.2
|
361 |
+
|
362 |
indoor_score += ceiling_contribution
|
363 |
feature_contributions["ceiling_features"] = ceiling_contribution
|
364 |
|
|
|
370 |
|
371 |
# 加強對特定類型光源的檢測
|
372 |
if circular_light_count >= 1: # 即便只有一個圓形光源也很可能是室內
|
373 |
+
light_contribution = weights.get("light_features", 1.2) * 2.0
|
374 |
elif indoor_light_score > 0.3:
|
375 |
light_contribution = weights.get("light_features", 1.2) * 1.0
|
376 |
|
|
|
384 |
edges_density = features.get("edges_density", 0)
|
385 |
|
386 |
# 高邊界評分暗示封閉環境(室內)
|
387 |
+
if boundary_edge_score > 0.3:
|
388 |
+
boundary_contribution = weights.get("boundary_features", 1.2) * 2
|
389 |
+
elif boundary_edge_score > 0.2:
|
390 |
+
boundary_contribution = weights.get("boundary_features", 1.2) * 1.2
|
391 |
+
|
392 |
indoor_score += boundary_contribution
|
393 |
feature_contributions["boundary_features"] = boundary_contribution
|
394 |
|
|
|
415 |
combined_uniformity = (features["brightness_uniformity"] +
|
416 |
features.get("ceiling_uniformity", 0)) / 2
|
417 |
|
418 |
+
if combined_uniformity > 0.5:
|
419 |
gradient_contribution = weights["gradient_ratio"] * 0.7
|
420 |
else:
|
421 |
gradient_contribution = -weights["gradient_ratio"] * 0.3
|
|
|
430 |
|
431 |
# 調整亮點分析邏輯
|
432 |
if circular_light_count >= 1: # 即使只有一個圓形光源
|
433 |
+
bright_spot_contribution = weights["bright_spots"] * 1.5
|
434 |
elif bright_spot_count < 5: # 適當放寬閾值
|
435 |
bright_spot_contribution = weights["bright_spots"] * 0.5
|
436 |
elif bright_spot_count > 15: # 大量亮點比較有可能為室外
|
|
|
441 |
|
442 |
# 8. 色調分析
|
443 |
yellow_contribution = 0
|
444 |
+
if features["avg_brightness"] < 150 and features["yellow_orange_ratio"] > 0.15:
|
445 |
+
if features.get("indoor_light_score", 0) > 0.2:
|
446 |
yellow_contribution = weights["color_tone"] * 0.8
|
447 |
else:
|
448 |
yellow_contribution = weights["color_tone"] * 0.5
|
|
|
452 |
|
453 |
if features.get("blue_ratio", 0) > 0.7:
|
454 |
# 檢查是否有室內指標,如果有明顯的室內特徵,則減少藍色的負面影響
|
455 |
+
if (features.get("ceiling_uniformity", 0) > 0.6 or
|
456 |
+
features.get("boundary_edge_score", 0) > 0.3 or
|
457 |
features.get("indoor_light_score", 0) > 0):
|
458 |
+
blue_score = -weights["blue_ratio"] * features["blue_ratio"] * 10
|
459 |
else:
|
460 |
blue_score = -weights["blue_ratio"] * features["blue_ratio"] * 18
|
461 |
else:
|
|
|
534 |
# 1: 窗戶和牆壁形成的直角
|
535 |
if features.get("brightness_uniformity", 0) > 0.6 and features.get("boundary_edge_score", 0) > 0.3:
|
536 |
bedroom_indicators += 1.5 # 增加權重
|
537 |
+
|
538 |
# 2: 天花板和光源
|
539 |
if features.get("ceiling_uniformity", 0) > 0.5 and features.get("bright_spot_count", 0) > 0:
|
540 |
+
bedroom_indicators += 2.5
|
541 |
+
|
542 |
# 3: 良好對比度的牆壁顏色,適合臥房還有客廳
|
543 |
if features.get("brightness_uniformity", 0) > 0.6 and features.get("avg_saturation", 0) < 100:
|
544 |
+
bedroom_indicators += 1.5
|
545 |
+
|
546 |
# 特殊的檢測 4: 檢測窗戶
|
547 |
if features.get("boundary_edge_score", 0) > 0.25 and features.get("brightness_std", 0) > 40:
|
548 |
+
bedroom_indicators += 1.5
|
549 |
+
|
550 |
# 如果滿足足夠的家居指標,提高多點室內判斷分數
|
551 |
if bedroom_indicators >= 3:
|
552 |
# 增加��居環境評分
|
|
|
576 |
def _determine_lighting_conditions(self, features, is_indoor):
|
577 |
"""
|
578 |
基於特徵和室內/室外判斷確定光照條件。
|
579 |
+
|
580 |
Args:
|
581 |
features: 特徵字典
|
582 |
is_indoor: 是否是室內環境
|
583 |
+
|
584 |
Returns:
|
585 |
Dict: 光照條件分析結果
|
586 |
"""
|
|
|
588 |
time_of_day = "unknown"
|
589 |
confidence = 0.5
|
590 |
diagnostics = {}
|
591 |
+
|
592 |
avg_brightness = features["avg_brightness"]
|
593 |
dark_pixel_ratio = features["dark_pixel_ratio"]
|
594 |
yellow_orange_ratio = features["yellow_orange_ratio"]
|
595 |
blue_ratio = features["blue_ratio"]
|
596 |
gray_ratio = features["gray_ratio"]
|
597 |
+
|
598 |
# 基於室內/室外分別判斷
|
599 |
if is_indoor:
|
600 |
# 計算室內住宅自然光指標
|
601 |
natural_window_light = 0
|
602 |
+
|
603 |
# 檢查窗戶特徵和光線特性
|
604 |
+
if (features.get("blue_ratio", 0) > 0.1 and
|
605 |
features.get("sky_brightness", 0) > avg_brightness * 1.1):
|
606 |
natural_window_light += 1
|
607 |
+
|
608 |
# 檢查均勻柔和的光線分布
|
609 |
+
if (features.get("brightness_uniformity", 0) > 0.65 and
|
610 |
features.get("brightness_std", 0) < 70):
|
611 |
natural_window_light += 1
|
612 |
+
|
613 |
# 檢查暖色調比例
|
614 |
if features.get("warm_ratio", 0) > 0.2:
|
615 |
natural_window_light += 1
|
616 |
+
|
617 |
# 家居環境指標
|
618 |
home_env_score = features.get("home_environment_pattern", 0)
|
619 |
if home_env_score > 1.5:
|
620 |
natural_window_light += 1
|
621 |
+
|
622 |
# 1. 室內明亮環境,可能有窗戶自然光
|
623 |
if avg_brightness > 130:
|
624 |
# 檢測自然光住宅空間 - 新增類型!
|
|
|
645 |
time_of_day = "indoor_dim"
|
646 |
confidence = 0.65 + dark_pixel_ratio / 3
|
647 |
diagnostics["reason"] = "Low brightness in indoor environment"
|
648 |
+
|
649 |
# 1. 檢測設計師風格住宅,可以偵測到比較多種類的狀況
|
650 |
designer_residential_score = 0
|
651 |
# 檢測特色燈具
|
|
|
660 |
# 檢測家居環境特徵
|
661 |
if home_env_score > 1.5:
|
662 |
designer_residential_score += 1
|
663 |
+
|
664 |
if designer_residential_score >= 3 and home_env_score > 1.5:
|
665 |
+
time_of_day = "indoor_designer_residential"
|
666 |
confidence = 0.85
|
667 |
diagnostics["special_case"] = "Designer residential lighting with decorative elements"
|
668 |
+
|
669 |
# 2. 檢測餐廳/酒吧場景
|
670 |
elif avg_brightness < 150 and yellow_orange_ratio > 0.2:
|
671 |
if features["warm_ratio"] > 0.4:
|
672 |
time_of_day = "indoor_restaurant"
|
673 |
confidence = 0.65 + yellow_orange_ratio / 4
|
674 |
diagnostics["special_case"] = "Warm, yellow-orange lighting suggests restaurant/bar setting"
|
675 |
+
|
676 |
# 3. 檢測商業照明空間
|
677 |
elif avg_brightness > 120 and features["bright_spot_count"] > 4:
|
678 |
# 增加商業照明判別的精確度
|
|
|
685 |
# 整體照明結構化布局
|
686 |
if features.get("light_distribution_uniformity", 0) > 0.6:
|
687 |
commercial_score += 0.5
|
688 |
+
|
689 |
if commercial_score > 0.6 and designer_residential_score < 3:
|
690 |
time_of_day = "indoor_commercial"
|
691 |
confidence = 0.7 + commercial_score / 5
|
|
|
794 |
"""
|
795 |
return {
|
796 |
"indoor_outdoor_weights": {
|
797 |
+
"blue_ratio": 0.6,
|
798 |
+
"brightness_uniformity": 1.2,
|
799 |
+
"gradient_ratio": 0.7,
|
800 |
+
"bright_spots": 0.8,
|
801 |
+
"color_tone": 0.5,
|
802 |
+
"sky_brightness": 0.9,
|
803 |
+
"brightness_variation": 0.7,
|
804 |
+
"ceiling_features": 1.5,
|
805 |
+
"light_features": 1.1,
|
806 |
+
"boundary_features": 2.8,
|
807 |
+
"street_features": 2,
|
808 |
+
"building_features": 1.6
|
809 |
},
|
810 |
"include_diagnostics": True
|
811 |
}
|
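Conceptually, the indoor/outdoor decision accumulates signed, weighted feature contributions like the ones in the hunks above; a stripped-down sketch with invented feature values, using only three of the default weights:

weights = {"blue_ratio": 0.6, "ceiling_features": 1.5, "boundary_features": 2.8}
features = {"blue_ratio": 0.25, "ceiling_uniformity": 0.55, "boundary_edge_score": 0.35}

contributions = {}
# Sky-like blue pushes toward outdoor (negative); indoor cues are present here,
# so the milder x8 scaling from the blue_ratio branch above is used.
contributions["blue_ratio"] = -weights["blue_ratio"] * features["blue_ratio"] * 8
# A uniform upper region and strong border gradients push toward indoor.
contributions["ceiling_features"] = weights["ceiling_features"] * (3 if features["ceiling_uniformity"] > 0.5 else 1.2)
contributions["boundary_features"] = weights["boundary_features"] * (2 if features["boundary_edge_score"] > 0.3 else 1.2)

indoor_score = sum(contributions.values())
print(contributions, round(indoor_score, 2))  # a positive total leans indoor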
requirements.txt
CHANGED
@@ -7,3 +7,5 @@ numpy>=1.23.5
|
|
7 |
matplotlib>=3.7.0
|
8 |
gradio>=3.32.0
|
9 |
git+https://github.com/openai/CLIP.git
|
|
|
|
|
|
7 |
matplotlib>=3.7.0
|
8 |
gradio>=3.32.0
|
9 |
git+https://github.com/openai/CLIP.git
|
10 |
+
yt-dlp>=2023.3.4
|
11 |
+
requests>=2.28.1
|
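The two new dependencies back the "Video Process" feature: yt-dlp fetches videos by URL and requests covers plain HTTP downloads. A minimal sketch of a URL fetch with yt-dlp's Python API; the option values and helper name are illustrative, and the app's actual download code may differ:

import yt_dlp

def download_video(url: str, out_path: str = "input_video.mp4") -> str:
    # Restrict to an mp4 stream and write to a fixed output template.
    ydl_opts = {
        "format": "mp4",
        "outtmpl": out_path,
        "quiet": True,
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.download([url])
    return out_path

The returned path could then be handed to VideoProcessor.process_video_file.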
scene_analyzer.py
CHANGED
@@ -17,7 +17,6 @@ class SceneAnalyzer:
|
|
17 |
def __init__(self, class_names: Dict[int, str] = None):
|
18 |
"""
|
19 |
Initialize the scene analyzer with optional class name mappings.
|
20 |
-
|
21 |
Args:
|
22 |
class_names: Dictionary mapping class IDs to class names (optional)
|
23 |
"""
|
@@ -49,14 +48,12 @@ class SceneAnalyzer:
|
|
49 |
functional_zones=None):
|
50 |
"""
|
51 |
生成場景描述。
|
52 |
-
|
53 |
Args:
|
54 |
scene_type: 識別的場景類型
|
55 |
detected_objects: 檢測到的物體列表
|
56 |
confidence: 場景分類置信度
|
57 |
lighting_info: 照明條件信息(可選)
|
58 |
functional_zones: 功能區域信息(可選)
|
59 |
-
|
60 |
Returns:
|
61 |
str: 生成的場景描述
|
62 |
"""
|
@@ -101,13 +98,11 @@ class SceneAnalyzer:
|
|
101 |
def analyze(self, detection_result: Any, lighting_info: Optional[Dict] = None, class_confidence_threshold: float = 0.35, scene_confidence_threshold: float = 0.6) -> Dict:
|
102 |
"""
|
103 |
Analyze detection results to determine scene type and provide understanding.
|
104 |
-
|
105 |
Args:
|
106 |
detection_result: Detection result from YOLOv8
|
107 |
lighting_info: Optional lighting condition analysis results
|
108 |
class_confidence_threshold: Minimum confidence to consider an object
|
109 |
scene_confidence_threshold: Minimum confidence to determine a scene
|
110 |
-
|
111 |
Returns:
|
112 |
Dictionary with scene analysis results
|
113 |
"""
|
@@ -141,7 +136,7 @@ class SceneAnalyzer:
|
|
141 |
if not detected_objects:
|
142 |
return {
|
143 |
"scene_type": "unknown",
|
144 |
-
"confidence": 0
|
145 |
"description": "No objects with sufficient confidence detected.",
|
146 |
"objects_present": [],
|
147 |
"object_count": 0,
|
@@ -265,10 +260,8 @@ class SceneAnalyzer:
|
|
265 |
def _compute_scene_scores(self, detected_objects: List[Dict]) -> Dict[str, float]:
|
266 |
"""
|
267 |
Compute confidence scores for each scene type based on detected objects.
|
268 |
-
|
269 |
Args:
|
270 |
detected_objects: List of detected objects
|
271 |
-
|
272 |
Returns:
|
273 |
Dictionary mapping scene types to confidence scores
|
274 |
"""
|
@@ -308,7 +301,7 @@ class SceneAnalyzer:
|
|
308 |
optional_score = optional_ratio * 0.3 # 30% of score from optional objects
|
309 |
|
310 |
# Bonus for having multiple instances of key objects
|
311 |
-
multiple_bonus = 0
|
312 |
for class_id in required_present:
|
313 |
if class_counts.get(class_id, 0) > 1:
|
314 |
multiple_bonus += 0.05 # 5% bonus per additional key object type
|
@@ -330,10 +323,8 @@ class SceneAnalyzer:
|
|
330 |
def _determine_scene_type(self, scene_scores: Dict[str, float]) -> Tuple[str, float]:
|
331 |
"""
|
332 |
Determine the most likely scene type based on scores.
|
333 |
-
|
334 |
Args:
|
335 |
scene_scores: Dictionary mapping scene types to confidence scores
|
336 |
-
|
337 |
Returns:
|
338 |
Tuple of (best_scene_type, confidence)
|
339 |
"""
|
@@ -350,11 +341,9 @@ class SceneAnalyzer:
|
|
350 |
def _fuse_scene_scores(self, yolo_scene_scores: Dict[str, float], clip_scene_scores: Dict[str, float]) -> Dict[str, float]:
|
351 |
"""
|
352 |
融合基於 YOLO 物體檢測和 CLIP 分析的場景分數。
|
353 |
-
|
354 |
Args:
|
355 |
yolo_scene_scores: 基於 YOLO 物體檢測的場景分數
|
356 |
clip_scene_scores: 基於 CLIP 分析的場景分數
|
357 |
-
|
358 |
Returns:
|
359 |
Dict: 融合後的場景分數
|
360 |
"""
|
|
|
17 |
def __init__(self, class_names: Dict[int, str] = None):
|
18 |
"""
|
19 |
Initialize the scene analyzer with optional class name mappings.
|
|
|
20 |
Args:
|
21 |
class_names: Dictionary mapping class IDs to class names (optional)
|
22 |
"""
|
|
|
48 |
functional_zones=None):
|
49 |
"""
|
50 |
生成場景描述。
|
|
|
51 |
Args:
|
52 |
scene_type: 識別的場景類型
|
53 |
detected_objects: 檢測到的物體列表
|
54 |
confidence: 場景分類置信度
|
55 |
lighting_info: 照明條件信息(可選)
|
56 |
functional_zones: 功能區域信息(可選)
|
|
|
57 |
Returns:
|
58 |
str: 生成的場景描述
|
59 |
"""
|
|
|
98 |
def analyze(self, detection_result: Any, lighting_info: Optional[Dict] = None, class_confidence_threshold: float = 0.35, scene_confidence_threshold: float = 0.6) -> Dict:
|
99 |
"""
|
100 |
Analyze detection results to determine scene type and provide understanding.
|
|
|
101 |
Args:
|
102 |
detection_result: Detection result from YOLOv8
|
103 |
lighting_info: Optional lighting condition analysis results
|
104 |
class_confidence_threshold: Minimum confidence to consider an object
|
105 |
scene_confidence_threshold: Minimum confidence to determine a scene
|
|
|
106 |
Returns:
|
107 |
Dictionary with scene analysis results
|
108 |
"""
|
|
|
136 |
if not detected_objects:
|
137 |
return {
|
138 |
"scene_type": "unknown",
|
139 |
+
"confidence": 0,
|
140 |
"description": "No objects with sufficient confidence detected.",
|
141 |
"objects_present": [],
|
142 |
"object_count": 0,
|
|
|
260 |
def _compute_scene_scores(self, detected_objects: List[Dict]) -> Dict[str, float]:
|
261 |
"""
|
262 |
Compute confidence scores for each scene type based on detected objects.
|
|
|
263 |
Args:
|
264 |
detected_objects: List of detected objects
|
|
|
265 |
Returns:
|
266 |
Dictionary mapping scene types to confidence scores
|
267 |
"""
|
|
|
301 |
optional_score = optional_ratio * 0.3 # 30% of score from optional objects
|
302 |
|
303 |
# Bonus for having multiple instances of key objects
|
304 |
+
multiple_bonus = 0
|
305 |
for class_id in required_present:
|
306 |
if class_counts.get(class_id, 0) > 1:
|
307 |
multiple_bonus += 0.05 # 5% bonus per additional key object type
|
|
|
323 |
def _determine_scene_type(self, scene_scores: Dict[str, float]) -> Tuple[str, float]:
|
324 |
"""
|
325 |
Determine the most likely scene type based on scores.
|
|
|
326 |
Args:
|
327 |
scene_scores: Dictionary mapping scene types to confidence scores
|
|
|
328 |
Returns:
|
329 |
Tuple of (best_scene_type, confidence)
|
330 |
"""
|
|
|
341 |
def _fuse_scene_scores(self, yolo_scene_scores: Dict[str, float], clip_scene_scores: Dict[str, float]) -> Dict[str, float]:
|
342 |
"""
|
343 |
融合基於 YOLO 物體檢測和 CLIP 分析的場景分數。
|
|
|
344 |
Args:
|
345 |
yolo_scene_scores: 基於 YOLO 物體檢測的場景分數
|
346 |
clip_scene_scores: 基於 CLIP 分析的場景分數
|
|
|
347 |
Returns:
|
348 |
Dict: 融合後的場景分數
|
349 |
"""
|
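A compact sketch of the scene-scoring idea in this file: matched required and optional objects contribute fixed shares of the score, plus a 5% bonus per repeated key object as in the hunk above. The 70/30 split and the example scene definition are assumptions for illustration; only the 30% optional share and the bonus appear in the visible diff.

from collections import Counter

def score_scene(detected_class_ids, required, optional):
    # Share of this scene type's required / optional classes that were detected.
    present = set(detected_class_ids)
    required_ratio = len(present & required) / len(required) if required else 0
    optional_ratio = len(present & optional) / len(optional) if optional else 0
    score = required_ratio * 0.7 + optional_ratio * 0.3   # 70/30 split assumed
    # 5% bonus per required class detected more than once.
    counts = Counter(detected_class_ids)
    bonus = sum(0.05 for cid in (present & required) if counts[cid] > 1)
    return min(1.0, score + bonus)

# Hypothetical "living room" definition (COCO ids: 57 couch, 62 tv, 58 potted plant, 56 chair).
print(score_scene([57, 62, 56, 56], required={57, 62}, optional={58, 56}))  # -> 0.85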
style.py
CHANGED
@@ -268,6 +268,40 @@ class Style:
|
|
268 |
padding: 0 !important;
|
269 |
}
|
270 |
|
271 |
/* 結果容器樣式 */
|
272 |
.result-container {
|
273 |
width: 100% !important;
|
@@ -356,6 +390,111 @@ class Style:
|
|
356 |
box-sizing: border-box !important;
|
357 |
}
|
358 |
|
359 |
/* 響應式調整 */
|
360 |
@media (max-width: 768px) {
|
361 |
.app-title {
|
@@ -375,5 +514,6 @@ class Style:
|
|
375 |
min-height: 150px !important;
|
376 |
}
|
377 |
}
|
|
|
378 |
"""
|
379 |
return css
|
|
|
268 |
padding: 0 !important;
|
269 |
}
|
270 |
|
271 |
+
/* 場景分析描述區域樣式 */
|
272 |
+
.scene-description-box {
|
273 |
+
background-color: #f8f9fa !important;
|
274 |
+
border: 1px solid #e2e8f0 !important;
|
275 |
+
border-radius: 8px !important;
|
276 |
+
padding: 15px !important;
|
277 |
+
margin: 10px 0 20px 0 !important;
|
278 |
+
box-shadow: 0 1px 3px rgba(0,0,0,0.05) !important;
|
279 |
+
font-family: Arial, sans-serif !important;
|
280 |
+
line-height: 1.7 !important;
|
281 |
+
color: #2D3748 !important;
|
282 |
+
font-size: 16px !important;
|
283 |
+
width: 100% !important;
|
284 |
+
box-sizing: border-box !important;
|
285 |
+
}
|
286 |
+
|
287 |
+
#scene_analysis_description_text {
|
288 |
+
background-color: #f0f0f0 !important; /* 淺灰色背景 */
|
289 |
+
padding: 15px !important; /* 內邊距,讓文字和邊框有點空間 */
|
290 |
+
border-radius: 8px !important; /* 圓角 */
|
291 |
+
margin: 10px 0 20px 0 !important; /* 其他元素的間距,特別是上下的part */
|
292 |
+
display: block !important;
|
293 |
+
width: 100% !important;
|
294 |
+
box-sizing: border-box !important;
|
295 |
+
}
|
296 |
+
|
297 |
+
#scene_analysis_description_text p {
|
298 |
+
margin: 0 !important;
|
299 |
+
color: #2D3748 !important; /* 確保文字顏色 */
|
300 |
+
font-family: Arial, sans-serif !important;
|
301 |
+
font-size: 16px !important; /* 你可以調整文字大小 */
|
302 |
+
line-height: 1.7 !important;
|
303 |
+
}
|
304 |
+
|
305 |
/* 結果容器樣式 */
|
306 |
.result-container {
|
307 |
width: 100% !important;
|
|
|
390 |
box-sizing: border-box !important;
|
391 |
}
|
392 |
|
393 |
+
/* Video summary HTML 容器與內容樣式 */
|
394 |
+
#video-summary-html-output {
|
395 |
+
width: 100% !important;
|
396 |
+
box-sizing: border-box !important;
|
397 |
+
padding: 0 !important;
|
398 |
+
margin: 0 !important;
|
399 |
+
}
|
400 |
+
|
401 |
+
.video-summary-content-wrapper {
|
402 |
+
width: 100% !important;
|
403 |
+
padding: 16px !important;
|
404 |
+
line-height: 1.8 !important;
|
405 |
+
white-space: pre-wrap !important;
|
406 |
+
word-wrap: break-word !important;
|
407 |
+
border-radius: 8px !important;
|
408 |
+
min-height: 250px !important;
|
409 |
+
max-height: 600px !important;
|
410 |
+
overflow-y: auto !important;
|
411 |
+
border: 1px solid #e2e8f0 !important;
|
412 |
+
background-color: white !important;
|
413 |
+
display: block !important;
|
414 |
+
font-family: 'Arial', sans-serif !important;
|
415 |
+
font-size: 14px !important;
|
416 |
+
margin: 0 !important;
|
417 |
+
}
|
418 |
+
|
419 |
+
.video-summary-content-wrapper pre {
|
420 |
+
white-space: pre-wrap !important;
|
421 |
+
word-wrap: break-word !important;
|
422 |
+
margin: 0 !important;
|
423 |
+
padding: 0 !important;
|
424 |
+
font-family: 'Arial', sans-serif !important;
|
425 |
+
font-size: 14px !important;
|
426 |
+
line-height: 1.8 !important;
|
427 |
+
color: #2D3748 !important;
|
428 |
+
}
|
429 |
+
|
430 |
+
/* 視頻結果面板相關樣式 */
|
431 |
+
.video-result-panel {
|
432 |
+
padding: 1rem !important;
|
433 |
+
background: white !important;
|
434 |
+
border-radius: 10px !important;
|
435 |
+
box-shadow: 0 2px 8px rgba(0, 0, 0, 0.08) !important;
|
436 |
+
}
|
437 |
+
|
438 |
+
.video-output-container {
|
439 |
+
width: 100% !important;
|
440 |
+
margin-bottom: 1.5rem !important;
|
441 |
+
border-radius: 8px !important;
|
442 |
+
overflow: hidden !important;
|
443 |
+
box-shadow: 0 1px 3px rgba(0, 0, 0, 0.1) !important;
|
444 |
+
}
|
445 |
+
|
446 |
+
/* 視頻統計資料顯示增強 */
|
447 |
+
.video-stats-display {
|
448 |
+
background: white !important;
|
449 |
+
border-radius: 8px !important;
|
450 |
+
padding: 1rem !important;
|
451 |
+
box-shadow: inset 0 0 6px rgba(0, 0, 0, 0.1) !important;
|
452 |
+
width: 100% !important;
|
453 |
+
min-height: 200px !important;
|
454 |
+
max-height: 400px !important;
|
455 |
+
overflow-y: auto !important;
|
456 |
+
font-family: monospace !important;
|
457 |
+
box-sizing: border-box !important;
|
458 |
+
color: #2D3748 !important;
|
459 |
+
}
|
460 |
+
|
461 |
+
.custom-video-url-input {
|
462 |
+
width: 100% !important;
|
463 |
+
}
|
464 |
+
|
465 |
+
.custom-video-url-input textarea {
|
466 |
+
width: 100% !important;
|
467 |
+
min-height: 120px !important;
|
468 |
+
padding: 15px !important;
|
469 |
+
font-size: 16px !important;
|
470 |
+
line-height: 1.6 !important;
|
471 |
+
background-color: #F7FAFC !important;
|
472 |
+
box-shadow: 0 1px 3px rgba(0, 0, 0, 0.1) !important;
|
473 |
+
border: 2px solid #CBD5E0 !important;
|
474 |
+
border-radius: 8px !important;
|
475 |
+
}
|
476 |
+
|
477 |
+
.custom-video-url-input textarea:focus {
|
478 |
+
border-color: #4299E1 !important;
|
479 |
+
box-shadow: 0 0 0 3px rgba(66, 153, 225, 0.2) !important;
|
480 |
+
}
|
481 |
+
|
482 |
+
/* 輸入框容器100%寬度 */
|
483 |
+
.custom-video-url-input > div {
|
484 |
+
width: 100% !important;
|
485 |
+
max-width: 100% !important;
|
486 |
+
}
|
487 |
+
|
488 |
+
/* 動畫效果, 增加互動感 */
|
489 |
+
@keyframes fadeIn {
|
490 |
+
from { opacity: 0; }
|
491 |
+
to { opacity: 1; }
|
492 |
+
}
|
493 |
+
|
494 |
+
.video-result-panel > * {
|
495 |
+
animation: fadeIn 0.5s ease-in-out;
|
496 |
+
}
|
497 |
+
|
498 |
/* 響應式調整 */
|
499 |
@media (max-width: 768px) {
|
500 |
.app-title {
|
|
|
514 |
min-height: 150px !important;
|
515 |
}
|
516 |
}
|
517 |
+
|
518 |
"""
|
519 |
return css
|
video_processor.py
ADDED
@@ -0,0 +1,346 @@
|
|
1 |
+
import cv2
|
2 |
+
import os
|
3 |
+
import tempfile
|
4 |
+
import uuid
|
5 |
+
from PIL import Image
|
6 |
+
import numpy as np
|
7 |
+
from typing import Dict, List, Tuple, Any, Optional
|
8 |
+
import time
|
9 |
+
from collections import defaultdict
|
10 |
+
|
11 |
+
from image_processor import ImageProcessor
|
12 |
+
from evaluation_metrics import EvaluationMetrics
|
13 |
+
from scene_analyzer import SceneAnalyzer
|
14 |
+
from detection_model import DetectionModel
|
15 |
+
|
16 |
+
class VideoProcessor:
|
17 |
+
"""
|
18 |
+
Handles the processing of video files, including object detection
|
19 |
+
and scene analysis on selected frames.
|
20 |
+
"""
|
21 |
+
def __init__(self, image_processor: ImageProcessor):
|
22 |
+
"""
|
23 |
+
Initializes the VideoProcessor.
|
24 |
+
|
25 |
+
Args:
|
26 |
+
image_processor (ImageProcessor): An initialized ImageProcessor instance.
|
27 |
+
"""
|
28 |
+
self.image_processor = image_processor
|
29 |
+
|
30 |
+
def process_video_file(self,
|
31 |
+
video_path: str,
|
32 |
+
model_name: str,
|
33 |
+
confidence_threshold: float,
|
34 |
+
process_interval: int = 5,
|
35 |
+
scene_desc_interval_sec: int = 3) -> Tuple[Optional[str], str, Dict]:
|
36 |
+
"""
|
37 |
+
Processes an uploaded video file, performs detection and periodic scene analysis,
|
38 |
+
and returns the path to the annotated output video file along with a summary.
|
39 |
+
|
40 |
+
Args:
|
41 |
+
video_path (str): Path to the input video file.
|
42 |
+
model_name (str): Name of the YOLO model to use.
|
43 |
+
confidence_threshold (float): Confidence threshold for object detection.
|
44 |
+
process_interval (int): Process every Nth frame. Defaults to 5.
|
45 |
+
scene_desc_interval_sec (int): Update scene description every N seconds. Defaults to 3.
|
46 |
+
|
47 |
+
Returns:
|
48 |
+
Tuple[Optional[str], str, Dict]: (Path to output video or None, Summary text, Statistics dictionary)
|
49 |
+
"""
|
50 |
+
if not video_path or not os.path.exists(video_path):
|
51 |
+
print(f"Error: Video file not found at {video_path}")
|
52 |
+
return None, "Error: Video file not found.", {}
|
53 |
+
|
54 |
+
print(f"Starting video processing for: {video_path}")
|
55 |
+
start_time = time.time()
|
56 |
+
|
57 |
+
cap = cv2.VideoCapture(video_path)
|
58 |
+
if not cap.isOpened():
|
59 |
+
print(f"Error: Could not open video file {video_path}")
|
60 |
+
return None, "Error opening video file.", {}
|
61 |
+
|
62 |
+
# Get video properties
|
63 |
+
fps = cap.get(cv2.CAP_PROP_FPS)
|
64 |
+
if fps <= 0: # Handle case where fps is not available or invalid
|
65 |
+
fps = 30 # Assume a default fps
|
66 |
+
print(f"Warning: Could not get valid FPS for video. Assuming {fps} FPS.")
|
67 |
+
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
|
68 |
+
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
|
69 |
+
total_frames_video = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
|
70 |
+
print(f"Video properties: {width}x{height} @ {fps:.2f} FPS, Total Frames: {total_frames_video}")
|
71 |
+
|
72 |
+
# Calculate description update interval in frames
|
73 |
+
description_update_interval_frames = int(fps * scene_desc_interval_sec)
|
74 |
+
if description_update_interval_frames < 1:
|
75 |
+
description_update_interval_frames = int(fps) # Update at least once per second if interval is too short
|
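For concreteness, with illustrative numbers rather than values from the file: at 30 FPS the defaults mean detection runs on every 5th frame and the scene description refreshes every 90 frames.

fps, process_interval, scene_desc_interval_sec = 30, 5, 3
description_update_interval_frames = int(fps * scene_desc_interval_sec)  # 90 frames
detections_per_second = fps / process_interval                           # 6.0
print(description_update_interval_frames, detections_per_second)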
76 |
+
|
77 |
+
object_trackers = {} # 儲存ID與物體的映射
|
78 |
+
last_detected_objects = {} # 儲存上一次檢測到的物體資訊
|
79 |
+
next_object_id = 0 # 下一個可用的物體ID
|
80 |
+
tracking_threshold = 0.6 # 相同物體的IoU
|
81 |
+
object_colors = {} # 每個被追蹤的物體分配固定顏色
|
82 |
+
|
83 |
+
# Setup Output Video
|
84 |
+
output_filename = f"processed_{uuid.uuid4().hex}_{os.path.basename(video_path)}"
|
85 |
+
temp_dir = tempfile.gettempdir() # Use system's temp directory
|
86 |
+
output_path = os.path.join(temp_dir, output_filename)
|
87 |
+
# Ensure the output path has a compatible extension (like .mp4)
|
88 |
+
if not output_path.lower().endswith(('.mp4', '.avi', '.mov')):
|
89 |
+
output_path += ".mp4"
|
90 |
+
|
91 |
+
# Use 'mp4v' for MP4, common and well-supported
|
92 |
+
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
|
93 |
+
out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
|
94 |
+
if not out.isOpened():
|
95 |
+
print(f"Error: Could not open VideoWriter for path: {output_path}")
|
96 |
+
cap.release()
|
97 |
+
return None, f"Error creating output video file at {output_path}.", {}
|
98 |
+
print(f"Output video will be saved to: {output_path}")
|
99 |
+
|
100 |
+
frame_count = 0
|
101 |
+
processed_frame_count = 0
|
102 |
+
all_stats = [] # Store stats for each processed frame
|
103 |
+
summary_lines = []
|
104 |
+
last_description = "Analyzing scene..." # Initial description
|
105 |
+
frame_since_last_desc = description_update_interval_frames # Trigger analysis on first processed frame
|
106 |
+
|
107 |
+
try:
|
108 |
+
while True:
|
109 |
+
ret, frame = cap.read()
|
110 |
+
if not ret:
|
111 |
+
break # End of video
|
112 |
+
|
113 |
+
frame_count += 1
|
114 |
+
frame_since_last_desc += 1
|
115 |
+
current_frame_annotated = False # Flag if this frame was processed and annotated
|
116 |
+
|
117 |
+
# Process frame based on interval
|
118 |
+
if frame_count % process_interval == 0:
|
119 |
+
processed_frame_count += 1
|
120 |
+
print(f"Processing frame {frame_count}...")
|
121 |
+
current_frame_annotated = True
|
122 |
+
|
123 |
+
# Use ImageProcessor for single-frame tasks
|
124 |
+
# 1. Convert frame format BGR -> RGB -> PIL
|
125 |
+
try:
|
126 |
+
frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
|
127 |
+
pil_image = Image.fromarray(frame_rgb)
|
128 |
+
except Exception as e:
|
129 |
+
print(f"Error converting frame {frame_count}: {e}")
|
130 |
+
continue # Skip this frame
|
131 |
+
|
132 |
+
# 2. Get appropriate model instance
|
133 |
+
# Confidence is passed from UI, model_name too
|
134 |
+
model_instance = self.image_processor.get_model_instance(model_name, confidence_threshold)
|
135 |
+
if not model_instance or not model_instance.is_model_loaded:
|
136 |
+
print(f"Error: Model {model_name} not loaded. Skipping frame {frame_count}.")
|
137 |
+
# Draw basic frame without annotation
|
138 |
+
cv2.putText(frame, f"Scene: {last_description[:80]}...", (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 0, 0), 3, cv2.LINE_AA)
|
139 |
+
cv2.putText(frame, f"Scene: {last_description[:80]}...", (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 255, 255), 2, cv2.LINE_AA)
|
140 |
+
out.write(frame)
|
141 |
+
continue
|
142 |
+
|
143 |
+
|
144 |
+
# 3. Perform detection
|
145 |
+
detection_result = model_instance.detect(pil_image) # Use PIL image
|
146 |
+
|
147 |
+
current_description_for_frame = last_description # Default to last known description
|
148 |
+
scene_analysis_result = None
|
149 |
+
stats = {}
|
150 |
+
|
151 |
+
                    if detection_result and hasattr(detection_result, 'boxes') and len(detection_result.boxes) > 0:
                        # Ensure SceneAnalyzer is ready within ImageProcessor
                        if not hasattr(self.image_processor, 'scene_analyzer') or self.image_processor.scene_analyzer is None:
                            print("Initializing SceneAnalyzer...")
                            # Pass class names from the current detection result
                            self.image_processor.scene_analyzer = SceneAnalyzer(class_names=detection_result.names)
                        elif self.image_processor.scene_analyzer.class_names is None:
                            # Update class names if they were missing
                            self.image_processor.scene_analyzer.class_names = detection_result.names
                            if hasattr(self.image_processor.scene_analyzer, 'spatial_analyzer'):
                                self.image_processor.scene_analyzer.spatial_analyzer.class_names = detection_result.names

                        # 4. Perform Scene Analysis (periodically)
                        if frame_since_last_desc >= description_update_interval_frames:
                            print(f"Analyzing scene at frame {frame_count} (threshold: {description_update_interval_frames} frames)...")
                            # Pass lighting_info=None for now, as it's disabled for performance
                            scene_analysis_result = self.image_processor.analyze_scene(detection_result, lighting_info=None)
                            current_description_for_frame = scene_analysis_result.get("description", last_description)
                            last_description = current_description_for_frame  # Cache the new description
                            frame_since_last_desc = 0  # Reset counter

                        # 5. Calculate Statistics for this frame
                        stats = EvaluationMetrics.calculate_basic_stats(detection_result)
                        stats['frame_number'] = frame_count  # Add frame number to stats
                        all_stats.append(stats)

                        # 6. Draw annotations
                        names = detection_result.names
                        boxes = detection_result.boxes.xyxy.cpu().numpy()
                        classes = detection_result.boxes.cls.cpu().numpy().astype(int)
                        confs = detection_result.boxes.conf.cpu().numpy()
                        def calculate_iou(box1, box2):
                            """Calculate the Intersection over Union (IoU) of two boxes."""
                            x1_1, y1_1, x2_1, y2_1 = box1
                            x1_2, y1_2, x2_2, y2_2 = box2

                            xi1 = max(x1_1, x1_2)
                            yi1 = max(y1_1, y1_2)
                            xi2 = min(x2_1, x2_2)
                            yi2 = min(y2_1, y2_2)

                            inter_area = max(0, xi2 - xi1) * max(0, yi2 - yi1)
                            box1_area = (x2_1 - x1_1) * (y2_1 - y1_1)
                            box2_area = (x2_2 - x1_2) * (y2_2 - y1_2)

                            union_area = box1_area + box2_area - inter_area

                            return inter_area / union_area if union_area > 0 else 0
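                        # Worked example (added for clarity, not in the original diff): for unit squares
                        # (0, 0, 1, 1) and (0.5, 0, 1.5, 1) the intersection is 0.5 x 1 = 0.5 and the
                        # union is 1 + 1 - 0.5 = 1.5, so calculate_iou(...) returns ~0.33, which falls
                        # below the 0.6 tracking_threshold used above and would start a new track.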
                        # Match every detection in the current frame against tracked objects
                        current_detected_objects = {}

                        for box, cls_id, conf in zip(boxes, classes, confs):
                            x1, y1, x2, y2 = map(int, box)

                            # Find the best-matching tracked object
                            best_match_id = None
                            best_match_iou = 0

                            for obj_id, (old_box, old_cls_id, _) in last_detected_objects.items():
                                if old_cls_id == cls_id:  # Only compare objects of the same class
                                    iou = calculate_iou(box, old_box)
                                    if iou > tracking_threshold and iou > best_match_iou:
                                        best_match_id = obj_id
                                        best_match_iou = iou

                            # Reuse the existing ID on a match; otherwise assign a new one
                            if best_match_id is not None:
                                obj_id = best_match_id
                            else:
                                obj_id = next_object_id
                                next_object_id += 1
                                # Assign a fixed, high-contrast color to the new object (avoid white)
                                bright_colors = [  # BGR tuples
                                    (0, 0, 255),    # red
                                    (0, 255, 0),    # green
                                    (255, 0, 0),    # blue
                                    (0, 255, 255),  # yellow
                                    (255, 0, 255),  # magenta
                                    (255, 128, 0),  # azure
                                    (128, 0, 255)   # rose
                                ]
                                object_colors[obj_id] = bright_colors[obj_id % len(bright_colors)]

                            # Update tracking info
                            current_detected_objects[obj_id] = (box, cls_id, conf)

                            color = object_colors.get(obj_id, (0, 255, 0))  # default is green
                            label = f"{names.get(cls_id, 'Unknown')}-{obj_id}: {conf:.2f}"
                            # Smooth the bounding box: for a known object, blend with its position in the previous frame
                            if obj_id in last_detected_objects:
                                old_box, _, _ = last_detected_objects[obj_id]
                                old_x1, old_y1, old_x2, old_y2 = map(int, old_box)
                                # Smoothing coefficients
                                alpha = 0.7  # weight of the current position
                                beta = 0.3   # weight of the previous position

                                x1 = int(alpha * x1 + beta * old_x1)
                                y1 = int(alpha * y1 + beta * old_y1)
                                x2 = int(alpha * x2 + beta * old_x2)
                                y2 = int(alpha * y2 + beta * old_y2)
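                            # Worked example (added for clarity, not in the original diff): with the
                            # previous x1 = 100 and the new detection at x1 = 110, the smoothed value
                            # is int(0.7 * 110 + 0.3 * 100) = 107 -- a simple exponential moving
                            # average that damps box jitter between processed frames.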
                            # Draw box and label
                            cv2.rectangle(frame, (x1, y1), (x2, y2), color, 2)
                            # Add a filled background so the label text stays readable
                            (w, h), _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 2)
                            cv2.rectangle(frame, (x1, y1 - h - 10), (x1 + w, y1 - 10), color, -1)
                            cv2.putText(frame, label, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1, cv2.LINE_AA)

                        # Update tracking info for the next processed frame
                        last_detected_objects = current_detected_objects.copy()
                    # Draw the current scene description on the frame
                    cv2.putText(frame, f"Scene: {current_description_for_frame[:80]}...", (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 0, 0), 3, cv2.LINE_AA)  # Black outline
                    cv2.putText(frame, f"Scene: {current_description_for_frame[:80]}...", (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 255, 255), 2, cv2.LINE_AA)  # White text

                # Write the frame (annotated or original) to the output video,
                # drawing the last known description if this frame wasn't processed
                if not current_frame_annotated:
                    cv2.putText(frame, f"Scene: {last_description[:80]}...", (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 0, 0), 3, cv2.LINE_AA)
                    cv2.putText(frame, f"Scene: {last_description[:80]}...", (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 255, 255), 2, cv2.LINE_AA)

                out.write(frame)  # Write frame to output file
        except Exception as e:
            print(f"Error during video processing loop for {video_path}: {e}")
            import traceback
            traceback.print_exc()
            summary_lines.append(f"An error occurred during processing: {e}")
        finally:
            # Release resources
            cap.release()
            out.release()
            print(f"Video processing finished. Resources released. Output path: {output_path}")
            if not os.path.exists(output_path) or os.path.getsize(output_path) == 0:
                print(f"Error: Output video file was not created or is empty at {output_path}")
                summary_lines.append("Error: Failed to create output video.")
                output_path = None
        end_time = time.time()
        processing_time = end_time - start_time
        summary_lines.insert(0, f"Finished processing in {processing_time:.2f} seconds.")
        summary_lines.insert(1, f"Processed {processed_frame_count} frames out of {frame_count} (interval: {process_interval} frames).")
        summary_lines.insert(2, f"Scene description updated approximately every {scene_desc_interval_sec} seconds.")
        # Generate Aggregate Statistics
        aggregated_stats = {
            "total_frames_read": frame_count,
            "total_frames_processed": processed_frame_count,
            "avg_objects_per_processed_frame": 0,  # Calculated below
            "cumulative_detections": {},  # Total times each class was detected
            "max_concurrent_detections": {}  # Max count of each class in a single processed frame
        }
        object_cumulative_counts = {}
        object_max_concurrent_counts = {}  # Store the max count found for each object type
        total_detected_in_processed = 0
        # Iterate through stats collected from each processed frame
        for frame_stats in all_stats:
            total_objects_in_frame = frame_stats.get("total_objects", 0)
            total_detected_in_processed += total_objects_in_frame

            # Iterate through object classes detected in this frame
            for obj_name, obj_data in frame_stats.get("class_statistics", {}).items():
                count_in_frame = obj_data.get("count", 0)

                # Cumulative count
                if obj_name not in object_cumulative_counts:
                    object_cumulative_counts[obj_name] = 0
                object_cumulative_counts[obj_name] += count_in_frame

                # Max concurrent count
                if obj_name not in object_max_concurrent_counts:
                    object_max_concurrent_counts[obj_name] = 0
                # Update the max count if the current frame's count is higher
                object_max_concurrent_counts[obj_name] = max(object_max_concurrent_counts[obj_name], count_in_frame)

        # Add sorted results to the final dictionary
        aggregated_stats["cumulative_detections"] = dict(sorted(object_cumulative_counts.items(), key=lambda item: item[1], reverse=True))
        aggregated_stats["max_concurrent_detections"] = dict(sorted(object_max_concurrent_counts.items(), key=lambda item: item[1], reverse=True))

        # Calculate average objects per processed frame
        if processed_frame_count > 0:
            aggregated_stats["avg_objects_per_processed_frame"] = round(total_detected_in_processed / processed_frame_count, 2)
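        # Illustrative shape of the aggregated stats (values below are made up, shown only to
        # document the structure): a clip where 10 of 100 frames were processed might yield
        # {"total_frames_read": 100, "total_frames_processed": 10,
        #  "avg_objects_per_processed_frame": 2.4,
        #  "cumulative_detections": {"person": 18, "car": 6},
        #  "max_concurrent_detections": {"person": 3, "car": 1}}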
        summary_text = "\n".join(summary_lines)
        print("Generated Summary:\n", summary_text)
        print("Aggregated Stats (Revised):\n", aggregated_stats)

        # Return the potentially updated output_path
        return output_path, summary_text, aggregated_stats
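For context, a minimal sketch of how this routine might be driven end to end. The class name VideoProcessor, the method name process_video, and the exact parameter list are assumptions made for illustration; only the parameter names visible in the loop above and the three return values (output path, summary text, aggregated stats) come from the diff itself.

# Hypothetical usage sketch -- names and defaults below are assumed, not taken from the diff.
from video_processor import VideoProcessor

processor = VideoProcessor()
output_path, summary_text, aggregated_stats = processor.process_video(
    video_path="input.mp4",        # local video file to analyze
    model_name="yolov8n.pt",       # detection model identifier (assumed format)
    confidence_threshold=0.25,
    process_interval=5,            # run detection on every 5th frame
    scene_desc_interval_sec=3,     # refresh the scene description roughly every 3 seconds
)
print(summary_text)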