DawnC committed on
Commit
3172319
·
verified ·
1 Parent(s): de894d3

Upload 27 files

Browse files
.gitattributes CHANGED
@@ -37,3 +37,5 @@ room_01.jpg filter=lfs diff=lfs merge=lfs -text
37
  street_01.jpg filter=lfs diff=lfs merge=lfs -text
38
  street_02.jpg filter=lfs diff=lfs merge=lfs -text
39
  street_03.jpg filter=lfs diff=lfs merge=lfs -text
 
 
 
37
  street_01.jpg filter=lfs diff=lfs merge=lfs -text
38
  street_02.jpg filter=lfs diff=lfs merge=lfs -text
39
  street_03.jpg filter=lfs diff=lfs merge=lfs -text
40
+ room_02.jpg filter=lfs diff=lfs merge=lfs -text
41
+ street_04.jpg filter=lfs diff=lfs merge=lfs -text
activity_templates.py ADDED
@@ -0,0 +1,324 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ ACTIVITY_TEMPLATES = {
3
+ "living_room": [
4
+ "Watching TV",
5
+ "Relaxing on the sofa",
6
+ "Reading",
7
+ "Socializing"
8
+ ],
9
+ "bedroom": [
10
+ "Sleeping",
11
+ "Resting",
12
+ "Getting dressed",
13
+ "Reading in bed"
14
+ ],
15
+ "dining_area": [
16
+ "Eating a meal",
17
+ "Having a conversation",
18
+ "Working at table"
19
+ ],
20
+ "kitchen": [
21
+ "Cooking",
22
+ "Food preparation",
23
+ "Cleaning dishes"
24
+ ],
25
+ "office_workspace": [
26
+ "Working on computer",
27
+ "Office work",
28
+ "Virtual meetings",
29
+ "Reading documents"
30
+ ],
31
+ "meeting_room": [
32
+ "Group meeting",
33
+ "Presentation",
34
+ "Team discussion",
35
+ "Collaboration"
36
+ ],
37
+ "city_street": [
38
+ "Walking",
39
+ "Commuting",
40
+ "Shopping",
41
+ "Waiting for transportation"
42
+ ],
43
+ "parking_lot": [
44
+ "Parking vehicles",
45
+ "Loading/unloading items",
46
+ "Entering/exiting vehicles"
47
+ ],
48
+ "park_area": [
49
+ "Walking",
50
+ "Relaxing outdoors",
51
+ "Exercising",
52
+ "Social gathering"
53
+ ],
54
+ "retail_store": [
55
+ "Shopping",
56
+ "Browsing products",
57
+ "Purchasing items"
58
+ ],
59
+ "supermarket": [
60
+ "Grocery shopping",
61
+ "Selecting products",
62
+ "Checking out"
63
+ ],
64
+ "upscale_dining": [
65
+ "Fine dining",
66
+ "Social gathering",
67
+ "Special occasion meal",
68
+ "Family dinner",
69
+ "Business meeting",
70
+ "Celebratory meal"
71
+ ],
72
+ "asian_commercial_street": [
73
+ "Shopping",
74
+ "Sightseeing",
75
+ "Walking to destinations",
76
+ "Visiting local shops",
77
+ "Cultural exploration",
78
+ "Urban commuting",
79
+ "Meeting friends"
80
+ ],
81
+ "financial_district": [
82
+ "Commuting",
83
+ "Business travel",
84
+ "Urban transit",
85
+ "Sightseeing",
86
+ "City navigation",
87
+ "Professional activities",
88
+ "Corporate meetings"
89
+ ],
90
+ "urban_intersection": [
91
+ "Street crossing",
92
+ "Waiting for signals",
93
+ "Urban navigation",
94
+ "Commuting",
95
+ "Group movement",
96
+ "Following traffic patterns",
97
+ "Pedestrian coordination"
98
+ ],
99
+ "transit_hub": [
100
+ "Commuting",
101
+ "Waiting for transportation",
102
+ "Transferring between vehicles",
103
+ "Starting/ending journeys",
104
+ "Meeting travelers",
105
+ "Checking transit schedules",
106
+ "Urban transportation"
107
+ ],
108
+ "shopping_district": [
109
+ "Retail shopping",
110
+ "Window browsing",
111
+ "Social shopping",
112
+ "Product comparison",
113
+ "Making purchases",
114
+ "Brand exploration",
115
+ "Recreational shopping"
116
+ ],
117
+ "bus_stop": [
118
+ "Waiting for the bus",
119
+ "Checking schedules",
120
+ "Boarding or alighting",
121
+ "Standing under shelter"
122
+ ],
123
+ "bus_station": [
124
+ "Navigating between platforms",
125
+ "Handling luggage",
126
+ "Boarding buses",
127
+ "Gathering at waiting areas"
128
+ ],
129
+ "zoo": [
130
+ "Watching animal exhibits",
131
+ "Taking photos of wildlife",
132
+ "Walking along enclosures",
133
+ "Reading informational signs"
134
+ ],
135
+ "harbor": [
136
+ "Observing docked boats",
137
+ "Commuting by watercraft",
138
+ "Loading or unloading cargo",
139
+ "Strolling along the pier"
140
+ ],
141
+ "playground": [
142
+ "Playing ball games",
143
+ "Swinging or sliding",
144
+ "Running around",
145
+ "Socializing with friends"
146
+ ],
147
+ "sports_field": [
148
+ "Practicing ball drills",
149
+ "Competing in matches",
150
+ "Warming up or stretching",
151
+ "Team training sessions"
152
+ ],
153
+ "narrow_commercial_alley": [
154
+ "Walking through alley",
155
+ "Browsing storefronts",
156
+ "Navigating light traffic",
157
+ "Carrying shopping bags"
158
+ ],
159
+ "daytime_shopping_street": [
160
+ "Shopping",
161
+ "Window browsing",
162
+ "Street photography",
163
+ "Commuting by vehicle"
164
+ ],
165
+ "urban_pedestrian_crossing": [
166
+ "Crossing the street",
167
+ "Waiting for signal",
168
+ "Following traffic rules",
169
+ "Checking for vehicles"
170
+ ],
171
+ "aerial_view_intersection": [
172
+ "Crossing multiple directions",
173
+ "Following traffic signals",
174
+ "Navigating pedestrian paths",
175
+ "Traffic management",
176
+ "Multi-directional movement",
177
+ "Organized crossing patterns",
178
+ "Waiting at signals"
179
+ ],
180
+ "aerial_view_commercial_area": [
181
+ "Shopping district navigation",
182
+ "Retail browsing",
183
+ "Store-to-store movement",
184
+ "Commercial zone foot traffic",
185
+ "Shopping center traversal",
186
+ "Retail area engagement",
187
+ "Walking between stores"
188
+ ],
189
+ "aerial_view_plaza": [
190
+ "Public gathering",
191
+ "Open space traversal",
192
+ "Community congregation",
193
+ "Plaza navigation",
194
+ "Public square activities",
195
+ "Urban space utilization"
196
+ ],
197
+ "asian_night_market": [
198
+ "Street food sampling",
199
+ "Night market browsing",
200
+ "Evening shopping",
201
+ "Cultural food exploration",
202
+ "Vendor interaction",
203
+ "Social night dining",
204
+ "Market stall hopping"
205
+ ],
206
+ "asian_temple_area": [
207
+ "Temple visiting",
208
+ "Cultural site exploration",
209
+ "Spiritual observance",
210
+ "Traditional rituals",
211
+ "Historical site appreciation",
212
+ "Religious tourism",
213
+ "Cultural photography"
214
+ ],
215
+ "european_plaza": [
216
+ "Urban sightseeing",
217
+ "Historical appreciation",
218
+ "Tourist photography",
219
+ "Public space relaxation",
220
+ "Casual strolling"
221
+ ],
222
+ "nighttime_street": [
223
+ "Evening commuting",
224
+ "Night walking",
225
+ "After-hours travel",
226
+ "Nighttime navigation",
227
+ "Evening errands",
228
+ "Late-night transportation",
229
+ "Nocturnal urban movement"
230
+ ],
231
+ "nighttime_commercial_district": [
232
+ "Evening shopping",
233
+ "Nightlife participation",
234
+ "Nighttime entertainment",
235
+ "After-dark dining",
236
+ "Evening social gathering",
237
+ "Night market browsing",
238
+ "Illumination appreciation"
239
+ ],
240
+ "indoor_outdoor_cafe": [
241
+ "Al fresco dining",
242
+ "Sidewalk coffee enjoyment",
243
+ "Indoor-outdoor socializing",
244
+ "Patio relaxation",
245
+ "Open-air refreshment",
246
+ "Transitional space usage",
247
+ "Weather-dependent positioning"
248
+ ],
249
+ "transit_station_platform": [
250
+ "Transit waiting",
251
+ "Platform navigation",
252
+ "Boarding preparation",
253
+ "Arrival monitoring",
254
+ "Schedule checking",
255
+ "Departure positioning",
256
+ "Platform traversal"
257
+ ],
258
+ "sports_stadium": [
259
+ "Spectator viewing",
260
+ "Sports fan cheering",
261
+ "Game attendance",
262
+ "Stadium navigation",
263
+ "Athletic event watching",
264
+ "Audience participation",
265
+ "Sports appreciation"
266
+ ],
267
+ "construction_site": [
268
+ "Construction work",
269
+ "Building development",
270
+ "Site management",
271
+ "Material handling",
272
+ "Construction supervision",
273
+ "Safety monitoring",
274
+ "Building process"
275
+ ],
276
+ "medical_facility": [
277
+ "Healthcare consultation",
278
+ "Medical treatment",
279
+ "Patient waiting",
280
+ "Healthcare delivery",
281
+ "Medical examination",
282
+ "Professional care",
283
+ "Health monitoring"
284
+ ],
285
+ "educational_setting": [
286
+ "Classroom learning",
287
+ "Educational instruction",
288
+ "Student participation",
289
+ "Academic engagement",
290
+ "Knowledge acquisition",
291
+ "Educational discussion",
292
+ "Scholastic activities"
293
+ ],
294
+ "beach_water_recreation": [
295
+ "Surfing",
296
+ "Sunbathing",
297
+ "Beach volleyball",
298
+ "Swimming",
299
+ "Relaxing by the water",
300
+ "Flying beach kites",
301
+ "Beach picnicking",
302
+ "Coastal walking"
303
+ ],
304
+ "sports_venue": [
305
+ "Professional game playing",
306
+ "Sports competition",
307
+ "Athletic training",
308
+ "Team practice",
309
+ "Spectator viewing",
310
+ "Sports coaching",
311
+ "Tournament participation",
312
+ "Athletic performance"
313
+ ],
314
+ "professional_kitchen": [
315
+ "Professional cooking",
316
+ "Food preparation",
317
+ "Meal service coordination",
318
+ "Kitchen operations",
319
+ "Culinary production",
320
+ "Chef activities",
321
+ "Commercial food handling",
322
+ "Restaurant meal preparation"
323
+ ]
324
+ }
app.py CHANGED
@@ -63,48 +63,102 @@ def process_and_plot(image, model_name, confidence_threshold, filter_classes=Non
63
  filter_classes: Optional list of classes to filter results
64
 
65
  Returns:
66
- Tuple of (result_image, result_text, formatted_stats, plot_figure)
67
  """
68
- class_ids = None
69
- if filter_classes:
70
- class_ids = []
71
- for class_str in filter_classes:
72
- try:
73
- # Extract ID from format "id: name"
74
- class_id = int(class_str.split(":")[0].strip())
75
- class_ids.append(class_id)
76
- except:
77
- continue
78
-
79
- # Execute detection
80
- result_image, result_text, stats = image_processor.process_image(
81
- image,
82
- model_name,
83
- confidence_threshold,
84
- class_ids
85
- )
86
-
87
- # Format the statistics for better display
88
- formatted_stats = image_processor.format_json_for_display(stats)
89
-
90
- if not stats or "class_statistics" not in stats or not stats["class_statistics"]:
91
- # Create the table
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92
  fig, ax = plt.subplots(figsize=(8, 6))
93
- ax.text(0.5, 0.5, "No detection data available",
94
- ha='center', va='center', fontsize=14, fontfamily='Arial')
95
  ax.set_xlim(0, 1)
96
  ax.set_ylim(0, 1)
97
  ax.axis('off')
98
- plot_figure = fig
99
- else:
100
- # Prepare visualization data
101
- available_classes = dict(get_all_classes())
102
- viz_data = image_processor.prepare_visualization_data(stats, available_classes)
103
-
104
- # Create plot
105
- plot_figure = EvaluationMetrics.create_enhanced_stats_plot(viz_data)
106
 
107
- return result_image, result_text, formatted_stats, plot_figure
 
108
 
109
  def create_interface():
110
  """創建 Gradio 界面,包含美化的視覺效果"""
@@ -121,19 +175,43 @@ def create_interface():
121
 
122
  # 創建 Gradio Blocks 界面
123
  with gr.Blocks(css=css, theme=gr.themes.Soft(primary_hue="teal", secondary_hue="blue")) as demo:
124
- # 頁面頂部標題
125
  with gr.Group(elem_classes="app-header"):
126
- gr.HTML("""
127
- <div style="text-align: center; width: 100%;">
128
- <h1 class="app-title">VisionScout</h1>
129
- <h2 class="app-subtitle">Detect and identify objects in your images</h2>
130
- <div class="app-divider"></div>
131
- </div>
132
- """)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
133
 
134
  current_model = gr.State("yolov8m.pt") # use medium-size model as default
 
136
- # 主要內容區
137
  with gr.Row(equal_height=True):
138
  # 左側 - 輸入控制區(可上傳圖片)
139
  with gr.Column(scale=4, elem_classes="input-panel"):
@@ -208,8 +286,8 @@ def create_interface():
208
  # 文本框設置,讓顯示會更寬
209
  result_text = gr.Textbox(
210
  label=None,
211
- lines=12,
212
- max_lines=15,
213
  elem_classes="wide-result-text",
214
  elem_id="detection-details",
215
  container=False,
@@ -217,6 +295,57 @@ def create_interface():
217
  min_width=600
218
  )
219
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
220
  with gr.Tab("Statistics"):
221
  with gr.Row():
222
  with gr.Column(scale=3, elem_classes="plot-column"):
@@ -235,10 +364,14 @@ def create_interface():
235
  )
236
 
237
  detect_btn.click(
238
- fn=process_and_plot,
239
- inputs=[image_input, current_model, confidence, class_filter],
240
- outputs=[result_image, result_text, stats_json, plot_output]
241
- )
 
 
 
 
242
 
243
  # model option
244
  model_dropdown.change(
@@ -276,9 +409,9 @@ def create_interface():
276
 
277
  example_images = [
278
  "room_01.jpg",
279
- "street_01.jpg",
280
  "street_02.jpg",
281
- "street_03.jpg"
282
  ]
283
 
284
  # add example images
 
63
  filter_classes: Optional list of classes to filter results
64
 
65
  Returns:
66
+ Tuple of results including lighting conditions
67
  """
68
+ try:
69
+ class_ids = None
70
+ if filter_classes:
71
+ class_ids = []
72
+ for class_str in filter_classes:
73
+ try:
74
+ # Extract ID from format "id: name"
75
+ class_id = int(class_str.split(":")[0].strip())
76
+ class_ids.append(class_id)
77
+ except:
78
+ continue
79
+
80
+ # Execute detection
81
+ result_image, result_text, stats = image_processor.process_image(
82
+ image,
83
+ model_name,
84
+ confidence_threshold,
85
+ class_ids
86
+ )
87
+
88
+ # Format the statistics for better display
89
+ formatted_stats = image_processor.format_json_for_display(stats)
90
+
91
+ if not stats or "class_statistics" not in stats or not stats["class_statistics"]:
92
+ # Create the table
93
+ fig, ax = plt.subplots(figsize=(8, 6))
94
+ ax.text(0.5, 0.5, "No detection data available",
95
+ ha='center', va='center', fontsize=14, fontfamily='Arial')
96
+ ax.set_xlim(0, 1)
97
+ ax.set_ylim(0, 1)
98
+ ax.axis('off')
99
+ plot_figure = fig
100
+ else:
101
+ # Prepare visualization data
102
+ available_classes = dict(get_all_classes())
103
+ viz_data = image_processor.prepare_visualization_data(stats, available_classes)
104
+
105
+ # Create plot
106
+ plot_figure = EvaluationMetrics.create_enhanced_stats_plot(viz_data)
107
+
108
+ # Extract scene analysis info
109
+ scene_analysis = stats.get("scene_analysis", {})
110
+
111
+ scene_desc = scene_analysis.get("description", "No scene analysis available.")
112
+ scene_desc = scene_desc.strip()
113
+
114
+ # HTML format
115
+ scene_desc_html = f"""
116
+ <div id='scene-desc-container' style='width:100%; padding:20px; text-align:center; background-color:#f5f9fc; border-radius:8px; margin:10px auto; min-height:200px; max-height:none; overflow-y:auto;'>
117
+ <div style='width:100%; text-align:center; margin:0 auto; font-family:Arial, sans-serif; font-size:14px; line-height:1.8;'>
118
+ {scene_desc}
119
+ </div>
120
+ </div>
121
+ """
122
+
123
+ # Extract lighting conditions
124
+ lighting_conditions = scene_analysis.get("lighting_conditions",
125
+ {"time_of_day": "unknown", "confidence": 0.0})
126
+
127
+ # 準備活動列表
128
+ activities = scene_analysis.get("possible_activities", [])
129
+ if not activities:
130
+ activities_data = [["No activities detected"]]
131
+ else:
132
+ activities_data = [[activity] for activity in activities]
133
+
134
+ # 準備安全注意事項列表
135
+ safety_concerns = scene_analysis.get("safety_concerns", [])
136
+ if not safety_concerns:
137
+ safety_data = [["No safety concerns detected"]]
138
+ else:
139
+ safety_data = [[concern] for concern in safety_concerns]
140
+
141
+ # 功能區域
142
+ zones = scene_analysis.get("functional_zones", {})
143
+
144
+ return result_image, result_text, formatted_stats, plot_figure, scene_desc, activities_data, safety_data, zones, lighting_conditions
145
+
146
+ except Exception as e:
147
+ # 添加錯誤處理,確保即使出錯也能返回有效的數據
148
+ import traceback
149
+ error_msg = f"Error processing image: {str(e)}\n{traceback.format_exc()}"
150
+ print(error_msg)
151
+
152
+ # 創建一個簡單的錯誤圖
153
  fig, ax = plt.subplots(figsize=(8, 6))
154
+ ax.text(0.5, 0.5, f"Error: {str(e)}",
155
+ ha='center', va='center', fontsize=14, fontfamily='Arial', color='red')
156
  ax.set_xlim(0, 1)
157
  ax.set_ylim(0, 1)
158
  ax.axis('off')
 
 
 
 
 
 
 
 
159
 
160
+ # 返回有效的默認值
161
+ return None, error_msg, "{}", fig, "Error processing image", [["No activities"]], [["No safety concerns"]], {}, {"time_of_day": "unknown", "confidence": 0}
162
 
163
  def create_interface():
164
  """創建 Gradio 界面,包含美化的視覺效果"""
 
175
 
176
  # 創建 Gradio Blocks 界面
177
  with gr.Blocks(css=css, theme=gr.themes.Soft(primary_hue="teal", secondary_hue="blue")) as demo:
178
+ # 主頁頂部的標題
179
  with gr.Group(elem_classes="app-header"):
180
+ gr.HTML("""
181
+ <div style="text-align: center; width: 100%; padding: 2rem 0 3rem 0; background: linear-gradient(135deg, #f0f9ff, #e1f5fe);">
182
+ <h1 style="font-size: 3.5rem; margin-bottom: 0.5rem; background: linear-gradient(90deg, #38b2ac, #4299e1); -webkit-background-clip: text; -webkit-text-fill-color: transparent; font-weight: bold; font-family: 'Arial', sans-serif;">VisionScout</h1>
183
+
184
+ <h2 style="color: #4A5568; font-size: 1.2rem; font-weight: 400; margin-top: 0.5rem; margin-bottom: 1.5rem; font-family: 'Arial', sans-serif;">Detect and identify objects in your images</h2>
185
+
186
+ <div style="display: flex; justify-content: center; gap: 10px; margin: 0.5rem 0;">
187
+ <div style="height: 3px; width: 80px; background: linear-gradient(90deg, #38b2ac, #4299e1);"></div>
188
+ </div>
189
+
190
+ <div style="display: flex; justify-content: center; gap: 25px; margin-top: 1.5rem;">
191
+ <div style="padding: 8px 15px; border-radius: 20px; background: rgba(66, 153, 225, 0.15); color: #2b6cb0; font-weight: 500; font-size: 0.9rem;">
192
+ <span style="margin-right: 6px;">🔍</span> Object Detection
193
+ </div>
194
+ <div style="padding: 8px 15px; border-radius: 20px; background: rgba(56, 178, 172, 0.15); color: #2b6cb0; font-weight: 500; font-size: 0.9rem;">
195
+ <span style="margin-right: 6px;">🌐</span> Scene Understanding
196
+ </div>
197
+ <div style="padding: 8px 15px; border-radius: 20px; background: rgba(66, 153, 225, 0.15); color: #2b6cb0; font-weight: 500; font-size: 0.9rem;">
198
+ <span style="margin-right: 6px;">📊</span> Visual Analysis
199
+ </div>
200
+ </div>
201
+
202
+ <div style="margin-top: 20px; padding: 10px 15px; background-color: rgba(255, 248, 230, 0.9); border-left: 3px solid #f6ad55; border-radius: 6px; max-width: 600px; margin-left: auto; margin-right: auto; text-align: left;">
203
+ <p style="margin: 0; font-size: 0.9rem; color: #805ad5; font-weight: 500;">
204
+ <span style="margin-right: 5px;">📱</span> iPhone users: HEIC images are not supported.
205
+ <a href="https://cloudconvert.com/heic-to-jpg" target="_blank" style="color: #3182ce; text-decoration: underline;">Convert HEIC to JPG here</a> before uploading.
206
+ </p>
207
+ </div>
208
+ </div>
209
+ """)
210
+
211
 
212
  current_model = gr.State("yolov8m.pt") # use medium size model as defualt
213
 
214
+ # 主要內容區
215
  with gr.Row(equal_height=True):
216
  # 左側 - 輸入控制區(可上傳圖片)
217
  with gr.Column(scale=4, elem_classes="input-panel"):
 
286
  # 文本框設置,讓顯示會更寬
287
  result_text = gr.Textbox(
288
  label=None,
289
+ lines=15,
290
+ max_lines=20,
291
  elem_classes="wide-result-text",
292
  elem_id="detection-details",
293
  container=False,
 
295
  min_width=600
296
  )
297
 
298
+ # Scene Analysis
299
+ with gr.Tab("Scene Understanding", elem_classes="scene-understanding-tab"):
300
+ with gr.Group(elem_classes="result-details-box"):
301
+ gr.HTML("""
302
+ <div class="section-heading">Scene Analysis</div>
303
+ <details class="info-details" style="margin: 5px 0 15px 0;">
304
+ <summary style="padding: 8px; background-color: #f0f7ff; border-radius: 6px; border-left: 3px solid #4299e1; font-weight: bold; cursor: pointer; color: #2b6cb0;">
305
+ 🔍 The AI Vision Scout Report: Click for important notes about this analysis
306
+ </summary>
307
+ <div style="margin-top: 8px; padding: 10px; background-color: #f8f9fa; border-radius: 6px; border: 1px solid #e2e8f0;">
308
+ <p style="font-size: 13px; color: #718096; margin: 0;">
309
+ <b>About this analysis:</b> This analysis is the model's best guess based on visible objects.
310
+ Like human scouts, it sometimes gets lost or sees things that aren't there (but don't we all?).
311
+ Consider this an educated opinion rather than absolute truth. For critical applications, always verify with human eyes! 🧐
312
+ </p>
313
+ </div>
314
+ </details>
315
+ """)
316
+
317
+ # 使用更適合長文本的容器
318
+ with gr.Group(elem_classes="scene-description-container"):
319
+ scene_description = gr.HTML(
320
+ value="<div id='scene-desc-container'></div>",
321
+ label="Scene Description"
322
+ )
323
+
324
+ with gr.Row():
325
+ with gr.Column(scale=2):
326
+ activities_list = gr.Dataframe(
327
+ headers=["Activities"],
328
+ datatype=["str"],
329
+ col_count=1,
330
+ row_count=5,
331
+ elem_classes="full-width-element"
332
+ )
333
+
334
+ with gr.Column(scale=2):
335
+ safety_list = gr.Dataframe(
336
+ headers=["Safety Concerns"],
337
+ datatype=["str"],
338
+ col_count=1,
339
+ row_count=5,
340
+ elem_classes="full-width-element"
341
+ )
342
+
343
+ gr.HTML('<div class="section-heading">Functional Zones</div>')
344
+ zones_json = gr.JSON(label=None, elem_classes="json-box")
345
+
346
+ gr.HTML('<div class="section-heading">Lighting Conditions</div>')
347
+ lighting_info = gr.JSON(label=None, elem_classes="json-box")
348
+
349
  with gr.Tab("Statistics"):
350
  with gr.Row():
351
  with gr.Column(scale=3, elem_classes="plot-column"):
 
364
  )
365
 
366
  detect_btn.click(
367
+ fn=process_and_plot,
368
+ inputs=[image_input, current_model, confidence, class_filter],
369
+ outputs=[
370
+ result_image, result_text, stats_json, plot_output,
371
+ scene_description, activities_list, safety_list, zones_json,
372
+ lighting_info
373
+ ]
374
+ )
375
 
376
  # model option
377
  model_dropdown.change(
 
409
 
410
  example_images = [
411
  "room_01.jpg",
412
+ "room_02.jpg",
413
  "street_02.jpg",
414
+ "street_04.jpg"
415
  ]
416
 
417
  # add example images
clip_analyzer.py ADDED
@@ -0,0 +1,389 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import clip
3
+ import numpy as np
4
+ from PIL import Image
5
+ from typing import Dict, List, Tuple, Any, Optional, Union
6
+ from clip_prompts import (
7
+ SCENE_TYPE_PROMPTS,
8
+ CULTURAL_SCENE_PROMPTS,
9
+ COMPARATIVE_PROMPTS,
10
+ LIGHTING_CONDITION_PROMPTS,
11
+ SPECIALIZED_SCENE_PROMPTS,
12
+ VIEWPOINT_PROMPTS,
13
+ OBJECT_COMBINATION_PROMPTS,
14
+ ACTIVITY_PROMPTS
15
+ )
16
+
17
+ class CLIPAnalyzer:
18
+ """
19
+ Use CLIP to integrate scene-understanding functionality.
20
+ """
21
+
22
+ def __init__(self, model_name: str = "ViT-B/32", device: str = None):
23
+ """
24
+ 初始化 CLIP 分析器。
25
+
26
+ Args:
27
+ model_name: CLIP Model name, "ViT-B/32"、"ViT-B/16"、"ViT-L/14"
28
+ device: Use GPU if it can use
29
+ """
30
+ # 自動選擇設備
31
+ if device is None:
32
+ self.device = "cuda" if torch.cuda.is_available() else "cpu"
33
+ else:
34
+ self.device = device
35
+
36
+ print(f"Loading CLIP model {model_name} on {self.device}...")
37
+ try:
38
+ self.model, self.preprocess = clip.load(model_name, device=self.device)
39
+ print(f"CLIP model loaded successfully.")
40
+ except Exception as e:
41
+ print(f"Error loading CLIP model: {e}")
42
+ raise
43
+
44
+ self.scene_type_prompts = SCENE_TYPE_PROMPTS
45
+ self.cultural_scene_prompts = CULTURAL_SCENE_PROMPTS
46
+ self.comparative_prompts = COMPARATIVE_PROMPTS
47
+ self.lighting_condition_prompts = LIGHTING_CONDITION_PROMPTS
48
+ self.specialized_scene_prompts = SPECIALIZED_SCENE_PROMPTS
49
+ self.viewpoint_prompts = VIEWPOINT_PROMPTS
50
+ self.object_combination_prompts = OBJECT_COMBINATION_PROMPTS
51
+ self.activity_prompts = ACTIVITY_PROMPTS
52
+
53
+ # turn to CLIP format
54
+ self._prepare_text_prompts()
55
+
56
+ def _prepare_text_prompts(self):
57
+ """準備所有文本提示的 CLIP 特徵"""
58
+ # base prompt
59
+ scene_texts = [self.scene_type_prompts[scene_type] for scene_type in self.scene_type_prompts]
60
+ self.scene_type_tokens = clip.tokenize(scene_texts).to(self.device)
61
+
62
+ # cultural
63
+ self.cultural_tokens_dict = {}
64
+ for scene_type, prompts in self.cultural_scene_prompts.items():
65
+ self.cultural_tokens_dict[scene_type] = clip.tokenize(prompts).to(self.device)
66
+
67
+ # Light
68
+ lighting_texts = [self.lighting_condition_prompts[cond] for cond in self.lighting_condition_prompts]
69
+ self.lighting_tokens = clip.tokenize(lighting_texts).to(self.device)
70
+
71
+ # specializes_status
72
+ self.specialized_tokens_dict = {}
73
+ for scene_type, prompts in self.specialized_scene_prompts.items():
74
+ self.specialized_tokens_dict[scene_type] = clip.tokenize(prompts).to(self.device)
75
+
76
+ # view point
77
+ viewpoint_texts = [self.viewpoint_prompts[viewpoint] for viewpoint in self.viewpoint_prompts]
78
+ self.viewpoint_tokens = clip.tokenize(viewpoint_texts).to(self.device)
79
+
80
+ # object combination
81
+ object_combination_texts = [self.object_combination_prompts[combo] for combo in self.object_combination_prompts]
82
+ self.object_combination_tokens = clip.tokenize(object_combination_texts).to(self.device)
83
+
84
+ # activicty prompt
85
+ activity_texts = [self.activity_prompts[activity] for activity in self.activity_prompts]
86
+ self.activity_tokens = clip.tokenize(activity_texts).to(self.device)
87
+
88
+ def analyze_image(self, image, include_cultural_analysis: bool = True) -> Dict[str, Any]:
89
+ """
90
+ 分析圖像,預測場景類型和光照條件。
91
+
92
+ Args:
93
+ image: 輸入圖像 (PIL Image 或 numpy array)
94
+ include_cultural_analysis: 是否包含文化場景的詳細分析
95
+
96
+ Returns:
97
+ Dict: 包含場景類型預測和光照條件的分析結果
98
+ """
99
+ try:
100
+ # 確保圖像是 PIL 格式
101
+ if not isinstance(image, Image.Image):
102
+ if isinstance(image, np.ndarray):
103
+ image = Image.fromarray(image)
104
+ else:
105
+ raise ValueError("Unsupported image format. Expected PIL Image or numpy array.")
106
+
107
+ # 預處理圖像
108
+ image_input = self.preprocess(image).unsqueeze(0).to(self.device)
109
+
110
+ # 獲取圖像特徵
111
+ with torch.no_grad():
112
+ image_features = self.model.encode_image(image_input)
113
+ image_features = image_features / image_features.norm(dim=-1, keepdim=True)
114
+
115
+ # 分析場景類型
116
+ scene_scores = self._analyze_scene_type(image_features)
117
+
118
+ # 分析光照條件
119
+ lighting_scores = self._analyze_lighting_condition(image_features)
120
+
121
+ # 文化場景的增強分析
122
+ cultural_analysis = {}
123
+ if include_cultural_analysis:
124
+ for scene_type in self.cultural_scene_prompts:
125
+ if scene_type in scene_scores and scene_scores[scene_type] > 0.2:
126
+ cultural_analysis[scene_type] = self._analyze_cultural_scene(
127
+ image_features, scene_type
128
+ )
129
+
130
+ specialized_analysis = {}
131
+ for scene_type in self.specialized_scene_prompts:
132
+ if scene_type in scene_scores and scene_scores[scene_type] > 0.2:
133
+ specialized_analysis[scene_type] = self._analyze_specialized_scene(
134
+ image_features, scene_type
135
+ )
136
+
137
+ viewpoint_scores = self._analyze_viewpoint(image_features)
138
+
139
+ object_combination_scores = self._analyze_object_combinations(image_features)
140
+
141
+ activity_scores = self._analyze_activities(image_features)
142
+
143
+ # display results
144
+ result = {
145
+ "scene_scores": scene_scores,
146
+ "top_scene": max(scene_scores.items(), key=lambda x: x[1]),
147
+ "lighting_condition": max(lighting_scores.items(), key=lambda x: x[1]),
148
+ "embedding": image_features.cpu().numpy().tolist()[0] if self.device == "cuda" else image_features.numpy().tolist()[0],
149
+ "viewpoint": max(viewpoint_scores.items(), key=lambda x: x[1]),
150
+ "object_combinations": sorted(object_combination_scores.items(), key=lambda x: x[1], reverse=True)[:3],
151
+ "activities": sorted(activity_scores.items(), key=lambda x: x[1], reverse=True)[:3]
152
+ }
153
+
154
+ if cultural_analysis:
155
+ result["cultural_analysis"] = cultural_analysis
156
+
157
+ if specialized_analysis:
158
+ result["specialized_analysis"] = specialized_analysis
159
+
160
+ return result
161
+
162
+ except Exception as e:
163
+ print(f"Error analyzing image with CLIP: {e}")
164
+ import traceback
165
+ traceback.print_exc()
166
+ return {"error": str(e)}
167
+
168
+ def _analyze_scene_type(self, image_features: torch.Tensor) -> Dict[str, float]:
169
+ """分析圖像特徵與各場景類型的相似度"""
170
+ with torch.no_grad():
171
+ # 計算場景類型文本特徵
172
+ text_features = self.model.encode_text(self.scene_type_tokens)
173
+ text_features = text_features / text_features.norm(dim=-1, keepdim=True)
174
+
175
+ # 計算相似度分數
176
+ similarity = (100 * image_features @ text_features.T).softmax(dim=-1)
177
+ similarity = similarity.cpu().numpy()[0] if self.device == "cuda" else similarity.numpy()[0]
178
+
179
+ # 建立場景分數字典
180
+ scene_scores = {}
181
+ for i, scene_type in enumerate(self.scene_type_prompts.keys()):
182
+ scene_scores[scene_type] = float(similarity[i])
183
+
184
+ return scene_scores
185
+
186
+ def _analyze_lighting_condition(self, image_features: torch.Tensor) -> Dict[str, float]:
187
+ """分析圖像的光照條件"""
188
+ with torch.no_grad():
189
+ # 計算光照條件文本特徵
190
+ text_features = self.model.encode_text(self.lighting_tokens)
191
+ text_features = text_features / text_features.norm(dim=-1, keepdim=True)
192
+
193
+ # 計算相似度分數
194
+ similarity = (100 * image_features @ text_features.T).softmax(dim=-1)
195
+ similarity = similarity.cpu().numpy()[0] if self.device == "cuda" else similarity.numpy()[0]
196
+
197
+ # 建立光照條件分數字典
198
+ lighting_scores = {}
199
+ for i, lighting_type in enumerate(self.lighting_condition_prompts.keys()):
200
+ lighting_scores[lighting_type] = float(similarity[i])
201
+
202
+ return lighting_scores
203
+
204
+ def _analyze_cultural_scene(self, image_features: torch.Tensor, scene_type: str) -> Dict[str, Any]:
205
+ """針對特定文化場景進行深入分析"""
206
+ if scene_type not in self.cultural_tokens_dict:
207
+ return {"error": f"No cultural analysis available for {scene_type}"}
208
+
209
+ with torch.no_grad():
210
+ # 獲取特定文化場景的文本特徵
211
+ cultural_tokens = self.cultural_tokens_dict[scene_type]
212
+ text_features = self.model.encode_text(cultural_tokens)
213
+ text_features = text_features / text_features.norm(dim=-1, keepdim=True)
214
+
215
+ # 計算相似度分數
216
+ similarity = (100 * image_features @ text_features.T)
217
+ similarity = similarity.cpu().numpy()[0] if self.device == "cuda" else similarity.numpy()[0]
218
+
219
+ # 找到最匹配的文化描述
220
+ prompts = self.cultural_scene_prompts[scene_type]
221
+ scores = [(prompts[i], float(similarity[i])) for i in range(len(prompts))]
222
+ scores.sort(key=lambda x: x[1], reverse=True)
223
+
224
+ return {
225
+ "best_description": scores[0][0],
226
+ "confidence": scores[0][1],
227
+ "all_matches": scores
228
+ }
229
+
230
+ def _analyze_specialized_scene(self, image_features: torch.Tensor, scene_type: str) -> Dict[str, Any]:
231
+ """針對特定專門場景進行深入分析"""
232
+ if scene_type not in self.specialized_tokens_dict:
233
+ return {"error": f"No specialized analysis available for {scene_type}"}
234
+
235
+ with torch.no_grad():
236
+ # 獲取特定專門場景的文本特徵
237
+ specialized_tokens = self.specialized_tokens_dict[scene_type]
238
+ text_features = self.model.encode_text(specialized_tokens)
239
+ text_features = text_features / text_features.norm(dim=-1, keepdim=True)
240
+
241
+ # 計算相似度分數
242
+ similarity = (100 * image_features @ text_features.T)
243
+ similarity = similarity.cpu().numpy()[0] if self.device == "cuda" else similarity.numpy()[0]
244
+
245
+ # 找到最匹配的專門描述
246
+ prompts = self.specialized_scene_prompts[scene_type]
247
+ scores = [(prompts[i], float(similarity[i])) for i in range(len(prompts))]
248
+ scores.sort(key=lambda x: x[1], reverse=True)
249
+
250
+ return {
251
+ "best_description": scores[0][0],
252
+ "confidence": scores[0][1],
253
+ "all_matches": scores
254
+ }
255
+
256
+ def _analyze_viewpoint(self, image_features: torch.Tensor) -> Dict[str, float]:
257
+ """分析圖像的拍攝視角"""
258
+ with torch.no_grad():
259
+ # 計算視角文本特徵
260
+ text_features = self.model.encode_text(self.viewpoint_tokens)
261
+ text_features = text_features / text_features.norm(dim=-1, keepdim=True)
262
+
263
+ # 計算相似度分數
264
+ similarity = (100 * image_features @ text_features.T).softmax(dim=-1)
265
+ similarity = similarity.cpu().numpy()[0] if self.device == "cuda" else similarity.numpy()[0]
266
+
267
+ # 建立視角分數字典
268
+ viewpoint_scores = {}
269
+ for i, viewpoint in enumerate(self.viewpoint_prompts.keys()):
270
+ viewpoint_scores[viewpoint] = float(similarity[i])
271
+
272
+ return viewpoint_scores
273
+
274
+ def _analyze_object_combinations(self, image_features: torch.Tensor) -> Dict[str, float]:
275
+ """分析圖像中的物體組合"""
276
+ with torch.no_grad():
277
+ # 計算物體組合文本特徵
278
+ text_features = self.model.encode_text(self.object_combination_tokens)
279
+ text_features = text_features / text_features.norm(dim=-1, keepdim=True)
280
+
281
+ # 計算相似度分數
282
+ similarity = (100 * image_features @ text_features.T).softmax(dim=-1)
283
+ similarity = similarity.cpu().numpy()[0] if self.device == "cuda" else similarity.numpy()[0]
284
+
285
+ # 建立物體組合分數字典
286
+ combination_scores = {}
287
+ for i, combination in enumerate(self.object_combination_prompts.keys()):
288
+ combination_scores[combination] = float(similarity[i])
289
+
290
+ return combination_scores
291
+
292
+ def _analyze_activities(self, image_features: torch.Tensor) -> Dict[str, float]:
293
+ """分析圖像中的活動"""
294
+ with torch.no_grad():
295
+ # 計算活動文本特徵
296
+ text_features = self.model.encode_text(self.activity_tokens)
297
+ text_features = text_features / text_features.norm(dim=-1, keepdim=True)
298
+
299
+ # 計算相似度分數
300
+ similarity = (100 * image_features @ text_features.T).softmax(dim=-1)
301
+ similarity = similarity.cpu().numpy()[0] if self.device == "cuda" else similarity.numpy()[0]
302
+
303
+ # 建立活動分數字典
304
+ activity_scores = {}
305
+ for i, activity in enumerate(self.activity_prompts.keys()):
306
+ activity_scores[activity] = float(similarity[i])
307
+
308
+ return activity_scores
309
+
310
+ def get_image_embedding(self, image) -> np.ndarray:
311
+ """
312
+ 獲取圖像的 CLIP 嵌入表示
313
+
314
+ Args:
315
+ image: PIL Image 或 numpy array
316
+
317
+ Returns:
318
+ np.ndarray: 圖像的 CLIP 特徵向量
319
+ """
320
+ # 確保圖像是 PIL 格式
321
+ if not isinstance(image, Image.Image):
322
+ if isinstance(image, np.ndarray):
323
+ image = Image.fromarray(image)
324
+ else:
325
+ raise ValueError("Unsupported image format. Expected PIL Image or numpy array.")
326
+
327
+ # 預處理並編碼
328
+ image_input = self.preprocess(image).unsqueeze(0).to(self.device)
329
+
330
+ with torch.no_grad():
331
+ image_features = self.model.encode_image(image_input)
332
+ image_features = image_features / image_features.norm(dim=-1, keepdim=True)
333
+
334
+ # 轉換為 numpy 並返回
335
+ return image_features.cpu().numpy()[0] if self.device == "cuda" else image_features.numpy()[0]
336
+
337
+ def text_to_embedding(self, text: str) -> np.ndarray:
338
+ """
339
+ 將文本轉換為 CLIP 嵌入表示
340
+
341
+ Args:
342
+ text: 輸入文本
343
+
344
+ Returns:
345
+ np.ndarray: 文本的 CLIP 特徵向量
346
+ """
347
+ text_token = clip.tokenize([text]).to(self.device)
348
+
349
+ with torch.no_grad():
350
+ text_features = self.model.encode_text(text_token)
351
+ text_features = text_features / text_features.norm(dim=-1, keepdim=True)
352
+
353
+ return text_features.cpu().numpy()[0] if self.device == "cuda" else text_features.numpy()[0]
354
+
355
+ def calculate_similarity(self, image, text_queries: List[str]) -> Dict[str, float]:
356
+ """
357
+ 計算圖像與多個文本查詢的相似度
358
+
359
+ Args:
360
+ image: PIL Image 或 numpy array
361
+ text_queries: 文本查詢列表
362
+
363
+ Returns:
364
+ Dict: 每個查詢的相似度分數
365
+ """
366
+ # 獲取圖像嵌入
367
+ if isinstance(image, np.ndarray) and len(image.shape) == 1:
368
+ # 已經是嵌入向量
369
+ image_features = torch.tensor(image).unsqueeze(0).to(self.device)
370
+ else:
371
+ # 是圖像,需要提取嵌入
372
+ image_features = torch.tensor(self.get_image_embedding(image)).unsqueeze(0).to(self.device)
373
+
374
+ # calulate similarity
375
+ text_tokens = clip.tokenize(text_queries).to(self.device)
376
+
377
+ with torch.no_grad():
378
+ text_features = self.model.encode_text(text_tokens)
379
+ text_features = text_features / text_features.norm(dim=-1, keepdim=True)
380
+
381
+ similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1)
382
+ similarity = similarity.cpu().numpy()[0] if self.device == "cuda" else similarity.numpy()[0]
383
+
384
+ # display results
385
+ result = {}
386
+ for i, query in enumerate(text_queries):
387
+ result[query] = float(similarity[i])
388
+
389
+ return result
clip_prompts.py ADDED
@@ -0,0 +1,265 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
# Scene-type prompts: one representative CLIP text prompt per scene label,
# used for zero-shot scene classification against image embeddings.
SCENE_TYPE_PROMPTS = {
    # Basic indoor scenes
    "living_room": "A photo of a living room with furniture and entertainment systems.",
    "bedroom": "A photo of a bedroom with a bed and personal items.",
    "dining_area": "A photo of a dining area with a table and chairs for meals.",
    "kitchen": "A photo of a kitchen with cooking appliances and food preparation areas.",
    "office_workspace": "A photo of an office workspace with desk, computer and work equipment.",
    "meeting_room": "A photo of a meeting room with a conference table and multiple chairs.",

    # Basic outdoor / urban scenes
    "city_street": "A photo of a city street with traffic, pedestrians and urban buildings.",
    "parking_lot": "A photo of a parking lot with multiple parked vehicles.",
    "park_area": "A photo of a park or recreational area with greenery and outdoor facilities.",
    "retail_store": "A photo of a retail store with merchandise displays and shopping areas.",
    "supermarket": "A photo of a supermarket with food items, aisles and shopping carts.",

    # Special indoor scenes
    "upscale_dining": "A photo of an upscale dining area with elegant furniture and refined decor.",
    "conference_room": "A photo of a professional conference room with presentation equipment and seating.",
    "classroom": "A photo of a classroom with desks, chairs and educational equipment.",
    "library": "A photo of a library with bookshelves, reading areas and study spaces.",

    # Asian-specific scenes
    "asian_commercial_street": "A photo of an Asian commercial street with dense signage, shops and pedestrians.",
    "asian_night_market": "A photo of an Asian night market with food stalls, crowds and colorful lights.",
    "asian_temple_area": "A photo of an Asian temple with traditional architecture and cultural elements.",

    # Traffic-related scenes
    "financial_district": "A photo of a financial district with tall office buildings and business activity.",
    "urban_intersection": "A photo of an urban intersection with crosswalks, traffic lights and pedestrians crossing.",
    "transit_hub": "A photo of a transportation hub with multiple modes of public transit and passengers.",
    "bus_stop": "A photo of a bus stop with people waiting and buses arriving or departing.",
    "bus_station": "A photo of a bus terminal with multiple buses and traveler facilities.",
    "train_station": "A photo of a train station with platforms, trains and passenger activity.",
    "airport": "A photo of an airport with planes, terminals and traveler activity.",

    # Commercial scenes
    "shopping_district": "A photo of a shopping district with multiple retail stores and consumer activity.",
    "cafe": "A photo of a cafe with coffee service, seating and casual dining.",
    "restaurant": "A photo of a restaurant with dining tables, food service and eating areas.",

    # Aerial-view scenes
    "aerial_view_intersection": "An aerial view of an intersection showing crosswalks and traffic patterns from above.",
    "aerial_view_commercial_area": "An aerial view of a commercial area showing shopping districts from above.",
    "aerial_view_plaza": "An aerial view of a public plaza or square showing patterns of people movement from above.",

    # Entertainment scenes
    "zoo": "A photo of a zoo with animal enclosures, exhibits and visitors.",
    "playground": "A photo of a playground with recreational equipment and children playing.",
    "sports_field": "A photo of a sports field with playing surfaces and athletic equipment.",
    "sports_stadium": "A photo of a sports stadium with spectator seating and athletic facilities.",

    # Water-related scenes
    "harbor": "A photo of a harbor with boats, docks and waterfront activity.",
    "beach_water_recreation": "A photo of a beach area with water activities, sand and recreational equipment like surfboards.",

    # Culture- and time-specific scenes
    "nighttime_street": "A photo of a street at night with artificial lighting and evening activity.",
    "nighttime_commercial_district": "A photo of a commercial district at night with illuminated signs and evening shopping.",
    "european_plaza": "A photo of a European-style plaza with historic architecture and public gathering spaces.",

    # Mixed indoor/outdoor scenes
    "indoor_outdoor_cafe": "A photo of a cafe with both indoor seating and outdoor patio areas.",
    "transit_station_platform": "A photo of a transit station platform with waiting areas and arriving vehicles.",

    # Work scenes
    "construction_site": "A photo of a construction site with building materials, equipment and workers.",
    "medical_facility": "A photo of a medical facility with healthcare equipment and professional staff.",
    "educational_setting": "A photo of an educational setting with learning spaces and academic resources.",
    "professional_kitchen": "A photo of a professional commercial kitchen with industrial cooking equipment and food preparation stations."
}
74
+
75
# Culture-specific scene prompts: several alternative phrasings per scene so
# the best-matching description (and its score) can be reported.
CULTURAL_SCENE_PROMPTS = {
    "asian_commercial_street": [
        "A busy Asian shopping street with neon signs and dense storefronts.",
        "A commercial street in Asia with multi-level signage and narrow walkways.",
        "A street scene in Taiwan or Hong Kong with vertical signage and compact shops.",
        "A crowded commercial alley in an Asian city with signs in Chinese characters.",
        "A narrow shopping street in Asia with small shops on both sides.",
        "An outdoor shopping district in an East Asian city with electronic billboards.",
        "A bustling commercial street in Taiwan with food vendors and retail shops.",
        "A pedestrian shopping area with Korean or Chinese signs and storefronts.",
        "A daytime shopping street in an Asian urban center with vertical development."
    ],
    "asian_night_market": [
        "A vibrant night market in Asia with food stalls and large crowds.",
        "An evening street market in Taiwan with street food vendors and bright lights.",
        "A busy night bazaar in Asia with illuminated stalls and local food.",
        "A crowded night street food market in an Asian city with vendor carts.",
        "An Asian night market with steam from cooking food and hanging lanterns.",
        "A nocturnal food street in East Asia with vendor canopies and neon lights.",
        "A bustling evening market with rows of food stalls and plastic stools.",
        "A lively Asian street food scene at night with cooking stations and crowds."
    ],
    "asian_temple_area": [
        "A traditional Asian temple with ornate roof details and religious symbols.",
        "A Buddhist temple complex in East Asia with multiple pavilions and prayer areas.",
        "A sacred site in Asia with incense burners and ceremonial elements.",
        "A temple courtyard with stone statues and traditional Asian architecture.",
        "A spiritual center in East Asia with pagoda-style structures and visitors.",
        "An ancient temple site with Asian architectural elements and cultural symbols.",
        "A religious compound with characteristic Asian roof curves and decorative features."
    ],
    "european_plaza": [
        "A historic European city square with classical architecture and cafes.",
        "An old-world plaza in Europe with cobblestone paving and historic buildings.",
        "A public square in a European city with fountains and surrounding architecture.",
        "A central plaza in Europe with outdoor seating areas and historic monuments.",
        "A traditional European town square with surrounding shops and restaurants.",
        "A historic gathering space in Europe with distinctive architecture and pedestrians."
    ]
}
116
+
117
# Comparative category prompts: paired contrasting descriptions used to
# disambiguate scene attributes (each list alternates between the two sides
# of the contrast).
COMPARATIVE_PROMPTS = {
    "indoor_vs_outdoor": [
        "An indoor shopping mall corridor with controlled lighting and storefronts.",
        "An outdoor commercial street with natural lighting and urban storefronts.",
        "An enclosed shopping gallery with artificial lighting and climate control.",
        "An open-air market street with natural light and weather exposure."
    ],
    "professional_vs_home": [
        "A professional commercial kitchen with stainless steel equipment and workstations.",
        "A home kitchen with residential appliances and family cooking space.",
        "A restaurant kitchen with multiple cooking stations and chef activity.",
        "A family kitchen with standard household equipment and personal touches."
    ],
    "sports_venue_vs_park": [
        "A professional sports stadium with designated playing areas and audience seating.",
        "A public park with casual recreation space and community greenery.",
        "An athletic venue with specialized sports equipment and competitive playing surfaces.",
        "An outdoor community space with general purpose areas and natural elements."
    ],
    "asian_vs_western_commercial": [
        "An Asian shopping street with vertical signage and compact multi-level shops.",
        "A Western commercial street with horizontal storefronts and wider sidewalks.",
        "An East Asian retail area with dense signage in Asian scripts and narrow walkways.",
        "A Western shopping district with uniform building heights and Latin alphabetic signs."
    ],
    "daytime_vs_nighttime": [
        "A daytime urban scene with natural sunlight illuminating streets and buildings.",
        "A nighttime city scene with artificial lighting from stores, signs and streetlights.",
        "A commercial district during daylight hours with natural shadows and visibility.",
        "An evening urban setting with illuminated storefronts and light patterns on streets."
    ],
    "aerial_vs_street_level": [
        "An aerial view showing urban patterns and layouts from above.",
        "A street-level view showing pedestrian perspective and immediate surroundings.",
        "A bird's-eye view of city organization and movement patterns from high above.",
        "An eye-level perspective showing direct human interaction with urban elements."
    ]
}
156
+
157
# Lighting-condition text prompts: one prompt per lighting category, used to
# estimate illumination conditions from an image embedding.
LIGHTING_CONDITION_PROMPTS = {
    "day_clear": "A photo taken during daytime with clear skies and direct sunlight.",
    "day_cloudy": "A photo taken during daytime with overcast conditions and diffused light.",
    # NOTE(review): this key contains a "/"; callers indexing by key must use
    # the exact string "sunset/sunrise".
    "sunset/sunrise": "A photo taken during sunset or sunrise with warm golden lighting and long shadows.",
    "night": "A photo taken at night with minimal natural light and artificial illumination.",
    "indoor_bright": "An indoor photo with bright, even artificial lighting throughout the space.",
    "indoor_moderate": "An indoor photo with moderate lighting creating a balanced indoor atmosphere.",
    "indoor_dim": "An indoor photo with low lighting levels creating a subdued environment.",
    "neon_night": "A night scene with colorful neon lighting creating vibrant illumination patterns.",
    "indoor_commercial": "An indoor retail environment with directed display lighting highlighting products.",
    "indoor_restaurant": "An indoor dining space with ambient mood lighting for atmosphere.",
    "stadium_lighting": "A sports venue with powerful floodlights creating intense, even illumination.",
    "mixed_lighting": "A scene with combined natural and artificial light sources creating transition zones.",
    "beach_daylight": "A photo taken at a beach with bright natural sunlight and reflections from water.",
    "sports_arena_lighting": "A photo of a sports venue illuminated by powerful overhead lighting systems.",
    "kitchen_task_lighting": "A photo of a kitchen with focused lighting concentrated on work surfaces."
}
175
+
176
# Specialized prompts for newer scene types: several alternative phrasings per
# scene so the best-matching description (and its score) can be reported.
SPECIALIZED_SCENE_PROMPTS = {
    "beach_water_recreation": [
        "A coastal beach scene with people surfing and sunbathing on sandy shores.",
        "Active water sports participants at a beach with surfboards and swimming areas.",
        "A sunny beach destination with recreational water equipment and beachgoers.",
        "A shoreline recreation area with surf gear and coastal activities.",
        "An oceanfront scene with people engaging in water sports and beach leisure.",
        "A popular beach spot with swimming areas and surfing zones.",
        "A coastal recreation setting with beach umbrellas and water activities."
    ],
    "sports_venue": [
        "An indoor sports arena with professional equipment and competition spaces.",
        "A sports stadium with marked playing areas and spectator seating arrangement.",
        "A specialized athletic venue with competition equipment and performance areas.",
        "A professional sports facility with game-related apparatus and audience zones.",
        "An organized sports center with competitive play areas and athletic equipment.",
        "A competition venue with sport-specific markings and professional setup.",
        "A formal athletic facility with standardized equipment and playing surfaces."
    ],
    "professional_kitchen": [
        "A commercial restaurant kitchen with multiple cooking stations and food prep areas.",
        "A professional culinary workspace with industrial appliances and chef activity.",
        "A busy restaurant back-of-house with stainless steel equipment and meal preparation.",
        "A commercial food service kitchen with chef workstations and specialized zones.",
        "An industrial kitchen facility with specialized cooking equipment and prep surfaces.",
        "A high-volume food production kitchen with professional-grade appliances.",
        "A restaurant kitchen with distinct cooking areas and culinary workflow design."
    ],
    "urban_intersection": [
        "A city intersection with crosswalks and traffic signals controlling movement.",
        "A busy urban crossroad with pedestrian crossings and vehicle traffic.",
        "A regulated street intersection with crosswalk markings and waiting pedestrians.",
        "A metropolitan junction with traffic lights and pedestrian crossing zones.",
        "A city street crossing with safety features for pedestrians and traffic flow.",
        "A controlled urban intersection with movement patterns for vehicles and people.",
        "A city center crossroad with traffic management features and pedestrian areas."
    ],
    "financial_district": [
        "A downtown business area with tall office buildings and commercial activity.",
        "An urban financial center with skyscrapers and professional environment.",
        "A city's business district with corporate headquarters and office towers.",
        "A metropolitan financial zone with high-rise buildings and business traffic.",
        "A corporate district in a city center with professional architecture.",
        "An urban area dominated by office buildings and business establishments.",
        "A city's economic center with banking institutions and corporate offices."
    ],
    "aerial_view_intersection": [
        "A bird's-eye view of a city intersection showing crossing patterns from above.",
        "An overhead perspective of an urban crossroad showing traffic organization.",
        "A top-down view of a street intersection revealing pedestrian crosswalks.",
        "An aerial shot of a city junction showing the layout of roads and crossings.",
        "A high-angle view of an intersection showing traffic and pedestrian flow patterns.",
        "A drone perspective of urban crossing design viewed from directly above.",
        "A vertical view of a street intersection showing crossing infrastructure."
    ]
}
233
+
234
# Viewpoint prompts: one prompt per camera-perspective category, used to
# classify the shooting angle of an image.
VIEWPOINT_PROMPTS = {
    "eye_level": "A photo taken from normal human eye level showing a direct frontal perspective.",
    "aerial": "A photo taken from high above looking directly down at the scene below.",
    "elevated": "A photo taken from a higher than normal position looking down at an angle.",
    "low_angle": "A photo taken from a low position looking upward at the scene.",
    "bird_eye": "A photo taken from very high above showing a complete overhead perspective.",
    "street_level": "A photo taken from the perspective of someone standing on the street.",
    "interior": "A photo taken from inside a building showing the internal environment.",
    "vehicular": "A photo taken from inside or mounted on a moving vehicle."
}
244
+
245
# Object-combination prompts: one prompt per characteristic grouping of
# objects, used to recognize functional arrangements within a scene.
OBJECT_COMBINATION_PROMPTS = {
    "dining_setting": "A scene with tables, chairs, plates, and eating utensils arranged for meals.",
    "office_setup": "A scene with desks, chairs, computers, and office supplies for work.",
    "living_space": "A scene with sofas, coffee tables, TVs, and comfortable seating arrangements.",
    "transportation_hub": "A scene with vehicles, waiting areas, passengers, and transit information.",
    "retail_environment": "A scene with merchandise displays, shoppers, and store fixtures.",
    "crosswalk_scene": "A scene with street markings, pedestrians crossing, and traffic signals.",
    "cooking_area": "A scene with stoves, prep surfaces, cooking utensils, and food items.",
    "recreational_space": "A scene with sports equipment, play areas, and activity participants."
}
255
+
256
# Activity prompts: one prompt per human-activity category, used to infer
# what people in the scene are doing.
ACTIVITY_PROMPTS = {
    "shopping": "People looking at merchandise, carrying shopping bags, and browsing stores.",
    "dining": "People eating food, sitting at tables, and using dining utensils.",
    "commuting": "People waiting for transportation, boarding vehicles, and traveling.",
    "working": "People using computers, attending meetings, and engaged in professional tasks.",
    "exercising": "People engaged in physical activities, using sports equipment, and training.",
    "cooking": "People preparing food, using kitchen equipment, and creating meals.",
    "crossing_street": "People walking across designated crosswalks and navigating intersections.",
    "recreational_activity": "People engaged in leisure activities, games, and social recreation."
}
color_mapper.py CHANGED
@@ -6,7 +6,7 @@ class ColorMapper:
6
  A class for consistent color mapping of object detection classes
7
  Provides color schemes for visualization in both RGB and hex formats
8
  """
9
-
10
  # Class categories for better organization
11
  CATEGORIES = {
12
  "person": [0],
@@ -21,8 +21,9 @@ class ColorMapper:
21
  "electronics": [62, 63, 64, 65, 66, 67, 68, 69, 70],
22
  "household": [71, 72, 73, 74, 75, 76, 77, 78, 79]
23
  }
24
-
25
  # Base colors for each category (in HSV for easier variation)
 
26
  CATEGORY_COLORS = {
27
  "person": (0, 0.8, 0.9), # Red
28
  "vehicles": (210, 0.8, 0.9), # Blue
@@ -36,43 +37,43 @@ class ColorMapper:
36
  "electronics": (240, 0.6, 0.9), # Light Blue
37
  "household": (60, 0.6, 0.9) # Yellow
38
  }
39
-
40
    def __init__(self):
        """Initialize the ColorMapper with COCO class mappings."""
        # id -> class-name lookup for the 80 standard COCO categories.
        self.class_names = self._get_coco_classes()
        # Precomputed per-class and per-category colors (RGB and hex).
        self.color_map = self._generate_color_map()
44
-
45
  def _get_coco_classes(self) -> Dict[int, str]:
46
  """Get the standard COCO class names with their IDs"""
47
  return {
48
  0: 'person', 1: 'bicycle', 2: 'car', 3: 'motorcycle', 4: 'airplane',
49
  5: 'bus', 6: 'train', 7: 'truck', 8: 'boat', 9: 'traffic light',
50
- 10: 'fire hydrant', 11: 'stop sign', 12: 'parking meter', 13: 'bench',
51
  14: 'bird', 15: 'cat', 16: 'dog', 17: 'horse', 18: 'sheep', 19: 'cow',
52
  20: 'elephant', 21: 'bear', 22: 'zebra', 23: 'giraffe', 24: 'backpack',
53
  25: 'umbrella', 26: 'handbag', 27: 'tie', 28: 'suitcase', 29: 'frisbee',
54
  30: 'skis', 31: 'snowboard', 32: 'sports ball', 33: 'kite', 34: 'baseball bat',
55
- 35: 'baseball glove', 36: 'skateboard', 37: 'surfboard', 38: 'tennis racket',
56
  39: 'bottle', 40: 'wine glass', 41: 'cup', 42: 'fork', 43: 'knife',
57
- 44: 'spoon', 45: 'bowl', 46: 'banana', 47: 'apple', 48: 'sandwich',
58
  49: 'orange', 50: 'broccoli', 51: 'carrot', 52: 'hot dog', 53: 'pizza',
59
- 54: 'donut', 55: 'cake', 56: 'chair', 57: 'couch', 58: 'potted plant',
60
  59: 'bed', 60: 'dining table', 61: 'toilet', 62: 'tv', 63: 'laptop',
61
- 64: 'mouse', 65: 'remote', 66: 'keyboard', 67: 'cell phone', 68: 'microwave',
62
  69: 'oven', 70: 'toaster', 71: 'sink', 72: 'refrigerator', 73: 'book',
63
- 74: 'clock', 75: 'vase', 76: 'scissors', 77: 'teddy bear', 78: 'hair drier',
64
  79: 'toothbrush'
65
  }
66
-
67
  def _hsv_to_rgb(self, h: float, s: float, v: float) -> Tuple[int, int, int]:
68
  """
69
  Convert HSV color to RGB
70
-
71
  Args:
72
  h: Hue (0-360)
73
  s: Saturation (0-1)
74
  v: Value (0-1)
75
-
76
  Returns:
77
  Tuple of (R, G, B) values (0-255)
78
  """
@@ -82,7 +83,7 @@ class ColorMapper:
82
  p = v * (1 - s)
83
  q = v * (1 - s * f)
84
  t = v * (1 - s * (1 - f))
85
-
86
  if i == 0:
87
  r, g, b = v, t, p
88
  elif i == 1:
@@ -95,28 +96,28 @@ class ColorMapper:
95
  r, g, b = t, p, v
96
  else:
97
  r, g, b = v, p, q
98
-
99
  return (int(r * 255), int(g * 255), int(b * 255))
100
-
101
  def _rgb_to_hex(self, rgb: Tuple[int, int, int]) -> str:
102
  """
103
  Convert RGB color to hex color code
104
-
105
  Args:
106
  rgb: Tuple of (R, G, B) values (0-255)
107
-
108
  Returns:
109
  Hex color code (e.g. '#FF0000')
110
  """
111
  return f'#{rgb[0]:02x}{rgb[1]:02x}{rgb[2]:02x}'
112
-
113
  def _find_category(self, class_id: int) -> str:
114
  """
115
  Find the category for a given class ID
116
-
117
  Args:
118
  class_id: Class ID (0-79)
119
-
120
  Returns:
121
  Category name
122
  """
@@ -124,11 +125,11 @@ class ColorMapper:
124
  if class_id in ids:
125
  return category
126
  return "other" # Fallback
127
-
128
  def _generate_color_map(self) -> Dict:
129
  """
130
  Generate a color map for all 80 COCO classes
131
-
132
  Returns:
133
  Dictionary mapping class IDs and names to color values
134
  """
@@ -137,7 +138,7 @@ class ColorMapper:
137
  'by_name': {}, # Map class name to RGB and hex
138
  'categories': {} # Map category to base color
139
  }
140
-
141
  # Generate colors for categories
142
  for category, hsv in self.CATEGORY_COLORS.items():
143
  rgb = self._hsv_to_rgb(hsv[0], hsv[1], hsv[2])
@@ -146,54 +147,54 @@ class ColorMapper:
146
  'rgb': rgb,
147
  'hex': hex_color
148
  }
149
-
150
  # Generate variations for each class within a category
151
  for class_id, class_name in self.class_names.items():
152
  category = self._find_category(class_id)
153
  base_hsv = self.CATEGORY_COLORS.get(category, (0, 0, 0.8)) # Default gray
154
-
155
  # Slightly vary the hue and saturation within the category
156
  ids_in_category = self.CATEGORIES.get(category, [])
157
  if ids_in_category:
158
  position = ids_in_category.index(class_id) if class_id in ids_in_category else 0
159
  variation = position / max(1, len(ids_in_category) - 1) # 0 to 1
160
-
161
  # Vary hue slightly (±15°) and saturation
162
  h_offset = 30 * variation - 15 # -15 to +15
163
  s_offset = 0.2 * variation # 0 to 0.2
164
-
165
  h = (base_hsv[0] + h_offset) % 360
166
  s = min(1.0, base_hsv[1] + s_offset)
167
  v = base_hsv[2]
168
  else:
169
  h, s, v = base_hsv
170
-
171
  rgb = self._hsv_to_rgb(h, s, v)
172
  hex_color = self._rgb_to_hex(rgb)
173
-
174
  # Store in both mappings
175
  color_map['by_id'][class_id] = {
176
  'rgb': rgb,
177
  'hex': hex_color,
178
  'category': category
179
  }
180
-
181
  color_map['by_name'][class_name] = {
182
  'rgb': rgb,
183
  'hex': hex_color,
184
  'category': category
185
  }
186
-
187
  return color_map
188
-
189
  def get_color(self, class_identifier: Union[int, str], format: str = 'hex') -> Any:
190
  """
191
  Get color for a specific class
192
-
193
  Args:
194
  class_identifier: Class ID (int) or name (str)
195
  format: Color format ('hex', 'rgb', or 'bgr')
196
-
197
  Returns:
198
  Color in requested format
199
  """
@@ -202,11 +203,11 @@ class ColorMapper:
202
  color_info = self.color_map['by_id'].get(class_identifier)
203
  else:
204
  color_info = self.color_map['by_name'].get(class_identifier)
205
-
206
  if not color_info:
207
  # Fallback color if not found
208
  return '#CCCCCC' if format == 'hex' else (204, 204, 204)
209
-
210
  if format == 'hex':
211
  return color_info['hex']
212
  elif format == 'rgb':
@@ -217,14 +218,14 @@ class ColorMapper:
217
  return (b, g, r)
218
  else:
219
  return color_info['rgb']
220
-
221
  def get_all_colors(self, format: str = 'hex') -> Dict:
222
  """
223
  Get all colors in the specified format
224
-
225
  Args:
226
  format: Color format ('hex', 'rgb', or 'bgr')
227
-
228
  Returns:
229
  Dictionary mapping class names to colors
230
  """
@@ -232,14 +233,14 @@ class ColorMapper:
232
  for class_id, class_name in self.class_names.items():
233
  result[class_name] = self.get_color(class_id, format)
234
  return result
235
-
236
  def get_category_colors(self, format: str = 'hex') -> Dict:
237
  """
238
  Get base colors for each category
239
-
240
  Args:
241
  format: Color format ('hex', 'rgb', or 'bgr')
242
-
243
  Returns:
244
  Dictionary mapping categories to colors
245
  """
@@ -253,14 +254,14 @@ class ColorMapper:
253
  else:
254
  result[category] = color_info['rgb']
255
  return result
256
-
257
  def get_category_for_class(self, class_identifier: Union[int, str]) -> str:
258
  """
259
  Get the category for a specific class
260
-
261
  Args:
262
  class_identifier: Class ID (int) or name (str)
263
-
264
  Returns:
265
  Category name
266
  """
 
6
  A class for consistent color mapping of object detection classes
7
  Provides color schemes for visualization in both RGB and hex formats
8
  """
9
+
10
  # Class categories for better organization
11
  CATEGORIES = {
12
  "person": [0],
 
21
  "electronics": [62, 63, 64, 65, 66, 67, 68, 69, 70],
22
  "household": [71, 72, 73, 74, 75, 76, 77, 78, 79]
23
  }
24
+
25
  # Base colors for each category (in HSV for easier variation)
26
+ # HSV: Hue, Saturation, Value
27
  CATEGORY_COLORS = {
28
  "person": (0, 0.8, 0.9), # Red
29
  "vehicles": (210, 0.8, 0.9), # Blue
 
37
  "electronics": (240, 0.6, 0.9), # Light Blue
38
  "household": (60, 0.6, 0.9) # Yellow
39
  }
40
+
41
  def __init__(self):
42
  """Initialize the ColorMapper with COCO class mappings"""
43
  self.class_names = self._get_coco_classes()
44
  self.color_map = self._generate_color_map()
45
+
46
  def _get_coco_classes(self) -> Dict[int, str]:
47
  """Get the standard COCO class names with their IDs"""
48
  return {
49
  0: 'person', 1: 'bicycle', 2: 'car', 3: 'motorcycle', 4: 'airplane',
50
  5: 'bus', 6: 'train', 7: 'truck', 8: 'boat', 9: 'traffic light',
51
+ 10: 'fire hydrant', 11: 'stop sign', 12: 'parking meter', 13: 'bench',
52
  14: 'bird', 15: 'cat', 16: 'dog', 17: 'horse', 18: 'sheep', 19: 'cow',
53
  20: 'elephant', 21: 'bear', 22: 'zebra', 23: 'giraffe', 24: 'backpack',
54
  25: 'umbrella', 26: 'handbag', 27: 'tie', 28: 'suitcase', 29: 'frisbee',
55
  30: 'skis', 31: 'snowboard', 32: 'sports ball', 33: 'kite', 34: 'baseball bat',
56
+ 35: 'baseball glove', 36: 'skateboard', 37: 'surfboard', 38: 'tennis racket',
57
  39: 'bottle', 40: 'wine glass', 41: 'cup', 42: 'fork', 43: 'knife',
58
+ 44: 'spoon', 45: 'bowl', 46: 'banana', 47: 'apple', 48: 'sandwich',
59
  49: 'orange', 50: 'broccoli', 51: 'carrot', 52: 'hot dog', 53: 'pizza',
60
+ 54: 'donut', 55: 'cake', 56: 'chair', 57: 'couch', 58: 'potted plant',
61
  59: 'bed', 60: 'dining table', 61: 'toilet', 62: 'tv', 63: 'laptop',
62
+ 64: 'mouse', 65: 'remote', 66: 'keyboard', 67: 'cell phone', 68: 'microwave',
63
  69: 'oven', 70: 'toaster', 71: 'sink', 72: 'refrigerator', 73: 'book',
64
+ 74: 'clock', 75: 'vase', 76: 'scissors', 77: 'teddy bear', 78: 'hair drier',
65
  79: 'toothbrush'
66
  }
67
+
68
  def _hsv_to_rgb(self, h: float, s: float, v: float) -> Tuple[int, int, int]:
69
  """
70
  Convert HSV color to RGB
71
+
72
  Args:
73
  h: Hue (0-360)
74
  s: Saturation (0-1)
75
  v: Value (0-1)
76
+
77
  Returns:
78
  Tuple of (R, G, B) values (0-255)
79
  """
 
83
  p = v * (1 - s)
84
  q = v * (1 - s * f)
85
  t = v * (1 - s * (1 - f))
86
+
87
  if i == 0:
88
  r, g, b = v, t, p
89
  elif i == 1:
 
96
  r, g, b = t, p, v
97
  else:
98
  r, g, b = v, p, q
99
+
100
  return (int(r * 255), int(g * 255), int(b * 255))
101
+
102
  def _rgb_to_hex(self, rgb: Tuple[int, int, int]) -> str:
103
  """
104
  Convert RGB color to hex color code
105
+
106
  Args:
107
  rgb: Tuple of (R, G, B) values (0-255)
108
+
109
  Returns:
110
  Hex color code (e.g. '#FF0000')
111
  """
112
  return f'#{rgb[0]:02x}{rgb[1]:02x}{rgb[2]:02x}'
113
+
114
  def _find_category(self, class_id: int) -> str:
115
  """
116
  Find the category for a given class ID
117
+
118
  Args:
119
  class_id: Class ID (0-79)
120
+
121
  Returns:
122
  Category name
123
  """
 
125
  if class_id in ids:
126
  return category
127
  return "other" # Fallback
128
+
129
  def _generate_color_map(self) -> Dict:
130
  """
131
  Generate a color map for all 80 COCO classes
132
+
133
  Returns:
134
  Dictionary mapping class IDs and names to color values
135
  """
 
138
  'by_name': {}, # Map class name to RGB and hex
139
  'categories': {} # Map category to base color
140
  }
141
+
142
  # Generate colors for categories
143
  for category, hsv in self.CATEGORY_COLORS.items():
144
  rgb = self._hsv_to_rgb(hsv[0], hsv[1], hsv[2])
 
147
  'rgb': rgb,
148
  'hex': hex_color
149
  }
150
+
151
  # Generate variations for each class within a category
152
  for class_id, class_name in self.class_names.items():
153
  category = self._find_category(class_id)
154
  base_hsv = self.CATEGORY_COLORS.get(category, (0, 0, 0.8)) # Default gray
155
+
156
  # Slightly vary the hue and saturation within the category
157
  ids_in_category = self.CATEGORIES.get(category, [])
158
  if ids_in_category:
159
  position = ids_in_category.index(class_id) if class_id in ids_in_category else 0
160
  variation = position / max(1, len(ids_in_category) - 1) # 0 to 1
161
+
162
  # Vary hue slightly (±15°) and saturation
163
  h_offset = 30 * variation - 15 # -15 to +15
164
  s_offset = 0.2 * variation # 0 to 0.2
165
+
166
  h = (base_hsv[0] + h_offset) % 360
167
  s = min(1.0, base_hsv[1] + s_offset)
168
  v = base_hsv[2]
169
  else:
170
  h, s, v = base_hsv
171
+
172
  rgb = self._hsv_to_rgb(h, s, v)
173
  hex_color = self._rgb_to_hex(rgb)
174
+
175
  # Store in both mappings
176
  color_map['by_id'][class_id] = {
177
  'rgb': rgb,
178
  'hex': hex_color,
179
  'category': category
180
  }
181
+
182
  color_map['by_name'][class_name] = {
183
  'rgb': rgb,
184
  'hex': hex_color,
185
  'category': category
186
  }
187
+
188
  return color_map
189
+
190
  def get_color(self, class_identifier: Union[int, str], format: str = 'hex') -> Any:
191
  """
192
  Get color for a specific class
193
+
194
  Args:
195
  class_identifier: Class ID (int) or name (str)
196
  format: Color format ('hex', 'rgb', or 'bgr')
197
+
198
  Returns:
199
  Color in requested format
200
  """
 
203
  color_info = self.color_map['by_id'].get(class_identifier)
204
  else:
205
  color_info = self.color_map['by_name'].get(class_identifier)
206
+
207
  if not color_info:
208
  # Fallback color if not found
209
  return '#CCCCCC' if format == 'hex' else (204, 204, 204)
210
+
211
  if format == 'hex':
212
  return color_info['hex']
213
  elif format == 'rgb':
 
218
  return (b, g, r)
219
  else:
220
  return color_info['rgb']
221
+
222
  def get_all_colors(self, format: str = 'hex') -> Dict:
223
  """
224
  Get all colors in the specified format
225
+
226
  Args:
227
  format: Color format ('hex', 'rgb', or 'bgr')
228
+
229
  Returns:
230
  Dictionary mapping class names to colors
231
  """
 
233
  for class_id, class_name in self.class_names.items():
234
  result[class_name] = self.get_color(class_id, format)
235
  return result
236
+
237
  def get_category_colors(self, format: str = 'hex') -> Dict:
238
  """
239
  Get base colors for each category
240
+
241
  Args:
242
  format: Color format ('hex', 'rgb', or 'bgr')
243
+
244
  Returns:
245
  Dictionary mapping categories to colors
246
  """
 
254
  else:
255
  result[category] = color_info['rgb']
256
  return result
257
+
258
  def get_category_for_class(self, class_identifier: Union[int, str]) -> str:
259
  """
260
  Get the category for a specific class
261
+
262
  Args:
263
  class_identifier: Class ID (int) or name (str)
264
+
265
  Returns:
266
  Category name
267
  """
confifence_templates.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+
2
+ CONFIDENCE_TEMPLATES = {
3
+ "high": "{description} {details}",
4
+ "medium": "This appears to be {description} {details}",
5
+ "low": "This might be {description}, but the confidence is low. {details}"
6
+ }
cultural_templates.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ CULTURAL_TEMPLATES = {
3
+ "asian": {
4
+ "elements": ["character signage", "lanterns", "dense urban layout"],
5
+ "description": "The scene shows distinctive Asian cultural elements such as {elements}."
6
+ },
7
+ "european": {
8
+ "elements": ["classical architecture", "cobblestone streets", "café terraces"],
9
+ "description": "The environment has European characteristics including {elements}."
10
+ },
11
+ "middle_eastern": {
12
+ "elements": ["ornate archways", "geometric patterns", "domed structures"],
13
+ "description": "The scene contains Middle Eastern architectural features such as {elements}."
14
+ },
15
+ "north_american": {
16
+ "elements": ["grid street pattern", "modern skyscrapers", "wide boulevards"],
17
+ "description": "The layout shows typical North American urban design with {elements}."
18
+ }
19
+ }
detection_model.py CHANGED
@@ -6,7 +6,7 @@ import os
6
 
7
  class DetectionModel:
8
  """Core detection model class for object detection using YOLOv8"""
9
-
10
  # Model information dictionary
11
  MODEL_INFO = {
12
  "yolov8n.pt": {
@@ -28,11 +28,11 @@ class DetectionModel:
28
  "inference_speed": "Slower"
29
  }
30
  }
31
-
32
- def __init__(self, model_name: str = 'yolov8m.pt', confidence: float = 0.25, iou: float = 0.45):
33
  """
34
  Initialize the detection model
35
-
36
  Args:
37
  model_name: Model name or path, default is yolov8m.pt
38
  confidence: Confidence threshold, default is 0.25
@@ -44,10 +44,10 @@ class DetectionModel:
44
  self.model = None
45
  self.class_names = {}
46
  self.is_model_loaded = False
47
-
48
  # Load model on initialization
49
  self._load_model()
50
-
51
  def _load_model(self):
52
  """Load the YOLO model"""
53
  try:
@@ -60,57 +60,57 @@ class DetectionModel:
60
  except Exception as e:
61
  print(f"Error occurred when loading the model: {e}")
62
  self.is_model_loaded = False
63
-
64
  def change_model(self, new_model_name: str) -> bool:
65
  """
66
  Change the currently loaded model
67
-
68
  Args:
69
  new_model_name: Name of the new model to load
70
-
71
  Returns:
72
  bool: True if model changed successfully, False otherwise
73
  """
74
  if self.model_name == new_model_name and self.is_model_loaded:
75
  print(f"Model {new_model_name} is already loaded")
76
  return True
77
-
78
  print(f"Changing model from {self.model_name} to {new_model_name}")
79
-
80
  # Unload current model to free memory
81
  if self.model is not None:
82
  del self.model
83
  self.model = None
84
-
85
  # Clean GPU memory if available
86
  if torch.cuda.is_available():
87
  torch.cuda.empty_cache()
88
-
89
  # Update model name and load new model
90
  self.model_name = new_model_name
91
  self._load_model()
92
-
93
  return self.is_model_loaded
94
-
95
  def reload_model(self):
96
  """Reload the model (useful for changing model or after error)"""
97
  if self.model is not None:
98
  del self.model
99
  self.model = None
100
-
101
  # Clean GPU memory if available
102
  if torch.cuda.is_available():
103
  torch.cuda.empty_cache()
104
-
105
  self._load_model()
106
-
107
  def detect(self, image_input: Any) -> Optional[Any]:
108
  """
109
  Perform object detection on a single image
110
-
111
  Args:
112
  image_input: Image path (str), PIL Image, or numpy array
113
-
114
  Returns:
115
  Detection result object or None if error occurred
116
  """
@@ -120,27 +120,27 @@ class DetectionModel:
120
  if self.model is None or not self.is_model_loaded:
121
  print("Failed to load model. Cannot perform detection.")
122
  return None
123
-
124
  try:
125
  results = self.model(image_input, conf=self.confidence, iou=self.iou)
126
  return results[0]
127
  except Exception as e:
128
  print(f"Error occurred during detection: {e}")
129
  return None
130
-
131
  def get_class_names(self, class_id: int) -> str:
132
  """Get class name for a given class ID"""
133
  return self.class_names.get(class_id, "Unknown Class")
134
-
135
  def get_supported_classes(self) -> Dict[int, str]:
136
  """Get all supported classes as a dictionary of {id: class_name}"""
137
  return self.class_names
138
-
139
  @classmethod
140
  def get_available_models(cls) -> List[Dict]:
141
  """
142
  Get list of available models with their information
143
-
144
  Returns:
145
  List of dictionaries containing model information
146
  """
@@ -154,7 +154,7 @@ class DetectionModel:
154
  "inference_speed": info["inference_speed"]
155
  })
156
  return models
157
-
158
  @classmethod
159
  def get_model_description(cls, model_name: str) -> str:
160
  """Get description for a specific model"""
 
6
 
7
  class DetectionModel:
8
  """Core detection model class for object detection using YOLOv8"""
9
+
10
  # Model information dictionary
11
  MODEL_INFO = {
12
  "yolov8n.pt": {
 
28
  "inference_speed": "Slower"
29
  }
30
  }
31
+
32
+ def __init__(self, model_name: str = 'yolov8m.pt', confidence: float = 0.25, iou: float = 0.25):
33
  """
34
  Initialize the detection model
35
+
36
  Args:
37
  model_name: Model name or path, default is yolov8m.pt
38
  confidence: Confidence threshold, default is 0.25
 
44
  self.model = None
45
  self.class_names = {}
46
  self.is_model_loaded = False
47
+
48
  # Load model on initialization
49
  self._load_model()
50
+
51
  def _load_model(self):
52
  """Load the YOLO model"""
53
  try:
 
60
  except Exception as e:
61
  print(f"Error occurred when loading the model: {e}")
62
  self.is_model_loaded = False
63
+
64
  def change_model(self, new_model_name: str) -> bool:
65
  """
66
  Change the currently loaded model
67
+
68
  Args:
69
  new_model_name: Name of the new model to load
70
+
71
  Returns:
72
  bool: True if model changed successfully, False otherwise
73
  """
74
  if self.model_name == new_model_name and self.is_model_loaded:
75
  print(f"Model {new_model_name} is already loaded")
76
  return True
77
+
78
  print(f"Changing model from {self.model_name} to {new_model_name}")
79
+
80
  # Unload current model to free memory
81
  if self.model is not None:
82
  del self.model
83
  self.model = None
84
+
85
  # Clean GPU memory if available
86
  if torch.cuda.is_available():
87
  torch.cuda.empty_cache()
88
+
89
  # Update model name and load new model
90
  self.model_name = new_model_name
91
  self._load_model()
92
+
93
  return self.is_model_loaded
94
+
95
  def reload_model(self):
96
  """Reload the model (useful for changing model or after error)"""
97
  if self.model is not None:
98
  del self.model
99
  self.model = None
100
+
101
  # Clean GPU memory if available
102
  if torch.cuda.is_available():
103
  torch.cuda.empty_cache()
104
+
105
  self._load_model()
106
+
107
  def detect(self, image_input: Any) -> Optional[Any]:
108
  """
109
  Perform object detection on a single image
110
+
111
  Args:
112
  image_input: Image path (str), PIL Image, or numpy array
113
+
114
  Returns:
115
  Detection result object or None if error occurred
116
  """
 
120
  if self.model is None or not self.is_model_loaded:
121
  print("Failed to load model. Cannot perform detection.")
122
  return None
123
+
124
  try:
125
  results = self.model(image_input, conf=self.confidence, iou=self.iou)
126
  return results[0]
127
  except Exception as e:
128
  print(f"Error occurred during detection: {e}")
129
  return None
130
+
131
  def get_class_names(self, class_id: int) -> str:
132
  """Get class name for a given class ID"""
133
  return self.class_names.get(class_id, "Unknown Class")
134
+
135
  def get_supported_classes(self) -> Dict[int, str]:
136
  """Get all supported classes as a dictionary of {id: class_name}"""
137
  return self.class_names
138
+
139
  @classmethod
140
  def get_available_models(cls) -> List[Dict]:
141
  """
142
  Get list of available models with their information
143
+
144
  Returns:
145
  List of dictionaries containing model information
146
  """
 
154
  "inference_speed": info["inference_speed"]
155
  })
156
  return models
157
+
158
  @classmethod
159
  def get_model_description(cls, model_name: str) -> str:
160
  """Get description for a specific model"""
enhance_scene_describer.py ADDED
@@ -0,0 +1,1314 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import json
4
+ import random
5
+ import numpy as np
6
+ from typing import Dict, List, Tuple, Any, Optional
7
+
8
+ from scene_type import SCENE_TYPES
9
+ from scene_detail_templates import SCENE_DETAIL_TEMPLATES
10
+ from object_template_fillers import OBJECT_TEMPLATE_FILLERS
11
+ from lighting_conditions import LIGHTING_CONDITIONS
12
+ from viewpoint_templates import VIEWPOINT_TEMPLATES
13
+ from cultural_templates import CULTURAL_TEMPLATES
14
+ from confifence_templates import CONFIDENCE_TEMPLATES
15
+
16
+ class EnhancedSceneDescriber:
17
+ """
18
+ Enhanced scene description generator with improved template handling,
19
+ viewpoint awareness, and cultural context recognition.
20
+ Provides detailed natural language descriptions of scenes based on
21
+ detection results and scene classification.
22
+ """
23
+
24
+ def __init__(self, templates_db: Optional[Dict] = None, scene_types: Optional[Dict] = None):
25
+ """
26
+ Initialize the enhanced scene describer.
27
+
28
+ Args:
29
+ templates_db: Optional custom templates database
30
+ scene_types: Dictionary of scene type definitions
31
+ """
32
+ # Load or use provided scene types
33
+ self.scene_types = scene_types or self._load_default_scene_types()
34
+
35
+ # Load templates database
36
+ self.templates = templates_db or self._load_templates()
37
+
38
+ # Initialize viewpoint detection parameters
39
+ self._initialize_viewpoint_parameters()
40
+
41
+ def _load_default_scene_types(self) -> Dict:
42
+ """
43
+ Load default scene types.
44
+
45
+ Returns:
46
+ Dict: Scene type definitions
47
+ """
48
+
49
+ return SCENE_TYPES
50
+
51
+ def _load_templates(self) -> Dict:
52
+ """
53
+ Load description templates from imported Python modules.
54
+
55
+ Returns:
56
+ Dict: Template collections for different description components
57
+ """
58
+ templates = {}
59
+
60
+ # 直接從導入的 Python 模組中獲取模板
61
+ templates["scene_detail_templates"] = SCENE_DETAIL_TEMPLATES
62
+ templates["object_template_fillers"] = OBJECT_TEMPLATE_FILLERS
63
+ templates["viewpoint_templates"] = VIEWPOINT_TEMPLATES
64
+ templates["cultural_templates"] = CULTURAL_TEMPLATES
65
+
66
+ # 從 LIGHTING_CONDITIONS 獲取照明模板
67
+ templates["lighting_templates"] = {
68
+ key: data["general"] for key, data in LIGHTING_CONDITIONS.get("time_descriptions", {}).items()
69
+ }
70
+
71
+ # 設置默認的置信度模板
72
+ templates["confidence_templates"] = {
73
+ "high": "{description} {details}",
74
+ "medium": "This appears to be {description} {details}",
75
+ "low": "This might be {description}, but the confidence is low. {details}"
76
+ }
77
+
78
+ # 初始化其他必要的模板(現在這個函數簡化了很多)
79
+ self._initialize_default_templates(templates)
80
+
81
+ return templates
82
+
83
+ def _initialize_default_templates(self, templates: Dict):
84
+ """
85
+ 檢查模板字典並填充任何缺失的默認模板。
86
+
87
+ 在將模板移至專門的模組後,此方法主要作為安全機制,
88
+ 確保即使導入失敗或某些模板未在外部定義,系統仍能正常運行。
89
+
90
+ Args:
91
+ templates: 要檢查和更新的模板字典
92
+ """
93
+ # 檢查關鍵模板類型是否存在,如果不存在則添加默認值
94
+
95
+ # 置信度模板 - 用於控制描述的語氣
96
+ if "confidence_templates" not in templates:
97
+ templates["confidence_templates"] = {
98
+ "high": "{description} {details}",
99
+ "medium": "This appears to be {description} {details}",
100
+ "low": "This might be {description}, but the confidence is low. {details}"
101
+ }
102
+
103
+ # 場景細節模板 - 如果未從外部導入
104
+ if "scene_detail_templates" not in templates:
105
+ templates["scene_detail_templates"] = {
106
+ "default": ["A space with various objects."]
107
+ }
108
+
109
+ # 物體填充模板 - 用於生成物體描述
110
+ if "object_template_fillers" not in templates:
111
+ templates["object_template_fillers"] = {
112
+ "default": ["various items"]
113
+ }
114
+
115
+ # 視角模板 - 雖然我們現在從專門模組導入,但作為備份
116
+ if "viewpoint_templates" not in templates:
117
+ # 使用簡化版的默認視角模板
118
+ templates["viewpoint_templates"] = {
119
+ "eye_level": {
120
+ "prefix": "From eye level, ",
121
+ "observation": "the scene is viewed straight on."
122
+ },
123
+ "aerial": {
124
+ "prefix": "From above, ",
125
+ "observation": "the scene is viewed from a bird's-eye perspective."
126
+ }
127
+ }
128
+
129
+ # 文化模板
130
+ if "cultural_templates" not in templates:
131
+ templates["cultural_templates"] = {
132
+ "asian": {
133
+ "elements": ["cultural elements"],
134
+ "description": "The scene has Asian characteristics."
135
+ },
136
+ "european": {
137
+ "elements": ["architectural features"],
138
+ "description": "The scene has European characteristics."
139
+ }
140
+ }
141
+
142
+ # 照明模板 - 用於描述光照條件
143
+ if "lighting_templates" not in templates:
144
+ templates["lighting_templates"] = {
145
+ "day_clear": "The scene is captured during daylight.",
146
+ "night": "The scene is captured at night.",
147
+ "unknown": "The lighting conditions are not easily determined."
148
+ }
149
+
150
+ def _initialize_viewpoint_parameters(self):
151
+ """
152
+ Initialize parameters used for viewpoint detection.
153
+ """
154
+ self.viewpoint_params = {
155
+ # Parameters for detecting aerial views
156
+ "aerial_threshold": 0.7, # High object density viewed from top
157
+ "aerial_size_variance_threshold": 0.15, # Low size variance in aerial views
158
+
159
+ # Parameters for detecting low angle views
160
+ "low_angle_threshold": 0.3, # Bottom-heavy object distribution
161
+ "vertical_size_ratio_threshold": 1.8, # Vertical objects appear taller
162
+
163
+ # Parameters for detecting elevated views
164
+ "elevated_threshold": 0.6, # Objects mostly in middle/bottom
165
+ "elevated_top_threshold": 0.3 # Few objects at top of frame
166
+ }
167
+
168
+
169
+ def generate_description(self,
170
+ scene_type: str,
171
+ detected_objects: List[Dict],
172
+ confidence: float,
173
+ lighting_info: Optional[Dict] = None,
174
+ functional_zones: Optional[Dict] = None) -> str:
175
+ """
176
+ Generate enhanced scene description based on detection results, scene type,
177
+ and additional contextual information.
178
+
179
+ This is the main entry point that replaces the original _generate_scene_description.
180
+
181
+ Args:
182
+ scene_type: Identified scene type
183
+ detected_objects: List of detected objects
184
+ confidence: Scene classification confidence
185
+ lighting_info: Optional lighting condition information
186
+ functional_zones: Optional identified functional zones
187
+
188
+ Returns:
189
+ str: Natural language description of the scene
190
+ """
191
+ # Handle unknown scene type or very low confidence
192
+ if scene_type == "unknown" or confidence < 0.4:
193
+ return self._generate_generic_description(detected_objects, lighting_info)
194
+
195
+ # Detect viewpoint
196
+ viewpoint = self._detect_viewpoint(detected_objects)
197
+
198
+ if viewpoint == "aerial":
199
+ # 如果是十字路口相關的場景,確保使用正確的空中視角十字路口場景類型
200
+ if "intersection" in scene_type or self._is_intersection(detected_objects):
201
+ scene_type = "aerial_view_intersection"
202
+ # 如果是商業區相關的場景
203
+ elif any(keyword in scene_type for keyword in ["commercial", "shopping", "retail"]):
204
+ scene_type = "aerial_view_commercial_area"
205
+ # 如果是廣場相關的場景
206
+ elif any(keyword in scene_type for keyword in ["plaza", "square"]):
207
+ scene_type = "aerial_view_plaza"
208
+ # 其他空中視角場景,預設使用十字路口
209
+ else:
210
+ scene_type = "aerial_view_intersection"
211
+
212
+ # Detect cultural context - 只有在非空中視角時才檢測文化上下文
213
+ cultural_context = None
214
+ if viewpoint != "aerial":
215
+ cultural_context = self._detect_cultural_context(scene_type, detected_objects)
216
+
217
+ # Select appropriate template based on confidence
218
+ if confidence > 0.75:
219
+ confidence_level = "high"
220
+ elif confidence > 0.5:
221
+ confidence_level = "medium"
222
+ else:
223
+ confidence_level = "low"
224
+
225
+ # Get base description for the scene type
226
+ if viewpoint == "aerial":
227
+ # 空中視角時使用已設定的基本描述
228
+ if 'base_description' not in locals():
229
+ base_description = "An aerial view showing the layout and movement patterns from above"
230
+ elif scene_type in self.scene_types:
231
+ base_description = self.scene_types[scene_type].get("description", "A scene")
232
+ else:
233
+ base_description = "A scene"
234
+
235
+ # Generate detailed scene information
236
+ scene_details = self._generate_scene_details(
237
+ scene_type,
238
+ detected_objects,
239
+ lighting_info,
240
+ viewpoint
241
+ )
242
+
243
+ # 修正:根據人數改進描述
244
+ people_objs = [obj for obj in detected_objects if obj["class_id"] == 0] # 人
245
+ if people_objs:
246
+ people_count = len(people_objs)
247
+ if people_count > 5:
248
+ # 當人數很多��,用更精確的措辭
249
+ people_phrase = f"numerous people ({people_count})"
250
+ else:
251
+ people_phrase = f"{people_count} {'people' if people_count > 1 else 'person'}"
252
+
253
+ # 將人數信息加入到場景詳情中
254
+ if "people" not in scene_details.lower() and "pedestrian" not in scene_details.lower():
255
+ scene_details += f" The scene includes {people_phrase}."
256
+
257
+ # Apply cultural context if detected (只在非空中視角時應用)
258
+ if cultural_context and scene_details and viewpoint != "aerial":
259
+ cultural_elements = self._generate_cultural_elements(cultural_context)
260
+ if cultural_elements:
261
+ scene_details += f" {cultural_elements}"
262
+
263
+ # Include lighting information if available
264
+ lighting_description = ""
265
+ if lighting_info and "time_of_day" in lighting_info:
266
+ lighting_type = lighting_info["time_of_day"]
267
+ if lighting_type in self.templates.get("lighting_templates", {}):
268
+ lighting_description = self.templates["lighting_templates"][lighting_type]
269
+
270
+ # Apply confidence template
271
+ description_template = self.templates["confidence_templates"].get(
272
+ confidence_level, "{description} {details}"
273
+ )
274
+
275
+ # Fill the template
276
+ description = description_template.format(
277
+ description=base_description,
278
+ details=scene_details
279
+ )
280
+
281
+ # Add viewpoint observation if viewpoint is not standard
282
+ if viewpoint != "eye_level" and viewpoint in self.templates.get("viewpoint_templates", {}):
283
+ viewpoint_template = self.templates["viewpoint_templates"][viewpoint]
284
+
285
+ # 在空中視角時,確保觀察描述反映更多細節
286
+ if viewpoint == "aerial":
287
+ scene_elements = "the crossing patterns and pedestrian movement"
288
+ else:
289
+ scene_elements = "objects and layout"
290
+
291
+ viewpoint_desc = viewpoint_template.get("observation", "").format(
292
+ scene_elements=scene_elements
293
+ )
294
+
295
+ # Add viewpoint prefix if needed
296
+ if not description.startswith(viewpoint_template.get("prefix", "")):
297
+ description = f"{viewpoint_template.get('prefix', '')}{description}"
298
+
299
+ # Add viewpoint observation if not already included
300
+ if viewpoint_desc not in description:
301
+ description += f" {viewpoint_desc}"
302
+
303
+ # Add lighting description if available
304
+ if lighting_description and lighting_description not in description:
305
+ description += f" {lighting_description}"
306
+
307
+ # Add information about functional zones if available
308
+ if functional_zones and len(functional_zones) > 0:
309
+ zones_desc = self._describe_functional_zones(functional_zones)
310
+ if zones_desc:
311
+ description += f" {zones_desc}"
312
+
313
+ # 計算真實的人數
314
+ people_count = len([obj for obj in detected_objects if obj["class_id"] == 0])
315
+
316
+ # 檢查描述中是否有人數信息的矛盾
317
+ if people_count > 5:
318
+ # 識別可能含有較小人數信息的片段
319
+ small_people_patterns = [
320
+ r"Area with \d+ people\.",
321
+ r"Area with \d+ person\.",
322
+ r"with \d+ people",
323
+ r"with \d+ person"
324
+ ]
325
+ # 對每個模式檢查並移除
326
+ filtered_description = description
327
+ for pattern in small_people_patterns:
328
+ matches = re.findall(pattern, filtered_description)
329
+ for match in matches:
330
+ # 從匹配中提取人數
331
+ number_match = re.search(r'\d+', match)
332
+ if number_match:
333
+ try:
334
+ people_mentioned = int(number_match.group())
335
+ # 如果提到的人數小於總人數,移除整個句子
336
+ if people_mentioned < people_count:
337
+ # 將描述分割成句子
338
+ sentences = re.split(r'(?<=[.!?])\s+', filtered_description)
339
+ # 移除包含匹配片段的句子
340
+ filtered_sentences = []
341
+ for sentence in sentences:
342
+ if match not in sentence:
343
+ filtered_sentences.append(sentence)
344
+ # 重新組合描述
345
+ filtered_description = " ".join(filtered_sentences)
346
+ except ValueError:
347
+ # 數字轉換失敗,繼續處理
348
+ continue
349
+
350
+ # 使用過濾後的描述
351
+ description = filtered_description
352
+
353
+ return description
354
+
355
+ def _is_intersection(self, detected_objects: List[Dict]) -> bool:
356
+ """
357
+ 通過分析物體分佈來判斷場景是否為十字路口
358
+ """
359
+ # 檢查行人分佈模式
360
+ pedestrians = [obj for obj in detected_objects if obj["class_id"] == 0]
361
+
362
+ if len(pedestrians) >= 8: # 需要足夠的行人來形成十字路口
363
+ # 抓取行人位置
364
+ positions = [obj.get("normalized_center", (0, 0)) for obj in pedestrians]
365
+
366
+ # 分析 x 和 y 坐標分佈
367
+ x_coords = [pos[0] for pos in positions]
368
+ y_coords = [pos[1] for pos in positions]
369
+
370
+ # 計算 x 和 y 坐標的變異數
371
+ x_variance = np.var(x_coords) if len(x_coords) > 1 else 0
372
+ y_variance = np.var(y_coords) if len(y_coords) > 1 else 0
373
+
374
+ # 計算範圍
375
+ x_range = max(x_coords) - min(x_coords)
376
+ y_range = max(y_coords) - min(y_coords)
377
+
378
+ # 如果 x 和 y 方向都有較大範圍且範圍相似,那就有可能是十字路口
379
+ if x_range > 0.5 and y_range > 0.5 and 0.7 < (x_range / y_range) < 1.3:
380
+ return True
381
+
382
+ return False
383
+
384
+ def _generate_generic_description(self, detected_objects: List[Dict], lighting_info: Optional[Dict] = None) -> str:
385
+ """
386
+ Generate a generic description when scene type is unknown or confidence is very low.
387
+
388
+ Args:
389
+ detected_objects: List of detected objects
390
+ lighting_info: Optional lighting condition information
391
+
392
+ Returns:
393
+ str: Generic description based on detected objects
394
+ """
395
+ # Count object occurrences
396
+ obj_counts = {}
397
+ for obj in detected_objects:
398
+ class_name = obj["class_name"]
399
+ if class_name not in obj_counts:
400
+ obj_counts[class_name] = 0
401
+ obj_counts[class_name] += 1
402
+
403
+ # Get top objects by count
404
+ top_objects = sorted(obj_counts.items(), key=lambda x: x[1], reverse=True)[:5]
405
+
406
+ if not top_objects:
407
+ base_desc = "No clearly identifiable objects are visible in this scene."
408
+ else:
409
+ # Format object list
410
+ objects_text = []
411
+ for name, count in top_objects:
412
+ if count > 1:
413
+ objects_text.append(f"{count} {name}s")
414
+ else:
415
+ objects_text.append(name)
416
+
417
+ if len(objects_text) == 1:
418
+ objects_list = objects_text[0]
419
+ elif len(objects_text) == 2:
420
+ objects_list = f"{objects_text[0]} and {objects_text[1]}"
421
+ else:
422
+ objects_list = ", ".join(objects_text[:-1]) + f", and {objects_text[-1]}"
423
+
424
+ base_desc = f"This scene contains {objects_list}."
425
+
426
+ # Add lighting information if available
427
+ if lighting_info and "time_of_day" in lighting_info:
428
+ lighting_type = lighting_info["time_of_day"]
429
+ if lighting_type in self.templates.get("lighting_templates", {}):
430
+ lighting_desc = self.templates["lighting_templates"][lighting_type]
431
+ base_desc += f" {lighting_desc}"
432
+
433
+ return base_desc
434
+
435
+ def _generate_scene_details(self,
436
+ scene_type: str,
437
+ detected_objects: List[Dict],
438
+ lighting_info: Optional[Dict] = None,
439
+ viewpoint: str = "eye_level") -> str:
440
+ """
441
+ Generate detailed description based on scene type and detected objects.
442
+
443
+ Args:
444
+ scene_type: Identified scene type
445
+ detected_objects: List of detected objects
446
+ lighting_info: Optional lighting condition information
447
+ viewpoint: Detected viewpoint (aerial, eye_level, etc.)
448
+
449
+ Returns:
450
+ str: Detailed scene description
451
+ """
452
+ # Get scene-specific templates
453
+ scene_details = ""
454
+ scene_templates = self.templates.get("scene_detail_templates", {})
455
+
456
+ # Handle specific scene types
457
+ if scene_type in scene_templates:
458
+ # Select a template appropriate for the viewpoint if available
459
+ viewpoint_key = f"{scene_type}_{viewpoint}"
460
+
461
+ if viewpoint_key in scene_templates:
462
+ # We have a viewpoint-specific template
463
+ templates_list = scene_templates[viewpoint_key]
464
+ else:
465
+ # Fall back to general templates for this scene type
466
+ templates_list = scene_templates[scene_type]
467
+
468
+ # Select a random template from the list
469
+ if templates_list:
470
+ detail_template = random.choice(templates_list)
471
+
472
+ # Fill the template with object information
473
+ scene_details = self._fill_detail_template(
474
+ detail_template,
475
+ detected_objects,
476
+ scene_type
477
+ )
478
+ else:
479
+ # Use default templates if specific ones aren't available
480
+ if "default" in scene_templates:
481
+ detail_template = random.choice(scene_templates["default"])
482
+ scene_details = self._fill_detail_template(
483
+ detail_template,
484
+ detected_objects,
485
+ "default"
486
+ )
487
+ else:
488
+ # Fall back to basic description if no templates are available
489
+ scene_details = self._generate_basic_details(scene_type, detected_objects)
490
+
491
+ return scene_details
492
+
493
+ def _fill_detail_template(self, template: str, detected_objects: List[Dict], scene_type: str) -> str:
494
+ """
495
+ Fill a template with specific details based on detected objects.
496
+
497
+ Args:
498
+ template: Template string with placeholders
499
+ detected_objects: List of detected objects
500
+ scene_type: Identified scene type
501
+
502
+ Returns:
503
+ str: Filled template
504
+ """
505
+ # Find placeholders in the template using simple {placeholder} syntax
506
+ import re
507
+ placeholders = re.findall(r'\{([^}]+)\}', template)
508
+
509
+ filled_template = template
510
+
511
+ # Get object template fillers
512
+ fillers = self.templates.get("object_template_fillers", {})
513
+
514
+ # 為所有可能的變數設置默認值
515
+ default_replacements = {
516
+ # 室內相關
517
+ "furniture": "various furniture pieces",
518
+ "seating": "comfortable seating",
519
+ "electronics": "entertainment devices",
520
+ "bed_type": "a bed",
521
+ "bed_location": "room",
522
+ "bed_description": "sleeping arrangements",
523
+ "extras": "personal items",
524
+ "table_setup": "a dining table and chairs",
525
+ "table_description": "a dining surface",
526
+ "dining_items": "dining furniture and tableware",
527
+ "appliances": "kitchen appliances",
528
+ "kitchen_items": "cooking utensils and dishware",
529
+ "cooking_equipment": "cooking equipment",
530
+ "office_equipment": "work-related furniture and devices",
531
+ "desk_setup": "a desk and chair",
532
+ "computer_equipment": "electronic devices",
533
+
534
+ # 室外/城市相關
535
+ "traffic_description": "vehicles and pedestrians",
536
+ "people_and_vehicles": "people and various vehicles",
537
+ "street_elements": "urban infrastructure",
538
+ "park_features": "benches and greenery",
539
+ "outdoor_elements": "natural features",
540
+ "park_description": "outdoor amenities",
541
+ "store_elements": "merchandise displays",
542
+ "shopping_activity": "customers browse and shop",
543
+ "store_items": "products for sale",
544
+
545
+ # 高級餐廳相關
546
+ "design_elements": "elegant decor",
547
+ "lighting": "stylish lighting fixtures",
548
+
549
+ # 亞洲商業街相關
550
+ "storefront_features": "compact shops",
551
+ "pedestrian_flow": "people walking",
552
+ "asian_elements": "distinctive cultural elements",
553
+ "cultural_elements": "traditional design features",
554
+ "signage": "colorful signs",
555
+ "street_activities": "busy urban activity",
556
+
557
+ # 金融區相關
558
+ "buildings": "tall buildings",
559
+ "traffic_elements": "vehicles",
560
+ "skyscrapers": "high-rise buildings",
561
+ "road_features": "wide streets",
562
+ "architectural_elements": "modern architecture",
563
+ "city_landmarks": "prominent structures",
564
+
565
+ # 十字路口相關
566
+ "crossing_pattern": "marked pedestrian crossings",
567
+ "pedestrian_behavior": "careful walking",
568
+ "pedestrian_density": "groups of pedestrians",
569
+ "traffic_pattern": "regulated traffic flow",
570
+
571
+ # 交通樞紐相關
572
+ "transit_vehicles": "public transportation vehicles",
573
+ "passenger_activity": "commuter movement",
574
+ "transportation_modes": "various transit options",
575
+ "passenger_needs": "waiting areas",
576
+ "transit_infrastructure": "transit facilities",
577
+ "passenger_movement": "commuter flow",
578
+
579
+ # 購物區相關
580
+ "retail_elements": "shops and displays",
581
+ "store_types": "various retail establishments",
582
+ "walkway_features": "pedestrian pathways",
583
+ "commercial_signage": "store signs",
584
+ "consumer_behavior": "shopping activities",
585
+
586
+ # 空中視角相關
587
+ "commercial_layout": "organized retail areas",
588
+ "pedestrian_pattern": "people movement patterns",
589
+ "gathering_features": "public gathering spaces",
590
+ "movement_pattern": "crowd flow patterns",
591
+ "urban_elements": "city infrastructure",
592
+ "public_activity": "social interaction",
593
+
594
+ # 文化特定元素
595
+ "stall_elements": "vendor booths",
596
+ "lighting_features": "decorative lights",
597
+ "food_elements": "food offerings",
598
+ "vendor_stalls": "market stalls",
599
+ "nighttime_activity": "evening commerce",
600
+ "cultural_lighting": "traditional lighting",
601
+ "night_market_sounds": "lively market sounds",
602
+ "evening_crowd_behavior": "nighttime social activity",
603
+ "architectural_elements": "cultural buildings",
604
+ "religious_structures": "sacred buildings",
605
+ "decorative_features": "ornamental designs",
606
+ "cultural_practices": "traditional activities",
607
+ "temple_architecture": "religious structures",
608
+ "sensory_elements": "atmospheric elements",
609
+ "visitor_activities": "cultural experiences",
610
+ "ritual_activities": "ceremonial practices",
611
+ "cultural_symbols": "meaningful symbols",
612
+ "architectural_style": "historical buildings",
613
+ "historic_elements": "traditional architecture",
614
+ "urban_design": "city planning elements",
615
+ "social_behaviors": "public interactions",
616
+ "european_features": "European architectural details",
617
+ "tourist_activities": "visitor activities",
618
+ "local_customs": "regional practices",
619
+
620
+ # 時間特定元素
621
+ "lighting_effects": "artificial lighting",
622
+ "shadow_patterns": "light and shadow",
623
+ "urban_features": "city elements",
624
+ "illuminated_elements": "lit structures",
625
+ "evening_activities": "nighttime activities",
626
+ "light_sources": "lighting points",
627
+ "lit_areas": "illuminated spaces",
628
+ "shadowed_zones": "darker areas",
629
+ "illuminated_signage": "bright signs",
630
+ "colorful_lighting": "multicolored lights",
631
+ "neon_elements": "neon signs",
632
+ "night_crowd_behavior": "evening social patterns",
633
+ "light_displays": "lighting installations",
634
+ "building_features": "architectural elements",
635
+ "nightlife_activities": "evening entertainment",
636
+ "lighting_modifier": "bright",
637
+
638
+ # 混合環境元素
639
+ "transitional_elements": "connecting features",
640
+ "indoor_features": "interior elements",
641
+ "outdoor_setting": "exterior spaces",
642
+ "interior_amenities": "inside comforts",
643
+ "exterior_features": "outside elements",
644
+ "inside_elements": "interior design",
645
+ "outside_spaces": "outdoor areas",
646
+ "dual_environment_benefits": "combined settings",
647
+ "passenger_activities": "waiting behaviors",
648
+ "transportation_types": "transit vehicles",
649
+ "sheltered_elements": "covered areas",
650
+ "exposed_areas": "open sections",
651
+ "waiting_behaviors": "passenger activities",
652
+ "indoor_facilities": "inside services",
653
+ "platform_features": "transit platform elements",
654
+ "transit_routines": "transportation procedures",
655
+
656
+ # 專門場所元素
657
+ "seating_arrangement": "spectator seating",
658
+ "playing_surface": "athletic field",
659
+ "sporting_activities": "sports events",
660
+ "spectator_facilities": "viewer accommodations",
661
+ "competition_space": "sports arena",
662
+ "sports_events": "athletic competitions",
663
+ "viewing_areas": "audience sections",
664
+ "field_elements": "field markings and equipment",
665
+ "game_activities": "competitive play",
666
+ "construction_equipment": "building machinery",
667
+ "building_materials": "construction supplies",
668
+ "construction_activities": "building work",
669
+ "work_elements": "construction tools",
670
+ "structural_components": "building structures",
671
+ "site_equipment": "construction gear",
672
+ "raw_materials": "building supplies",
673
+ "construction_process": "building phases",
674
+ "medical_elements": "healthcare equipment",
675
+ "clinical_activities": "medical procedures",
676
+ "facility_design": "healthcare layout",
677
+ "healthcare_features": "medical facilities",
678
+ "patient_interactions": "care activities",
679
+ "equipment_types": "medical devices",
680
+ "care_procedures": "health services",
681
+ "treatment_spaces": "clinical areas",
682
+ "educational_furniture": "learning furniture",
683
+ "learning_activities": "educational practices",
684
+ "instructional_design": "teaching layout",
685
+ "classroom_elements": "school equipment",
686
+ "teaching_methods": "educational approaches",
687
+ "student_engagement": "learning participation",
688
+ "learning_spaces": "educational areas",
689
+ "educational_tools": "teaching resources",
690
+ "knowledge_transfer": "learning exchanges"
691
+ }
692
+
693
+ # For each placeholder, try to fill with appropriate content
694
+ for placeholder in placeholders:
695
+ if placeholder in fillers:
696
+ # Get random filler for this placeholder
697
+ options = fillers[placeholder]
698
+ if options:
699
+ # Select 1-3 items from the options list
700
+ num_items = min(len(options), random.randint(1, 3))
701
+ selected_items = random.sample(options, num_items)
702
+
703
+ # Create a formatted list
704
+ if len(selected_items) == 1:
705
+ replacement = selected_items[0]
706
+ elif len(selected_items) == 2:
707
+ replacement = f"{selected_items[0]} and {selected_items[1]}"
708
+ else:
709
+ replacement = ", ".join(selected_items[:-1]) + f", and {selected_items[-1]}"
710
+
711
+ # Replace the placeholder
712
+ filled_template = filled_template.replace(f"{{{placeholder}}}", replacement)
713
+ else:
714
+ # Try to fill with scene-specific logic
715
+ replacement = self._generate_placeholder_content(placeholder, detected_objects, scene_type)
716
+ if replacement:
717
+ filled_template = filled_template.replace(f"{{{placeholder}}}", replacement)
718
+ elif placeholder in default_replacements:
719
+ # Use default replacement if available
720
+ filled_template = filled_template.replace(f"{{{placeholder}}}", default_replacements[placeholder])
721
+ else:
722
+ # Last resort default
723
+ filled_template = filled_template.replace(f"{{{placeholder}}}", "various items")
724
+
725
+ return filled_template
726
+
727
+ def _generate_placeholder_content(self, placeholder: str, detected_objects: List[Dict], scene_type: str) -> str:
728
+ """
729
+ Generate content for a template placeholder based on scene-specific logic.
730
+
731
+ Args:
732
+ placeholder: Template placeholder
733
+ detected_objects: List of detected objects
734
+ scene_type: Identified scene type
735
+
736
+ Returns:
737
+ str: Content for the placeholder
738
+ """
739
+ # Handle different types of placeholders with custom logic
740
+ if placeholder == "furniture":
741
+ # Extract furniture items
742
+ furniture_ids = [56, 57, 58, 59, 60, 61] # Example furniture IDs
743
+ furniture_objects = [obj for obj in detected_objects if obj["class_id"] in furniture_ids]
744
+
745
+ if furniture_objects:
746
+ furniture_names = [obj["class_name"] for obj in furniture_objects[:3]]
747
+ return ", ".join(set(furniture_names))
748
+ return "various furniture items"
749
+
750
+ elif placeholder == "electronics":
751
+ # Extract electronic items
752
+ electronics_ids = [62, 63, 64, 65, 66, 67, 68, 69, 70] # Example electronics IDs
753
+ electronics_objects = [obj for obj in detected_objects if obj["class_id"] in electronics_ids]
754
+
755
+ if electronics_objects:
756
+ electronics_names = [obj["class_name"] for obj in electronics_objects[:3]]
757
+ return ", ".join(set(electronics_names))
758
+ return "electronic devices"
759
+
760
+ elif placeholder == "people_count":
761
+ # Count people
762
+ people_count = len([obj for obj in detected_objects if obj["class_id"] == 0])
763
+
764
+ if people_count == 0:
765
+ return "no people"
766
+ elif people_count == 1:
767
+ return "one person"
768
+ elif people_count < 5:
769
+ return f"{people_count} people"
770
+ else:
771
+ return "several people"
772
+
773
+ elif placeholder == "seating":
774
+ # Extract seating items
775
+ seating_ids = [56, 57] # chair, sofa
776
+ seating_objects = [obj for obj in detected_objects if obj["class_id"] in seating_ids]
777
+
778
+ if seating_objects:
779
+ seating_names = [obj["class_name"] for obj in seating_objects[:2]]
780
+ return ", ".join(set(seating_names))
781
+ return "seating arrangements"
782
+
783
+ # Default case - empty string
784
+ return ""
785
+
786
+ def _generate_basic_details(self, scene_type: str, detected_objects: List[Dict]) -> str:
787
+ """
788
+ Generate basic details when templates aren't available.
789
+
790
+ Args:
791
+ scene_type: Identified scene type
792
+ detected_objects: List of detected objects
793
+
794
+ Returns:
795
+ str: Basic scene details
796
+ """
797
+ # Handle specific scene types with custom logic
798
+ if scene_type == "living_room":
799
+ tv_objs = [obj for obj in detected_objects if obj["class_id"] == 62] # TV
800
+ sofa_objs = [obj for obj in detected_objects if obj["class_id"] == 57] # Sofa
801
+
802
+ if tv_objs and sofa_objs:
803
+ tv_region = tv_objs[0]["region"]
804
+ sofa_region = sofa_objs[0]["region"]
805
+
806
+ arrangement = f"The TV is in the {tv_region.replace('_', ' ')} of the image, "
807
+ arrangement += f"while the sofa is in the {sofa_region.replace('_', ' ')}. "
808
+
809
+ return f"{arrangement}This appears to be a space designed for relaxation and entertainment."
810
+
811
+ elif scene_type == "bedroom":
812
+ bed_objs = [obj for obj in detected_objects if obj["class_id"] == 59] # Bed
813
+
814
+ if bed_objs:
815
+ bed_region = bed_objs[0]["region"]
816
+ extra_items = []
817
+
818
+ for obj in detected_objects:
819
+ if obj["class_id"] == 74: # Clock
820
+ extra_items.append("clock")
821
+ elif obj["class_id"] == 73: # Book
822
+ extra_items.append("book")
823
+
824
+ extras = ""
825
+ if extra_items:
826
+ extras = f" There is also a {' and a '.join(extra_items)} visible."
827
+
828
+ return f"The bed is located in the {bed_region.replace('_', ' ')} of the image.{extras}"
829
+
830
+ elif scene_type in ["dining_area", "kitchen"]:
831
+ # Count food and dining-related items
832
+ food_items = []
833
+ for obj in detected_objects:
834
+ if obj["class_id"] in [39, 41, 42, 43, 44, 45]: # Kitchen items
835
+ food_items.append(obj["class_name"])
836
+
837
+ food_str = ""
838
+ if food_items:
839
+ unique_items = list(set(food_items))
840
+ if len(unique_items) <= 3:
841
+ food_str = f" with {', '.join(unique_items)}"
842
+ else:
843
+ food_str = f" with {', '.join(unique_items[:3])} and other items"
844
+
845
+ return f"{food_str}."
846
+
847
+ elif scene_type == "city_street":
848
+ # Count people and vehicles
849
+ people_count = len([obj for obj in detected_objects if obj["class_id"] == 0])
850
+ vehicle_count = len([obj for obj in detected_objects
851
+ if obj["class_id"] in [1, 2, 3, 5, 7]]) # Bicycle, car, motorbike, bus, truck
852
+
853
+ traffic_desc = ""
854
+ if people_count > 0 and vehicle_count > 0:
855
+ traffic_desc = f" with {people_count} {'people' if people_count > 1 else 'person'} and "
856
+ traffic_desc += f"{vehicle_count} {'vehicles' if vehicle_count > 1 else 'vehicle'}"
857
+ elif people_count > 0:
858
+ traffic_desc = f" with {people_count} {'people' if people_count > 1 else 'person'}"
859
+ elif vehicle_count > 0:
860
+ traffic_desc = f" with {vehicle_count} {'vehicles' if vehicle_count > 1 else 'vehicle'}"
861
+
862
+ return f"{traffic_desc}."
863
+
864
+ # Handle more specialized scenes
865
+ elif scene_type == "asian_commercial_street":
866
+ # Look for key urban elements
867
+ people_count = len([obj for obj in detected_objects if obj["class_id"] == 0])
868
+ vehicle_count = len([obj for obj in detected_objects if obj["class_id"] in [1, 2, 3]])
869
+
870
+ # Analyze pedestrian distribution
871
+ people_positions = []
872
+ for obj in detected_objects:
873
+ if obj["class_id"] == 0: # Person
874
+ people_positions.append(obj["normalized_center"])
875
+
876
+ # Check if people are distributed along a line (indicating a walking path)
877
+ structured_path = False
878
+ if len(people_positions) >= 3:
879
+ # Simplified check - see if y-coordinates are similar for multiple people
880
+ y_coords = [pos[1] for pos in people_positions]
881
+ y_mean = sum(y_coords) / len(y_coords)
882
+ y_variance = sum((y - y_mean)**2 for y in y_coords) / len(y_coords)
883
+ if y_variance < 0.05: # Low variance indicates linear arrangement
884
+ structured_path = True
885
+
886
+ street_desc = "A commercial street with "
887
+ if people_count > 0:
888
+ street_desc += f"{people_count} {'pedestrians' if people_count > 1 else 'pedestrian'}"
889
+ if vehicle_count > 0:
890
+ street_desc += f" and {vehicle_count} {'vehicles' if vehicle_count > 1 else 'vehicle'}"
891
+ elif vehicle_count > 0:
892
+ street_desc += f"{vehicle_count} {'vehicles' if vehicle_count > 1 else 'vehicle'}"
893
+ else:
894
+ street_desc += "various commercial elements"
895
+
896
+ if structured_path:
897
+ street_desc += ". The pedestrians appear to be following a defined walking path"
898
+
899
+ # Add cultural elements
900
+ street_desc += ". The signage and architectural elements suggest an Asian urban setting."
901
+
902
+ return street_desc
903
+
904
+ # Default general description
905
+ return "The scene contains various elements characteristic of this environment."
906
+
907
def _detect_viewpoint(self, detected_objects: List[Dict]) -> str:
    """
    Detect the camera viewpoint, with special emphasis on recognizing
    aerial (top-down) views such as crosswalk scenes.

    Heuristics, in priority order:
      1. Crosswalk pattern among pedestrians (cross-shaped or dual linear
         clusters) -> "aerial".
      2. Many pedestrians evenly spread across regions -> "aerial".
      3. Size-variance / region-ratio thresholds -> "aerial", "low_angle",
         or "elevated".
      4. Otherwise "eye_level".

    Args:
        detected_objects: List of detection dicts. Expected keys per item:
            "region", "class_id"; optionally "normalized_area",
            "normalized_size" (w, h), "normalized_center" (x, y).

    Returns:
        str: One of "aerial", "low_angle", "elevated", "eye_level".
    """
    if not detected_objects:
        return "eye_level"  # default

    # Tallies of objects appearing in top/bottom regions of the frame
    top_region_count = 0
    bottom_region_count = 0
    total_objects = len(detected_objects)

    # Normalized areas, used to measure size uniformity (aerial cue)
    sizes = []

    # Height/width ratios, used for low-angle detection
    height_width_ratios = []

    # Pedestrian centers and flag for crosswalk-pattern detection
    people_positions = []
    crosswalk_pattern_detected = False

    for obj in detected_objects:
        # Count objects whose region label places them top/bottom
        region = obj["region"]
        if "top" in region:
            top_region_count += 1
        elif "bottom" in region:
            bottom_region_count += 1

        # Collect normalized area when available
        if "normalized_area" in obj:
            sizes.append(obj["normalized_area"])

        # Collect height/width ratio when size is available
        if "normalized_size" in obj:
            width, height = obj["normalized_size"]
            if width > 0:
                height_width_ratios.append(height / width)

        # Collect pedestrian centers for pattern detection
        if obj["class_id"] == 0:  # person
            if "normalized_center" in obj:
                people_positions.append(obj["normalized_center"])

    # Dedicated logic for zebra-crossing intersections:
    # look for clear vertical + horizontal pedestrian distributions.
    people_objs = [obj for obj in detected_objects if obj["class_id"] == 0]  # persons

    if len(people_objs) >= 8:  # enough people to form a crossing pattern
        # Check for a crosswalk pattern among the collected positions
        if len(people_positions) >= 4:
            # Analyze clustering of positions, looking for linear layouts
            x_coords = [pos[0] for pos in people_positions]
            y_coords = [pos[1] for pos in people_positions]

            # Variance and range of x / y coordinates
            # NOTE(review): x_variance / y_variance are computed but unused here.
            x_variance = np.var(x_coords) if len(x_coords) > 1 else 0
            y_variance = np.var(y_coords) if len(y_coords) > 1 else 0

            x_range = max(x_coords) - min(x_coords)
            y_range = max(y_coords) - min(y_coords)

            # Try to detect a cross-shaped distribution: wide, similar
            # coverage in both x and y suggests an intersection.
            if x_range > 0.5 and y_range > 0.5 and 0.7 < (x_range / y_range) < 1.3:

                # Distance of each point from the crowd's center
                center_x = np.mean(x_coords)
                center_y = np.mean(y_coords)

                # Map points onto the cross axes (horizontal and vertical)
                x_axis_distance = [abs(x - center_x) for x in x_coords]
                y_axis_distance = [abs(y - center_y) for y in y_coords]

                # Points should lie close to one axis (horizontal or vertical);
                # count how many do.
                close_to_axis_count = 0
                for i in range(len(x_coords)):
                    if x_axis_distance[i] < 0.1 or y_axis_distance[i] < 0.1:
                        close_to_axis_count += 1

                # Enough axis-aligned points -> treat as an intersection
                if close_to_axis_count >= len(x_coords) * 0.6:
                    crosswalk_pattern_detected = True

            # If no cross shape was found, try linear cluster detection
            if not crosswalk_pattern_detected:
                # Cluster along x and y independently
                x_clusters = self._detect_linear_clusters(x_coords)
                y_clusters = self._detect_linear_clusters(y_coords)

                # Multiple clusters on both axes may indicate crossing walkways
                if len(x_clusters) >= 2 and len(y_clusters) >= 2:
                    crosswalk_pattern_detected = True

    # Crosswalk pattern takes priority over every other cue
    if crosswalk_pattern_detected:
        return "aerial"

    # Check how pedestrians are distributed across regions
    if len(people_objs) >= 10:
        people_region_counts = {}
        for obj in people_objs:
            region = obj["region"]
            if region not in people_region_counts:
                people_region_counts[region] = 0
            people_region_counts[region] += 1

        # Number of regions containing at least two pedestrians
        region_count = len([r for r, c in people_region_counts.items() if c >= 2])

        # Pedestrians spread over many regions may indicate an aerial view
        if region_count >= 4:
            # Examine how uniform the per-region pedestrian counts are
            region_counts = list(people_region_counts.values())
            region_counts_variance = np.var(region_counts) if len(region_counts) > 1 else 0
            region_counts_mean = np.mean(region_counts) if region_counts else 0

            # A fairly even spread (small variation coefficient) -> aerial
            if region_counts_mean > 0:
                variation_coefficient = region_counts_variance / region_counts_mean
                if variation_coefficient < 0.5:
                    return "aerial"

    # Ratio-based metrics
    top_ratio = top_region_count / total_objects if total_objects > 0 else 0
    bottom_ratio = bottom_region_count / total_objects if total_objects > 0 else 0

    # Normalized size variance across all objects
    size_variance = 0
    if sizes:
        mean_size = sum(sizes) / len(sizes)
        size_variance = sum((s - mean_size) ** 2 for s in sizes) / len(sizes)
        size_variance = size_variance / (mean_size ** 2)  # normalize

    # Average height/width ratio
    avg_height_width_ratio = sum(height_width_ratios) / len(height_width_ratios) if height_width_ratios else 1.0

    # Aerial: uniform sizes, objects spread with few at the bottom
    if (size_variance < self.viewpoint_params["aerial_size_variance_threshold"] and
        bottom_ratio < 0.3 and top_ratio > self.viewpoint_params["aerial_threshold"]):
        return "aerial"

    # Low angle: objects tend to be taller than wide, more objects at the top
    elif (avg_height_width_ratio > self.viewpoint_params["vertical_size_ratio_threshold"] and
          top_ratio > self.viewpoint_params["low_angle_threshold"]):
        return "low_angle"

    # Elevated: more objects at the bottom, few at the top
    elif (bottom_ratio > self.viewpoint_params["elevated_threshold"] and
          top_ratio < self.viewpoint_params["elevated_top_threshold"]):
        return "elevated"

    # Default: eye level
    return "eye_level"
+ def _detect_linear_clusters(self, coords, threshold=0.05):
1072
+ """
1073
+ 檢測坐標中的線性聚類
1074
+
1075
+ Args:
1076
+ coords: 一維坐標列表
1077
+ threshold: 聚類閾值
1078
+
1079
+ Returns:
1080
+ list: 聚類列表
1081
+ """
1082
+ if not coords:
1083
+ return []
1084
+
1085
+ # 排序坐標
1086
+ sorted_coords = sorted(coords)
1087
+
1088
+ clusters = []
1089
+ current_cluster = [sorted_coords[0]]
1090
+
1091
+ for i in range(1, len(sorted_coords)):
1092
+ # 如果當前坐標與前一個接近,添加到當前聚類
1093
+ if sorted_coords[i] - sorted_coords[i-1] < threshold:
1094
+ current_cluster.append(sorted_coords[i])
1095
+ else:
1096
+ # 否則開始新的聚類
1097
+ if len(current_cluster) >= 2: # 至少需要2個點形成聚類
1098
+ clusters.append(current_cluster)
1099
+ current_cluster = [sorted_coords[i]]
1100
+
1101
+ # 添加最後一個cluster
1102
+ if len(current_cluster) >= 2:
1103
+ clusters.append(current_cluster)
1104
+
1105
+ return clusters
1106
+
1107
+ def _detect_cultural_context(self, scene_type: str, detected_objects: List[Dict]) -> Optional[str]:
1108
+ """
1109
+ Detect the likely cultural context of the scene.
1110
+
1111
+ Args:
1112
+ scene_type: Identified scene type
1113
+ detected_objects: List of detected objects
1114
+
1115
+ Returns:
1116
+ Optional[str]: Detected cultural context (asian, european, etc.) or None
1117
+ """
1118
+ # Scene types with explicit cultural contexts
1119
+ cultural_scene_mapping = {
1120
+ "asian_commercial_street": "asian",
1121
+ "asian_night_market": "asian",
1122
+ "asian_temple_area": "asian",
1123
+ "european_plaza": "european"
1124
+ }
1125
+
1126
+ # Check if scene type directly indicates cultural context
1127
+ if scene_type in cultural_scene_mapping:
1128
+ return cultural_scene_mapping[scene_type]
1129
+
1130
+ # No specific cultural context detected
1131
+ return None
1132
+
1133
+ def _generate_cultural_elements(self, cultural_context: str) -> str:
1134
+ """
1135
+ Generate description of cultural elements for the detected context.
1136
+
1137
+ Args:
1138
+ cultural_context: Detected cultural context
1139
+
1140
+ Returns:
1141
+ str: Description of cultural elements
1142
+ """
1143
+ # Get template for this cultural context
1144
+ cultural_templates = self.templates.get("cultural_templates", {})
1145
+
1146
+ if cultural_context in cultural_templates:
1147
+ template = cultural_templates[cultural_context]
1148
+ elements = template.get("elements", [])
1149
+
1150
+ if elements:
1151
+ # Select 1-2 random elements
1152
+ num_elements = min(len(elements), random.randint(1, 2))
1153
+ selected_elements = random.sample(elements, num_elements)
1154
+
1155
+ # Format elements list
1156
+ elements_text = " and ".join(selected_elements) if num_elements == 2 else selected_elements[0]
1157
+
1158
+ # Fill template
1159
+ return template.get("description", "").format(elements=elements_text)
1160
+
1161
+ return ""
1162
+
1163
def _optimize_object_description(self, description: str) -> str:
    """
    Optimize an object description by de-duplicating repeated item mentions.

    Two fixes are applied:
      1. "a bed in the room" is shortened to "a bed".
      2. Comma lists following "with" (e.g. "with chair, chair, table") are
         rewritten with counts ("with 2 chairs and table").

    Args:
        description: Generated description text

    Returns:
        str: Description with duplicate listings collapsed
    """
    import re

    # Collapse the redundant bed phrasing
    if "bed in the room" in description:
        description = description.replace("a bed in the room", "a bed")

    # Find comma-separated item lists introduced by "with", ending at a
    # period or "and"
    object_lists = re.findall(r'with ([^\.]+?)(?:\.|\band\b)', description)

    for obj_list in object_lists:
        # Count occurrences of each item in the list
        items = re.findall(r'([a-zA-Z\s]+)(?:,|\band\b|$)', obj_list)
        item_counts = {}

        for item in items:
            item = item.strip()
            # Skip connective words and empty fragments
            if item and item not in ["and", "with"]:
                if item not in item_counts:
                    item_counts[item] = 0
                item_counts[item] += 1

        # Rebuild the list with counts for repeated items
        if item_counts:
            new_items = []
            for item, count in item_counts.items():
                if count > 1:
                    # naive pluralization by appending "s"
                    new_items.append(f"{count} {item}s")
                else:
                    new_items.append(item)

            # Natural-English join: "a", "a and b", "a, b, and c"
            if len(new_items) == 1:
                new_list = new_items[0]
            elif len(new_items) == 2:
                new_list = f"{new_items[0]} and {new_items[1]}"
            else:
                new_list = ", ".join(new_items[:-1]) + f", and {new_items[-1]}"

            # Swap the original list text for the de-duplicated version
            description = description.replace(obj_list, new_list)

    return description
def _describe_functional_zones(self, functional_zones: Dict) -> str:
    """
    Generate a description of the scene's functional zones, with special
    handling for pedestrian areas, people counting, and de-duplicated
    object listings.

    Args:
        functional_zones: Dict of identified functional zones; each value
            may carry "objects" (list of class names) and "description".

    Returns:
        str: Functional-zone description ("" when there is nothing to say)
    """
    if not functional_zones:
        return ""

    # Total number of people in the scene
    total_people_count = 0
    people_by_zone = {}

    # Count the people in each zone and accumulate the total
    for zone_name, zone_info in functional_zones.items():
        if "objects" in zone_info:
            zone_people_count = zone_info["objects"].count("person")
            people_by_zone[zone_name] = zone_people_count
            total_people_count += zone_people_count

    # Split zones into pedestrian-related zones and everything else
    pedestrian_zones = []
    other_zones = []

    for zone_name, zone_info in functional_zones.items():
        # Pedestrian-related zones are recognized by keywords in the name
        if any(keyword in zone_name.lower() for keyword in ["pedestrian", "crossing", "people"]):
            pedestrian_zones.append((zone_name, zone_info))
        else:
            other_zones.append((zone_name, zone_info))

    # Keep the most significant pedestrian zone and other zones
    main_pedestrian_zones = sorted(pedestrian_zones,
                                   key=lambda z: people_by_zone.get(z[0], 0),
                                   reverse=True)[:1]  # at most 1 main pedestrian zone

    top_other_zones = sorted(other_zones,
                             key=lambda z: len(z[1].get("objects", [])),
                             reverse=True)[:2]  # at most 2 other zones

    # Merge the selections
    top_zones = main_pedestrian_zones + top_other_zones

    if not top_zones:
        return ""

    # Build the summary sentence
    summary = ""
    max_mentioned_people = 0  # track the largest people count already mentioned

    # Mention the total headcount when it is significant and not yet stated
    if total_people_count > 5:
        summary = f"The scene contains a significant number of pedestrians ({total_people_count} people). "
        max_mentioned_people = total_people_count  # update the largest mentioned count

    # Rewrite each zone description so people counts stay consistent
    processed_zones = []

    for zone_name, zone_info in top_zones:
        zone_desc = zone_info.get("description", "a functional zone")
        zone_people_count = people_by_zone.get(zone_name, 0)

        # Does the description already mention a people count?
        contains_people_info = "with" in zone_desc and ("person" in zone_desc.lower() or "people" in zone_desc.lower())

        # Drop the smaller per-zone count when a larger total was mentioned
        if contains_people_info and zone_people_count < max_mentioned_people:
            parts = zone_desc.split("with")
            if len(parts) > 1:
                # Strip the people-count tail
                zone_desc = parts[0].strip() + " area"

        processed_zones.append((zone_name, {"description": zone_desc}))

    # Assemble the final description based on how many zones survived
    final_desc = ""

    if len(processed_zones) == 1:
        _, zone_info = processed_zones[0]
        zone_desc = zone_info["description"]
        final_desc = summary + f"The scene includes {zone_desc}."
    elif len(processed_zones) == 2:
        _, zone1_info = processed_zones[0]
        _, zone2_info = processed_zones[1]
        zone1_desc = zone1_info["description"]
        zone2_desc = zone2_info["description"]
        final_desc = summary + f"The scene is divided into two main areas: {zone1_desc} and {zone2_desc}."
    else:
        zones_desc = ["The scene contains multiple functional areas including"]
        zone_descriptions = [z[1]["description"] for z in processed_zones]

        # Natural-English join for three or more zone descriptions
        if len(zone_descriptions) == 3:
            formatted_desc = f"{zone_descriptions[0]}, {zone_descriptions[1]}, and {zone_descriptions[2]}"
        else:
            formatted_desc = ", ".join(zone_descriptions[:-1]) + f", and {zone_descriptions[-1]}"

        final_desc = summary + f"{zones_desc[0]} {formatted_desc}."

    # De-duplicate repeated item mentions before returning
    return self._optimize_object_description(final_desc)
image_processor.py CHANGED
@@ -11,64 +11,125 @@ from detection_model import DetectionModel
11
  from color_mapper import ColorMapper
12
  from visualization_helper import VisualizationHelper
13
  from evaluation_metrics import EvaluationMetrics
 
 
14
 
15
  class ImageProcessor:
16
  """
17
  Class for handling image processing and object detection operations
18
  Separates processing logic from UI components
19
  """
20
-
21
  def __init__(self):
22
  """Initialize the image processor with required components"""
23
  self.color_mapper = ColorMapper()
24
  self.model_instances = {}
25
-
26
- def get_model_instance(self, model_name: str, confidence: float = 0.25, iou: float = 0.35) -> DetectionModel:
 
27
  """
28
  Get or create a model instance based on model name
29
-
30
  Args:
31
  model_name: Name of the model to use
32
  confidence: Confidence threshold for detection
33
  iou: IoU threshold for non-maximum suppression
34
-
35
  Returns:
36
  DetectionModel instance
37
  """
38
  if model_name not in self.model_instances:
39
  print(f"Creating new model instance for {model_name}")
40
  self.model_instances[model_name] = DetectionModel(
41
- model_name=model_name,
42
- confidence=confidence,
43
  iou=iou
44
  )
45
  else:
46
  print(f"Using existing model instance for {model_name}")
47
  self.model_instances[model_name].confidence = confidence
48
-
49
  return self.model_instances[model_name]
50
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
  def process_image(self, image, model_name: str, confidence_threshold: float, filter_classes: Optional[List[int]] = None) -> Tuple[Any, str, Dict]:
52
  """
53
  Process an image for object detection
54
-
55
  Args:
56
  image: Input image (numpy array or PIL Image)
57
  model_name: Name of the model to use
58
  confidence_threshold: Confidence threshold for detection
59
  filter_classes: Optional list of classes to filter results
60
-
61
  Returns:
62
  Tuple of (result_image, result_text, stats_data)
63
  """
64
  # Get model instance
65
  model_instance = self.get_model_instance(model_name, confidence_threshold)
66
-
67
  # Initialize key variables
68
  result = None
69
  stats = {}
70
  temp_path = None
71
-
72
  try:
73
  # Processing input image
74
  if isinstance(image, np.ndarray):
@@ -82,44 +143,51 @@ class ImageProcessor:
82
  return None, "No image provided. Please upload an image.", {}
83
  else:
84
  pil_image = image
85
-
 
 
 
86
  # Store temp files
87
  temp_dir = tempfile.gettempdir() # Use system temp directory
88
  temp_filename = f"temp_{uuid.uuid4().hex}.jpg"
89
  temp_path = os.path.join(temp_dir, temp_filename)
90
  pil_image.save(temp_path)
91
-
92
  # Object detection
93
  result = model_instance.detect(temp_path)
94
-
95
  if result is None:
96
  return None, "Detection failed. Please try again with a different image.", {}
97
-
98
  # Calculate stats
99
  stats = EvaluationMetrics.calculate_basic_stats(result)
100
-
101
  # Add space calculation
102
  spatial_metrics = EvaluationMetrics.calculate_distance_metrics(result)
103
  stats["spatial_metrics"] = spatial_metrics
104
-
 
 
 
105
  # Apply filter if specified
106
  if filter_classes and len(filter_classes) > 0:
107
  # Get classes, boxes, confidence
108
  classes = result.boxes.cls.cpu().numpy().astype(int)
109
  confs = result.boxes.conf.cpu().numpy()
110
  boxes = result.boxes.xyxy.cpu().numpy()
111
-
112
  mask = np.zeros_like(classes, dtype=bool)
113
  for cls_id in filter_classes:
114
  mask = np.logical_or(mask, classes == cls_id)
115
-
116
  filtered_stats = {
117
  "total_objects": int(np.sum(mask)),
118
  "class_statistics": {},
119
  "average_confidence": float(np.mean(confs[mask])) if np.any(mask) else 0,
120
- "spatial_metrics": stats["spatial_metrics"]
 
121
  }
122
-
123
  # Update stats
124
  names = result.names
125
  for cls, conf in zip(classes[mask], confs[mask]):
@@ -129,59 +197,67 @@ class ImageProcessor:
129
  "count": 0,
130
  "average_confidence": 0
131
  }
132
-
133
  filtered_stats["class_statistics"][cls_name]["count"] += 1
134
  filtered_stats["class_statistics"][cls_name]["average_confidence"] = conf
135
-
136
  stats = filtered_stats
137
-
138
  viz_data = EvaluationMetrics.generate_visualization_data(
139
  result,
140
  self.color_mapper.get_all_colors()
141
  )
142
-
143
  result_image = VisualizationHelper.visualize_detection(
144
  temp_path, result, color_mapper=self.color_mapper, figsize=(12, 12), return_pil=True, filter_classes=filter_classes
145
  )
146
-
147
  result_text = EvaluationMetrics.format_detection_summary(viz_data)
148
-
 
 
 
 
 
 
 
149
  return result_image, result_text, stats
150
-
151
  except Exception as e:
152
  error_message = f"Error Occurs: {str(e)}"
153
  import traceback
154
  traceback.print_exc()
155
  print(error_message)
156
  return None, error_message, {}
157
-
158
  finally:
159
  if temp_path and os.path.exists(temp_path):
160
  try:
161
  os.remove(temp_path)
162
  except Exception as e:
163
  print(f"Cannot delete temp files {temp_path}: {str(e)}")
164
-
 
165
  def format_result_text(self, stats: Dict) -> str:
166
  """
167
  Format detection statistics into readable text with improved spacing
168
-
169
  Args:
170
  stats: Dictionary containing detection statistics
171
-
172
  Returns:
173
  Formatted text summary
174
  """
175
  if not stats or "total_objects" not in stats:
176
  return "No objects detected."
177
-
178
  # 減少不必要的空行
179
  lines = [
180
  f"Detected {stats['total_objects']} objects.",
181
  f"Average confidence: {stats.get('average_confidence', 0):.2f}",
182
  "Objects by class:"
183
  ]
184
-
185
  if "class_statistics" in stats and stats["class_statistics"]:
186
  # 按計數排序類別
187
  sorted_classes = sorted(
@@ -189,24 +265,24 @@ class ImageProcessor:
189
  key=lambda x: x[1]["count"],
190
  reverse=True
191
  )
192
-
193
  for cls_name, cls_stats in sorted_classes:
194
  count = cls_stats["count"]
195
  conf = cls_stats.get("average_confidence", 0)
196
-
197
  item_text = "item" if count == 1 else "items"
198
  lines.append(f"• {cls_name}: {count} {item_text} (avg conf: {conf:.2f})")
199
  else:
200
  lines.append("No class information available.")
201
-
202
  # 添加空間信息
203
  if "spatial_metrics" in stats and "spatial_distribution" in stats["spatial_metrics"]:
204
  lines.append("Object Distribution:")
205
-
206
  dist = stats["spatial_metrics"]["spatial_distribution"]
207
  x_mean = dist.get("x_mean", 0)
208
  y_mean = dist.get("y_mean", 0)
209
-
210
  # 描述物體的大致位置
211
  if x_mean < 0.33:
212
  h_pos = "on the left side"
@@ -214,37 +290,37 @@ class ImageProcessor:
214
  h_pos = "in the center"
215
  else:
216
  h_pos = "on the right side"
217
-
218
  if y_mean < 0.33:
219
  v_pos = "in the upper part"
220
  elif y_mean < 0.67:
221
  v_pos = "in the middle"
222
  else:
223
  v_pos = "in the lower part"
224
-
225
  lines.append(f"• Most objects appear {h_pos} {v_pos} of the image")
226
-
227
  return "\n".join(lines)
228
-
229
  def format_json_for_display(self, stats: Dict) -> Dict:
230
  """
231
  Format statistics JSON for better display
232
-
233
  Args:
234
  stats: Raw statistics dictionary
235
-
236
  Returns:
237
  Formatted statistics structure for display
238
  """
239
  # Create a cleaner copy of the stats for display
240
  display_stats = {}
241
-
242
  # Add summary section
243
  display_stats["summary"] = {
244
  "total_objects": stats.get("total_objects", 0),
245
  "average_confidence": round(stats.get("average_confidence", 0), 3)
246
  }
247
-
248
  # Add class statistics in a more organized way
249
  if "class_statistics" in stats and stats["class_statistics"]:
250
  # Sort classes by count (descending)
@@ -253,20 +329,20 @@ class ImageProcessor:
253
  key=lambda x: x[1].get("count", 0),
254
  reverse=True
255
  )
256
-
257
  class_stats = {}
258
  for cls_name, cls_data in sorted_classes:
259
  class_stats[cls_name] = {
260
  "count": cls_data.get("count", 0),
261
  "average_confidence": round(cls_data.get("average_confidence", 0), 3)
262
  }
263
-
264
  display_stats["detected_objects"] = class_stats
265
-
266
  # Simplify spatial metrics
267
  if "spatial_metrics" in stats:
268
  spatial = stats["spatial_metrics"]
269
-
270
  # Simplify spatial distribution
271
  if "spatial_distribution" in spatial:
272
  dist = spatial["spatial_distribution"]
@@ -278,7 +354,7 @@ class ImageProcessor:
278
  "y_std": round(dist.get("y_std", 0), 3)
279
  }
280
  }
281
-
282
  # Add simplified size information
283
  if "size_distribution" in spatial:
284
  size = spatial["size_distribution"]
@@ -287,30 +363,30 @@ class ImageProcessor:
287
  "min_area": round(size.get("min_area", 0), 3),
288
  "max_area": round(size.get("max_area", 0), 3)
289
  }
290
-
291
  return display_stats
292
-
293
  def prepare_visualization_data(self, stats: Dict, available_classes: Dict[int, str]) -> Dict:
294
  """
295
  Prepare data for visualization based on detection statistics
296
-
297
  Args:
298
  stats: Detection statistics
299
  available_classes: Dictionary of available class IDs and names
300
-
301
  Returns:
302
  Visualization data dictionary
303
  """
304
  if not stats or "class_statistics" not in stats or not stats["class_statistics"]:
305
  return {"error": "No detection data available"}
306
-
307
  # Prepare visualization data
308
  viz_data = {
309
  "total_objects": stats.get("total_objects", 0),
310
  "average_confidence": stats.get("average_confidence", 0),
311
  "class_data": []
312
  }
313
-
314
  # Class data
315
  for cls_name, cls_stats in stats.get("class_statistics", {}).items():
316
  # Search class ID
@@ -319,7 +395,7 @@ class ImageProcessor:
319
  if name == cls_name:
320
  class_id = id
321
  break
322
-
323
  cls_data = {
324
  "name": cls_name,
325
  "class_id": class_id,
@@ -327,10 +403,10 @@ class ImageProcessor:
327
  "average_confidence": cls_stats.get("average_confidence", 0),
328
  "color": self.color_mapper.get_color(class_id if class_id >= 0 else cls_name)
329
  }
330
-
331
  viz_data["class_data"].append(cls_data)
332
-
333
  # Descending order
334
  viz_data["class_data"].sort(key=lambda x: x["count"], reverse=True)
335
-
336
  return viz_data
 
11
  from color_mapper import ColorMapper
12
  from visualization_helper import VisualizationHelper
13
  from evaluation_metrics import EvaluationMetrics
14
+ from lighting_analyzer import LightingAnalyzer
15
+ from scene_analyzer import SceneAnalyzer
16
 
17
  class ImageProcessor:
18
  """
19
  Class for handling image processing and object detection operations
20
  Separates processing logic from UI components
21
  """
22
+
23
  def __init__(self):
24
  """Initialize the image processor with required components"""
25
  self.color_mapper = ColorMapper()
26
  self.model_instances = {}
27
+ self.lighting_analyzer = LightingAnalyzer()
28
+
29
+ def get_model_instance(self, model_name: str, confidence: float = 0.25, iou: float = 0.25) -> DetectionModel:
30
  """
31
  Get or create a model instance based on model name
32
+
33
  Args:
34
  model_name: Name of the model to use
35
  confidence: Confidence threshold for detection
36
  iou: IoU threshold for non-maximum suppression
37
+
38
  Returns:
39
  DetectionModel instance
40
  """
41
  if model_name not in self.model_instances:
42
  print(f"Creating new model instance for {model_name}")
43
  self.model_instances[model_name] = DetectionModel(
44
+ model_name=model_name,
45
+ confidence=confidence,
46
  iou=iou
47
  )
48
  else:
49
  print(f"Using existing model instance for {model_name}")
50
  self.model_instances[model_name].confidence = confidence
51
+
52
  return self.model_instances[model_name]
53
+
54
+ def analyze_scene(self, detection_result: Any, lighting_info: Optional[Dict] = None) -> Dict:
55
+ """
56
+ Perform scene analysis on detection results
57
+
58
+ Args:
59
+ detection_result: Object detection result from YOLOv8
60
+ lighting_info: Lighting condition analysis results (optional)
61
+
62
+ Returns:
63
+ Dictionary containing scene analysis results
64
+ """
65
+ try:
66
+ # Initialize scene analyzer if not already done
67
+ if not hasattr(self, 'scene_analyzer'):
68
+ self.scene_analyzer = SceneAnalyzer(class_names=detection_result.names)
69
+
70
+ # 確保類名正確更新
71
+ if self.scene_analyzer.class_names is None:
72
+ self.scene_analyzer.class_names = detection_result.names
73
+ self.scene_analyzer.spatial_analyzer.class_names = detection_result.names
74
+
75
+ # Perform scene analysis with lighting info
76
+ scene_analysis = self.scene_analyzer.analyze(
77
+ detection_result=detection_result,
78
+ lighting_info=lighting_info,
79
+ class_confidence_threshold=0.35,
80
+ scene_confidence_threshold=0.6
81
+ )
82
+
83
+ return scene_analysis
84
+ except Exception as e:
85
+ print(f"Error in scene analysis: {str(e)}")
86
+ import traceback
87
+ traceback.print_exc()
88
+ return {
89
+ "scene_type": "unknown",
90
+ "confidence": 0.0,
91
+ "description": f"Error during scene analysis: {str(e)}",
92
+ "objects_present": [],
93
+ "object_count": 0,
94
+ "regions": {},
95
+ "possible_activities": [],
96
+ "safety_concerns": [],
97
+ "lighting_conditions": lighting_info or {"time_of_day": "unknown", "confidence": 0.0}
98
+ }
99
+
100
+ def analyze_lighting_conditions(self, image):
101
+ """
102
+ 分析光照條件。
103
+
104
+ Args:
105
+ image: 輸入圖像
106
+
107
+ Returns:
108
+ Dict: 光照分析結果
109
+ """
110
+ return self.lighting_analyzer.analyze(image)
111
+
112
  def process_image(self, image, model_name: str, confidence_threshold: float, filter_classes: Optional[List[int]] = None) -> Tuple[Any, str, Dict]:
113
  """
114
  Process an image for object detection
115
+
116
  Args:
117
  image: Input image (numpy array or PIL Image)
118
  model_name: Name of the model to use
119
  confidence_threshold: Confidence threshold for detection
120
  filter_classes: Optional list of classes to filter results
121
+
122
  Returns:
123
  Tuple of (result_image, result_text, stats_data)
124
  """
125
  # Get model instance
126
  model_instance = self.get_model_instance(model_name, confidence_threshold)
127
+
128
  # Initialize key variables
129
  result = None
130
  stats = {}
131
  temp_path = None
132
+
133
  try:
134
  # Processing input image
135
  if isinstance(image, np.ndarray):
 
143
  return None, "No image provided. Please upload an image.", {}
144
  else:
145
  pil_image = image
146
+
147
+ # Analyze lighting conditions
148
+ lighting_info = self.analyze_lighting_conditions(pil_image)
149
+
150
  # Store temp files
151
  temp_dir = tempfile.gettempdir() # Use system temp directory
152
  temp_filename = f"temp_{uuid.uuid4().hex}.jpg"
153
  temp_path = os.path.join(temp_dir, temp_filename)
154
  pil_image.save(temp_path)
155
+
156
  # Object detection
157
  result = model_instance.detect(temp_path)
158
+
159
  if result is None:
160
  return None, "Detection failed. Please try again with a different image.", {}
161
+
162
  # Calculate stats
163
  stats = EvaluationMetrics.calculate_basic_stats(result)
164
+
165
  # Add space calculation
166
  spatial_metrics = EvaluationMetrics.calculate_distance_metrics(result)
167
  stats["spatial_metrics"] = spatial_metrics
168
+
169
+ # Add lighting information
170
+ stats["lighting_conditions"] = lighting_info
171
+
172
  # Apply filter if specified
173
  if filter_classes and len(filter_classes) > 0:
174
  # Get classes, boxes, confidence
175
  classes = result.boxes.cls.cpu().numpy().astype(int)
176
  confs = result.boxes.conf.cpu().numpy()
177
  boxes = result.boxes.xyxy.cpu().numpy()
178
+
179
  mask = np.zeros_like(classes, dtype=bool)
180
  for cls_id in filter_classes:
181
  mask = np.logical_or(mask, classes == cls_id)
182
+
183
  filtered_stats = {
184
  "total_objects": int(np.sum(mask)),
185
  "class_statistics": {},
186
  "average_confidence": float(np.mean(confs[mask])) if np.any(mask) else 0,
187
+ "spatial_metrics": stats["spatial_metrics"],
188
+ "lighting_conditions": lighting_info
189
  }
190
+
191
  # Update stats
192
  names = result.names
193
  for cls, conf in zip(classes[mask], confs[mask]):
 
197
  "count": 0,
198
  "average_confidence": 0
199
  }
200
+
201
  filtered_stats["class_statistics"][cls_name]["count"] += 1
202
  filtered_stats["class_statistics"][cls_name]["average_confidence"] = conf
203
+
204
  stats = filtered_stats
205
+
206
  viz_data = EvaluationMetrics.generate_visualization_data(
207
  result,
208
  self.color_mapper.get_all_colors()
209
  )
210
+
211
  result_image = VisualizationHelper.visualize_detection(
212
  temp_path, result, color_mapper=self.color_mapper, figsize=(12, 12), return_pil=True, filter_classes=filter_classes
213
  )
214
+
215
  result_text = EvaluationMetrics.format_detection_summary(viz_data)
216
+
217
+ if result is not None:
218
+ # Perform scene analysis with lighting info
219
+ scene_analysis = self.analyze_scene(result, lighting_info)
220
+
221
+ # Add scene analysis to stats
222
+ stats["scene_analysis"] = scene_analysis
223
+
224
  return result_image, result_text, stats
225
+
226
  except Exception as e:
227
  error_message = f"Error Occurs: {str(e)}"
228
  import traceback
229
  traceback.print_exc()
230
  print(error_message)
231
  return None, error_message, {}
232
+
233
  finally:
234
  if temp_path and os.path.exists(temp_path):
235
  try:
236
  os.remove(temp_path)
237
  except Exception as e:
238
  print(f"Cannot delete temp files {temp_path}: {str(e)}")
239
+
240
+
241
  def format_result_text(self, stats: Dict) -> str:
242
  """
243
  Format detection statistics into readable text with improved spacing
244
+
245
  Args:
246
  stats: Dictionary containing detection statistics
247
+
248
  Returns:
249
  Formatted text summary
250
  """
251
  if not stats or "total_objects" not in stats:
252
  return "No objects detected."
253
+
254
  # 減少不必要的空行
255
  lines = [
256
  f"Detected {stats['total_objects']} objects.",
257
  f"Average confidence: {stats.get('average_confidence', 0):.2f}",
258
  "Objects by class:"
259
  ]
260
+
261
  if "class_statistics" in stats and stats["class_statistics"]:
262
  # 按計數排序類別
263
  sorted_classes = sorted(
 
265
  key=lambda x: x[1]["count"],
266
  reverse=True
267
  )
268
+
269
  for cls_name, cls_stats in sorted_classes:
270
  count = cls_stats["count"]
271
  conf = cls_stats.get("average_confidence", 0)
272
+
273
  item_text = "item" if count == 1 else "items"
274
  lines.append(f"• {cls_name}: {count} {item_text} (avg conf: {conf:.2f})")
275
  else:
276
  lines.append("No class information available.")
277
+
278
  # 添加空間信息
279
  if "spatial_metrics" in stats and "spatial_distribution" in stats["spatial_metrics"]:
280
  lines.append("Object Distribution:")
281
+
282
  dist = stats["spatial_metrics"]["spatial_distribution"]
283
  x_mean = dist.get("x_mean", 0)
284
  y_mean = dist.get("y_mean", 0)
285
+
286
  # 描述物體的大致位置
287
  if x_mean < 0.33:
288
  h_pos = "on the left side"
 
290
  h_pos = "in the center"
291
  else:
292
  h_pos = "on the right side"
293
+
294
  if y_mean < 0.33:
295
  v_pos = "in the upper part"
296
  elif y_mean < 0.67:
297
  v_pos = "in the middle"
298
  else:
299
  v_pos = "in the lower part"
300
+
301
  lines.append(f"• Most objects appear {h_pos} {v_pos} of the image")
302
+
303
  return "\n".join(lines)
304
+
305
  def format_json_for_display(self, stats: Dict) -> Dict:
306
  """
307
  Format statistics JSON for better display
308
+
309
  Args:
310
  stats: Raw statistics dictionary
311
+
312
  Returns:
313
  Formatted statistics structure for display
314
  """
315
  # Create a cleaner copy of the stats for display
316
  display_stats = {}
317
+
318
  # Add summary section
319
  display_stats["summary"] = {
320
  "total_objects": stats.get("total_objects", 0),
321
  "average_confidence": round(stats.get("average_confidence", 0), 3)
322
  }
323
+
324
  # Add class statistics in a more organized way
325
  if "class_statistics" in stats and stats["class_statistics"]:
326
  # Sort classes by count (descending)
 
329
  key=lambda x: x[1].get("count", 0),
330
  reverse=True
331
  )
332
+
333
  class_stats = {}
334
  for cls_name, cls_data in sorted_classes:
335
  class_stats[cls_name] = {
336
  "count": cls_data.get("count", 0),
337
  "average_confidence": round(cls_data.get("average_confidence", 0), 3)
338
  }
339
+
340
  display_stats["detected_objects"] = class_stats
341
+
342
  # Simplify spatial metrics
343
  if "spatial_metrics" in stats:
344
  spatial = stats["spatial_metrics"]
345
+
346
  # Simplify spatial distribution
347
  if "spatial_distribution" in spatial:
348
  dist = spatial["spatial_distribution"]
 
354
  "y_std": round(dist.get("y_std", 0), 3)
355
  }
356
  }
357
+
358
  # Add simplified size information
359
  if "size_distribution" in spatial:
360
  size = spatial["size_distribution"]
 
363
  "min_area": round(size.get("min_area", 0), 3),
364
  "max_area": round(size.get("max_area", 0), 3)
365
  }
366
+
367
  return display_stats
368
+
369
  def prepare_visualization_data(self, stats: Dict, available_classes: Dict[int, str]) -> Dict:
370
  """
371
  Prepare data for visualization based on detection statistics
372
+
373
  Args:
374
  stats: Detection statistics
375
  available_classes: Dictionary of available class IDs and names
376
+
377
  Returns:
378
  Visualization data dictionary
379
  """
380
  if not stats or "class_statistics" not in stats or not stats["class_statistics"]:
381
  return {"error": "No detection data available"}
382
+
383
  # Prepare visualization data
384
  viz_data = {
385
  "total_objects": stats.get("total_objects", 0),
386
  "average_confidence": stats.get("average_confidence", 0),
387
  "class_data": []
388
  }
389
+
390
  # Class data
391
  for cls_name, cls_stats in stats.get("class_statistics", {}).items():
392
  # Search class ID
 
395
  if name == cls_name:
396
  class_id = id
397
  break
398
+
399
  cls_data = {
400
  "name": cls_name,
401
  "class_id": class_id,
 
403
  "average_confidence": cls_stats.get("average_confidence", 0),
404
  "color": self.color_mapper.get_color(class_id if class_id >= 0 else cls_name)
405
  }
406
+
407
  viz_data["class_data"].append(cls_data)
408
+
409
  # Descending order
410
  viz_data["class_data"].sort(key=lambda x: x["count"], reverse=True)
411
+
412
  return viz_data
lighting_analyzer.py ADDED
@@ -0,0 +1,811 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import cv2
3
+ from typing import Dict, Any, Optional
4
+
5
+ class LightingAnalyzer:
6
+ """
7
+ 分析圖像的光照條件,提供增強的室內or室外判斷和光照類型分類,並專注於光照分析。
8
+ """
9
+
10
+ def __init__(self, config: Optional[Dict[str, Any]] = None):
11
+ """
12
+ 初始化光照分析器。
13
+
14
+ Args:
15
+ config: 可選的配置字典,用於自定義分析參數
16
+ """
17
+ self.config = config or self._get_default_config()
18
+
19
+ def analyze(self, image):
20
+ """
21
+ 分析圖像的光照條件。
22
+
23
+ 主要分析入口點,計算基本特徵,判斷室內/室外,確定光照條件。
24
+
25
+ Args:
26
+ image: 輸入圖像 (numpy array 或 PIL Image)
27
+
28
+ Returns:
29
+ Dict: 包含光照分析結果的字典
30
+ """
31
+ try:
32
+ # 轉換圖像格式
33
+ if not isinstance(image, np.ndarray):
34
+ image_np = np.array(image)
35
+ else:
36
+ image_np = image.copy()
37
+
38
+ # 確保 RGB 格式
39
+ if image_np.shape[2] == 3 and isinstance(image_np, np.ndarray):
40
+ image_rgb = cv2.cvtColor(image_np, cv2.COLOR_BGR2RGB)
41
+ else:
42
+ image_rgb = image_np
43
+
44
+ # 計算基本特徵
45
+ features = self._compute_basic_features(image_rgb)
46
+
47
+ # 分析室內or室外
48
+ indoor_result = self._analyze_indoor_outdoor(features)
49
+ is_indoor = indoor_result["is_indoor"]
50
+ indoor_probability = indoor_result["indoor_probability"]
51
+
52
+ # 確定光照條件
53
+ lighting_conditions = self._determine_lighting_conditions(features, is_indoor)
54
+
55
+ # 整合結果
56
+ result = {
57
+ "time_of_day": lighting_conditions["time_of_day"],
58
+ "confidence": float(lighting_conditions["confidence"]),
59
+ "is_indoor": is_indoor,
60
+ "indoor_probability": float(indoor_probability),
61
+ "brightness": {
62
+ "average": float(features["avg_brightness"]),
63
+ "std_dev": float(features["brightness_std"]),
64
+ "dark_ratio": float(features["dark_pixel_ratio"])
65
+ },
66
+ "color_info": {
67
+ "blue_ratio": float(features["blue_ratio"]),
68
+ "yellow_orange_ratio": float(features["yellow_orange_ratio"]),
69
+ "gray_ratio": float(features["gray_ratio"]),
70
+ "avg_saturation": float(features["avg_saturation"]),
71
+ "sky_brightness": float(features["sky_brightness"]),
72
+ "color_atmosphere": features["color_atmosphere"],
73
+ "warm_ratio": float(features["warm_ratio"]),
74
+ "cool_ratio": float(features["cool_ratio"])
75
+ }
76
+ }
77
+
78
+ # 添加診斷信息
79
+ if self.config["include_diagnostics"]:
80
+ result["diagnostics"] = {
81
+ "feature_contributions": indoor_result.get("feature_contributions", {}),
82
+ "lighting_diagnostics": lighting_conditions.get("diagnostics", {})
83
+ }
84
+
85
+ return result
86
+
87
+ except Exception as e:
88
+ print(f"Error in lighting analysis: {str(e)}")
89
+ import traceback
90
+ traceback.print_exc()
91
+ return {
92
+ "time_of_day": "unknown",
93
+ "confidence": 0,
94
+ "error": str(e)
95
+ }
96
+
97
    def _compute_basic_features(self, image_rgb):
        """
        Compute the basic lighting features of an image (heavily optimized version).

        Works on a downscaled copy for the expensive gradient passes and on
        the full-resolution HSV channels for the colour statistics.

        Args:
            image_rgb: Image in RGB format (numpy array). Assumes the OpenCV
                8-bit HSV convention after conversion (hue range 0-179).

        Returns:
            Dict: computed feature values (brightness, colour, structure and
            indoor/outdoor cues)
        """
        # Image dimensions
        height, width = image_rgb.shape[:2]

        # Adaptive downscale factor based on image size (larger images get
        # downscaled more aggressively, capped at base_scale + 8)
        base_scale = 4
        scale_factor = base_scale + min(8, max(0, int((height * width) / (1000 * 1000))))

        # Downscaled copy to speed up processing
        # NOTE(review): small_rgb appears unused below — confirm before removing.
        small_rgb = cv2.resize(image_rgb, (width//scale_factor, height//scale_factor))

        # Convert all colour spaces once up front to avoid recomputation
        hsv_img = cv2.cvtColor(image_rgb, cv2.COLOR_RGB2HSV)
        gray_img = cv2.cvtColor(image_rgb, cv2.COLOR_RGB2GRAY)
        small_gray = cv2.resize(gray_img, (width//scale_factor, height//scale_factor))

        # Split HSV channels
        h_channel = hsv_img[:,:,0]
        s_channel = hsv_img[:,:,1]
        v_channel = hsv_img[:,:,2]

        # Basic brightness features
        avg_brightness = np.mean(v_channel)
        brightness_std = np.std(v_channel)
        dark_pixel_ratio = np.sum(v_channel < 50) / (height * width)

        # Colour features: fraction of yellow/orange hues (warm artificial light, sunsets)
        yellow_orange_mask = ((h_channel >= 15) & (h_channel <= 40))
        yellow_orange_ratio = np.sum(yellow_orange_mask) / (height * width)

        # Fraction of blue hues (sky, shade)
        blue_mask = ((h_channel >= 90) & (h_channel <= 130))
        blue_ratio = np.sum(blue_mask) / (height * width)

        # Specifically inspect the top quarter of the image for blue-sky traits
        upper_region_h = h_channel[:height//4, :]
        upper_region_s = s_channel[:height//4, :]
        upper_region_v = v_channel[:height//4, :]

        # Blue sky is usually a bright, saturated blue
        sky_blue_mask = ((upper_region_h >= 90) & (upper_region_h <= 130) &
                        (upper_region_s > 70) & (upper_region_v > 150))
        sky_blue_ratio = np.sum(sky_blue_mask) / max(1, upper_region_h.size)

        # Low-saturation bright pixels read as gray (overcast sky, concrete)
        gray_mask = (s_channel < 50) & (v_channel > 100)
        gray_ratio = np.sum(gray_mask) / (height * width)

        avg_saturation = np.mean(s_channel)

        # Sky brightness: mean value of the upper half of the frame
        upper_half = v_channel[:height//2, :]
        sky_brightness = np.mean(upper_half)

        # Hue analysis: warm vs cool colour ratios
        # NOTE(review): for 8-bit images OpenCV hue is 0-179, so the
        # h >= 300 term can never match and cool 180-270 is unreachable —
        # confirm whether 150/90-135 (OpenCV half-scale) was intended.
        warm_colors = ((h_channel >= 0) & (h_channel <= 60)) | (h_channel >= 300)
        warm_ratio = np.sum(warm_colors) / (height * width)

        cool_colors = (h_channel >= 180) & (h_channel <= 270)
        cool_ratio = np.sum(cool_colors) / (height * width)

        # Decide the overall colour atmosphere
        if warm_ratio > 0.4:
            color_atmosphere = "warm"
        elif cool_ratio > 0.4:
            color_atmosphere = "cool"
        else:
            color_atmosphere = "neutral"

        # Compute gradients only on the downscaled image — major speedup
        gx = cv2.Sobel(small_gray, cv2.CV_32F, 1, 0, ksize=3)
        gy = cv2.Sobel(small_gray, cv2.CV_32F, 0, 1, ksize=3)

        vertical_strength = np.mean(np.abs(gy))
        horizontal_strength = np.mean(np.abs(gx))
        gradient_ratio = vertical_strength / max(horizontal_strength, 1e-5)

        # -- Brightness uniformity --
        brightness_uniformity = 1 - min(1, brightness_std / max(avg_brightness, 1e-5))

        # -- Fast ceiling analysis --
        # Analyze the top region with a more aggressive subsampling rate
        top_scale = scale_factor * 2  # more aggressive downsampling
        top_region = v_channel[:height//4:top_scale, ::top_scale]
        top_region_std = np.std(top_region)
        ceiling_uniformity = 1.0 - min(1.0, top_region_std / max(np.mean(top_region), 1e-5))

        # Simpler detection of horizontal lines near the top of the frame
        top_gradients = np.abs(gy[:small_gray.shape[0]//4, :])
        horizontal_lines_strength = np.mean(top_gradients)
        # Normalize
        horizontal_line_ratio = min(1.0, horizontal_lines_strength / 40)

        # Minimal bright-spot detection on a sparsely sampled value channel
        sampled_v = v_channel[::scale_factor*2, ::scale_factor*2]
        light_threshold = min(220, avg_brightness + 2*brightness_std)
        is_bright = sampled_v > light_threshold
        bright_spot_count = np.sum(is_bright)

        # Simplified stand-in for circular light-source analysis
        circular_light_score = 0
        indoor_light_score = 0
        light_distribution_uniformity = 0.5

        # Only analyze light sources for a modest number of bright spots;
        # many spots are more likely outdoor light reflections
        if 1 < bright_spot_count < 20:
            # Simple statistics of the bright-spot distribution
            bright_y, bright_x = np.where(is_bright)
            if len(bright_y) > 1:
                # Check whether spots cluster together — a common indoor-lighting pattern
                mean_x = np.mean(bright_x)
                mean_y = np.mean(bright_y)
                dist_from_center = np.sqrt((bright_x - mean_x)**2 + (bright_y - mean_y)**2)

                # Concentrated spots suggest a light fixture
                if np.std(dist_from_center) < np.mean(dist_from_center):
                    circular_light_score = min(3, len(bright_y) // 2)
                    light_distribution_uniformity = 0.7

                # Spots in the upper half are typical of indoor ceiling lights
                if np.mean(bright_y) < sampled_v.shape[0] / 2:
                    indoor_light_score = 0.6
                else:
                    indoor_light_score = 0.3

        # Use edge-region gradients to estimate scene boundaries quickly
        # NOTE(review): edge_scale appears unused below — confirm before removing.
        edge_scale = scale_factor * 2

        # Sample only the border regions of the image for analysis
        left_edge = small_gray[:, :small_gray.shape[1]//6]
        right_edge = small_gray[:, 5*small_gray.shape[1]//6:]
        top_edge = small_gray[:small_gray.shape[0]//6, :]

        # Gradient strength for each border region
        left_gradient = np.mean(np.abs(cv2.Sobel(left_edge, cv2.CV_32F, 1, 0, ksize=3)))
        right_gradient = np.mean(np.abs(cv2.Sobel(right_edge, cv2.CV_32F, 1, 0, ksize=3)))
        top_gradient = np.mean(np.abs(cv2.Sobel(top_edge, cv2.CV_32F, 0, 1, ksize=3)))

        # Normalize
        left_edge_density = min(1.0, left_gradient / 50.0)
        right_edge_density = min(1.0, right_gradient / 50.0)
        top_edge_density = min(1.0, top_gradient / 50.0)

        # Enclosed environments usually show stronger gradients at the image borders
        boundary_edge_score = (left_edge_density + right_edge_density + top_edge_density) / 3

        # Simple estimate of overall edge density
        edges_density = min(1.0, (np.mean(np.abs(gx)) + np.mean(np.abs(gy))) / 100.0)

        street_line_score = 0

        # Check the lower half for strong vertical lines (street markings, poles)
        bottom_half = small_gray[small_gray.shape[0]//2:, :]
        bottom_vert_gradient = cv2.Sobel(bottom_half, cv2.CV_32F, 0, 1, ksize=3)
        strong_vert_lines = np.abs(bottom_vert_gradient) > 50
        if np.sum(strong_vert_lines) > (bottom_half.size * 0.05):  # more than 5% of pixels are strong vertical lines
            street_line_score = 0.7

        # Assemble all features
        features = {
            # Basic brightness and colour features
            "avg_brightness": avg_brightness,
            "brightness_std": brightness_std,
            "dark_pixel_ratio": dark_pixel_ratio,
            "yellow_orange_ratio": yellow_orange_ratio,
            "blue_ratio": blue_ratio,
            "sky_blue_ratio": sky_blue_ratio,
            "gray_ratio": gray_ratio,
            "avg_saturation": avg_saturation,
            "sky_brightness": sky_brightness,
            "color_atmosphere": color_atmosphere,
            "warm_ratio": warm_ratio,
            "cool_ratio": cool_ratio,

            # Structural features
            "gradient_ratio": gradient_ratio,
            "brightness_uniformity": brightness_uniformity,
            "bright_spot_count": bright_spot_count,
            "vertical_strength": vertical_strength,
            "horizontal_strength": horizontal_strength,

            # Indoor/outdoor decision features
            "ceiling_uniformity": ceiling_uniformity,
            "horizontal_line_ratio": horizontal_line_ratio,
            "indoor_light_score": indoor_light_score,
            "circular_light_count": circular_light_score,
            "light_distribution_uniformity": light_distribution_uniformity,
            "boundary_edge_score": boundary_edge_score,
            "top_region_std": top_region_std,
            "edges_density": edges_density,

            # New: outdoor-specific feature
            "street_line_score": street_line_score
        }

        return features
300
+
301
def _analyze_indoor_outdoor(self, features):
    """
    Classify the scene as indoor or outdoor by fusing multiple visual features.

    An additive evidence score is accumulated (positive evidence pushes toward
    "indoor", negative toward "outdoor") and squashed through a sigmoid to
    obtain a probability.

    Args:
        features: Feature dictionary produced by the feature-extraction step
            (brightness/color statistics plus structural cues such as
            ceiling uniformity, boundary edges and bright-spot counts).

    Returns:
        Dict with keys:
            is_indoor (bool): final decision (indoor probability > 0.5).
            indoor_probability (float): sigmoid of the evidence score.
            indoor_score (float): raw additive evidence score.
            feature_contributions (dict): per-feature score contributions.
            diagnostics (dict): extra diagnostic flags (e.g. street pattern).
    """
    # Per-feature weights from the analyzer configuration.
    weights = self.config["indoor_outdoor_weights"]

    # Start neutral; each feature below adds signed evidence.
    indoor_score = 0
    feature_contributions = {}
    diagnostics = {}

    # 1. Blue area (sky) — a large blue fraction usually indicates outdoor.
    if features.get("blue_ratio", 0) > 0.2:
        # Strong indoor cues (ceiling, closed boundary, indoor lights) soften
        # the outdoor penalty: the blue may be walls/screens rather than sky.
        if (features.get("ceiling_uniformity", 0) > 0.5 or
            features.get("boundary_edge_score", 0) > 0.3 or
            features.get("indoor_light_score", 0) > 0.2 or
            features.get("bright_spot_count", 0) > 0):
            blue_score = -weights["blue_ratio"] * features["blue_ratio"] * 8
        else:
            blue_score = -weights["blue_ratio"] * features["blue_ratio"] * 15
    else:
        blue_score = -weights["blue_ratio"] * features["blue_ratio"] * 15

    indoor_score += blue_score
    feature_contributions["blue_ratio"] = blue_score
    # NOTE: an earlier revision recomputed `blue_score` a second time further
    # down (a `blue_ratio > 0.7` branch) without ever adding it to the score;
    # that dead code has been removed — behavior is unchanged.

    # Viewpoint check — blue sky in the upper region with a brighter top half
    # suggests an outdoor scene looking up at buildings.
    if (features.get("sky_blue_ratio", 0) > 0.01 and
        features["sky_brightness"] > features["avg_brightness"] * 1.1):
        viewpoint_outdoor_score = -1.8  # strong outdoor indicator
        indoor_score += viewpoint_outdoor_score
        feature_contributions["outdoor_viewpoint"] = viewpoint_outdoor_score

    # 2. Brightness uniformity — indoor lighting is usually more even.
    uniformity_score = weights["brightness_uniformity"] * features["brightness_uniformity"]
    indoor_score += uniformity_score
    feature_contributions["brightness_uniformity"] = uniformity_score

    # 3. Ceiling features — a uniform top region is a strong indoor cue.
    ceiling_contribution = 0
    if "ceiling_uniformity" in features:
        ceiling_uniformity = features["ceiling_uniformity"]
        horizontal_line_ratio = features.get("horizontal_line_ratio", 0)

        if ceiling_uniformity > 0.5:
            ceiling_weight = 3
            ceiling_contribution = weights.get("ceiling_features", 1.5) * ceiling_weight
            if horizontal_line_ratio > 0.2:  # horizontal lines reinforce a ceiling
                ceiling_contribution *= 1.5
        elif ceiling_uniformity > 0.4:
            ceiling_contribution = weights.get("ceiling_features", 1.5) * 1.2

    indoor_score += ceiling_contribution
    feature_contributions["ceiling_features"] = ceiling_contribution

    # 4. Light-fixture detection — even one circular light source (e.g. a
    # pendant lamp) is a strong indoor signal.
    light_contribution = 0
    if "indoor_light_score" in features:
        indoor_light_score = features["indoor_light_score"]
        circular_light_count = features.get("circular_light_count", 0)

        if circular_light_count >= 1:
            light_contribution = weights.get("light_features", 1.2) * 2.0
        elif indoor_light_score > 0.3:
            light_contribution = weights.get("light_features", 1.2) * 1.0

    indoor_score += light_contribution
    feature_contributions["light_features"] = light_contribution

    # 5. Enclosure — strong gradients along the image borders suggest a
    # closed (indoor) environment.
    boundary_contribution = 0
    if "boundary_edge_score" in features:
        boundary_edge_score = features["boundary_edge_score"]
        if boundary_edge_score > 0.3:
            boundary_contribution = weights.get("boundary_features", 1.2) * 2
        elif boundary_edge_score > 0.2:
            boundary_contribution = weights.get("boundary_features", 1.2) * 1.2

    indoor_score += boundary_contribution
    feature_contributions["boundary_features"] = boundary_contribution

    # Commercial-street pattern: dense edges + many bright spots + dominant
    # vertical structure points outdoors.
    if (features.get("edges_density", 0) > 0.2 and
        features.get("bright_spot_count", 0) > 5 and
        features.get("vertical_strength", 0) > features.get("horizontal_strength", 0) * 1.5):
        street_feature_score = -weights.get("street_features", 1.2) * 1.5
        indoor_score += street_feature_score
        feature_contributions["street_features"] = street_feature_score

    # Dedicated detector for Asian commercial streets (signage, lanterns):
    # very dense edges, strongly vertical structure, uneven brightness.
    if (features.get("edges_density", 0) > 0.25 and
        features.get("vertical_strength", 0) > features.get("horizontal_strength", 0) * 1.8 and
        features.get("brightness_uniformity", 0) < 0.6):
        asian_street_score = -2.2  # very strong outdoor indicator
        indoor_score += asian_street_score
        feature_contributions["asian_commercial_street"] = asian_street_score

    # 6. Vertical/horizontal gradient ratio — only meaningful combined with
    # uniformity (uniform + vertical => walls; non-uniform => outdoor facades).
    gradient_contribution = 0
    if features["gradient_ratio"] > 2.0:
        combined_uniformity = (features["brightness_uniformity"] +
                               features.get("ceiling_uniformity", 0)) / 2
        if combined_uniformity > 0.5:
            gradient_contribution = weights["gradient_ratio"] * 0.7
        else:
            gradient_contribution = -weights["gradient_ratio"] * 0.3

    indoor_score += gradient_contribution
    feature_contributions["gradient_ratio"] = gradient_contribution

    # 7. Bright-spot (light source) analysis.
    bright_spot_contribution = 0
    bright_spot_count = features["bright_spot_count"]
    circular_light_count = features.get("circular_light_count", 0)

    if circular_light_count >= 1:  # even a single circular source => indoor
        bright_spot_contribution = weights["bright_spots"] * 1.5
    elif bright_spot_count < 5:  # few bright spots leans mildly indoor
        bright_spot_contribution = weights["bright_spots"] * 0.5
    elif bright_spot_count > 15:  # many bright spots more likely outdoor
        bright_spot_contribution = -weights["bright_spots"] * 0.4

    indoor_score += bright_spot_contribution
    feature_contributions["bright_spots"] = bright_spot_contribution

    # 8. Color-tone analysis — dim, warm yellow/orange light leans indoor.
    yellow_contribution = 0
    if features["avg_brightness"] < 150 and features["yellow_orange_ratio"] > 0.15:
        if features.get("indoor_light_score", 0) > 0.2:
            yellow_contribution = weights["color_tone"] * 0.8
        else:
            yellow_contribution = weights["color_tone"] * 0.5

    indoor_score += yellow_contribution
    feature_contributions["yellow_tone"] = yellow_contribution

    # 9. Top/bottom brightness contrast — a much brighter upper half suggests
    # sky, especially when paired with a meaningful blue fraction.
    sky_contribution = 0
    if features["sky_brightness"] > features["avg_brightness"] * 1.3:
        if features["blue_ratio"] > 0.15:
            sky_contribution = -weights["sky_brightness"] * 0.9
        else:
            sky_contribution = -weights["sky_brightness"] * 0.6

    indoor_score += sky_contribution
    feature_contributions["sky_brightness"] = sky_contribution

    # Dining-room heuristic: a centrally hung fixture implies a ceiling,
    # which implies indoor.
    dining_feature_contribution = 0
    if circular_light_count >= 1 and features.get("light_distribution_uniformity", 0) > 0.4:
        dining_feature_contribution = 1.5
        indoor_score += dining_feature_contribution
        feature_contributions["dining_features"] = dining_feature_contribution

    # 10. Enhanced blue-sky detection — even a small patch of bright, upper
    # blue area is strong outdoor evidence.
    if "sky_blue_ratio" in features:
        if features["sky_blue_ratio"] > 0.01 and features["sky_brightness"] > features.get("avg_brightness", 0) * 1.2:
            sky_outdoor_score = -2.5 * features["sky_blue_ratio"] * weights.get("blue_ratio", 1.2)
            indoor_score += sky_outdoor_score
            feature_contributions["sky_blue_detection"] = sky_outdoor_score

    # Asian-street indicator tally (0-5); three or more flips the balance.
    asian_street_indicators = 0

    # (1) strongly vertical structure
    vertical_ratio = features.get("vertical_strength", 0) / max(features.get("horizontal_strength", 1e-5), 1e-5)
    if vertical_ratio > 1.8:
        asian_street_indicators += 1

    # (2) dense edges plus road-marking features (worth two points)
    if features.get("edges_density", 0) > 0.25 and features.get("street_line_score", 0) > 0.2:
        asian_street_indicators += 2

    # (3) many bright spots with uneven brightness
    if features.get("bright_spot_count", 0) > 5 and features.get("brightness_uniformity", 0) < 0.6:
        asian_street_indicators += 1

    # (4) little visible sky (occluded by tall buildings) yet a bright top
    if features.get("blue_ratio", 0) < 0.1 and features.get("sky_brightness", 0) > features.get("avg_brightness", 0) * 1.1:
        asian_street_indicators += 1

    if asian_street_indicators >= 3:
        # Record the detected pattern and tilt strongly toward outdoor.
        feature_contributions["asian_street_pattern"] = -2.5
        indoor_score += -2.5

        # Discount indoor cues that street scenes commonly fake.
        if "boundary_features" in feature_contributions:
            adjusted_contribution = feature_contributions["boundary_features"] * 0.4
            indoor_score -= (feature_contributions["boundary_features"] - adjusted_contribution)
            feature_contributions["boundary_features"] = adjusted_contribution

        if "ceiling_features" in feature_contributions:
            adjusted_contribution = feature_contributions["ceiling_features"] * 0.3
            indoor_score -= (feature_contributions["ceiling_features"] - adjusted_contribution)
            feature_contributions["ceiling_features"] = adjusted_contribution

        diagnostics["asian_street_detected"] = True
        diagnostics["asian_street_indicators"] = asian_street_indicators

    # Home-environment indicator tally (weighted, bedroom/living-room cues).
    bedroom_indicators = 0

    # (1) right angles formed by windows and walls
    if features.get("brightness_uniformity", 0) > 0.6 and features.get("boundary_edge_score", 0) > 0.3:
        bedroom_indicators += 1.5

    # (2) ceiling plus at least one light source
    if features.get("ceiling_uniformity", 0) > 0.5 and features.get("bright_spot_count", 0) > 0:
        bedroom_indicators += 2.5

    # (3) low-saturation, evenly lit walls (bedrooms and living rooms)
    if features.get("brightness_uniformity", 0) > 0.6 and features.get("avg_saturation", 0) < 100:
        bedroom_indicators += 1.5

    # (4) window detection: border edges plus high brightness variance
    if features.get("boundary_edge_score", 0) > 0.25 and features.get("brightness_std", 0) > 40:
        bedroom_indicators += 1.5

    # Enough home cues raise the indoor score (strongly at >=3, mildly at >=2).
    if bedroom_indicators >= 3:
        home_env_score = 3
        indoor_score += home_env_score
        feature_contributions["home_environment_pattern"] = home_env_score
    elif bedroom_indicators >= 2:
        home_env_score = 2
        indoor_score += home_env_score
        feature_contributions["home_environment_pattern"] = home_env_score

    # Convert the raw score to a probability with a sigmoid (0.22 temperature).
    indoor_probability = 1 / (1 + np.exp(-indoor_score * 0.22))

    is_indoor = indoor_probability > 0.5

    return {
        "is_indoor": is_indoor,
        "indoor_probability": indoor_probability,
        "indoor_score": indoor_score,
        "feature_contributions": feature_contributions,
        "diagnostics": diagnostics
    }
575
+
576
def _determine_lighting_conditions(self, features, is_indoor):
    """
    Determine the lighting condition label from features and the
    indoor/outdoor decision.

    Args:
        features: Feature dictionary (brightness/color statistics and
            structural cues).
        is_indoor: Whether the scene was classified as indoor.

    Returns:
        Dict with keys:
            time_of_day (str): lighting-condition label (e.g. "day_clear",
                "indoor_bright", "neon_night").
            confidence (float): clamped to [0.5, 0.95].
            diagnostics (dict): human-readable reason / special-case notes.
    """
    # Defaults used when no rule fires.
    time_of_day = "unknown"
    confidence = 0.5
    diagnostics = {}

    avg_brightness = features["avg_brightness"]
    dark_pixel_ratio = features["dark_pixel_ratio"]
    yellow_orange_ratio = features["yellow_orange_ratio"]
    blue_ratio = features["blue_ratio"]
    gray_ratio = features["gray_ratio"]

    if is_indoor:
        # --- Residential natural-light indicator (0-4) ---
        natural_window_light = 0

        # Window-like cue: blue area with a brighter upper region.
        if (features.get("blue_ratio", 0) > 0.1 and
            features.get("sky_brightness", 0) > avg_brightness * 1.1):
            natural_window_light += 1

        # Soft, even light distribution.
        if (features.get("brightness_uniformity", 0) > 0.65 and
            features.get("brightness_std", 0) < 70):
            natural_window_light += 1

        # Warm color proportion.
        if features.get("warm_ratio", 0) > 0.2:
            natural_window_light += 1

        # Home-environment indicator.
        # NOTE(review): "home_environment_pattern" is written into the
        # feature-contribution map by the indoor/outdoor analyzer, not into
        # `features`; unless the caller merges it back, this lookup is always
        # 0 — confirm against the calling code.
        home_env_score = features.get("home_environment_pattern", 0)
        if home_env_score > 1.5:
            natural_window_light += 1

        # 1. Bright indoor scene, possibly with natural window light.
        if avg_brightness > 130:
            if natural_window_light >= 2 and home_env_score > 1.5:
                time_of_day = "indoor_residential_natural"  # home with daylight
                confidence = 0.8
                diagnostics["reason"] = "Bright residential space with natural window lighting"
            # Bright bluish window region.
            elif features.get("blue_ratio", 0) > 0.1 and features.get("sky_brightness", 0) > 150:
                time_of_day = "indoor_bright"
                confidence = 0.8
                diagnostics["reason"] = "Bright indoor scene with window light"
            else:
                time_of_day = "indoor_bright"
                confidence = 0.75
                diagnostics["reason"] = "High brightness in indoor environment"
        # 2. Moderately lit indoor scene.
        elif avg_brightness > 100:
            time_of_day = "indoor_moderate"
            confidence = 0.7
            diagnostics["reason"] = "Moderate brightness in indoor environment"
        # 3. Low-light indoor scene.
        else:
            time_of_day = "indoor_dim"
            confidence = 0.65 + dark_pixel_ratio / 3
            diagnostics["reason"] = "Low brightness in indoor environment"

        # --- Special indoor cases (may override the classification above) ---

        # 1. Designer residential indicator (0-4).
        designer_residential_score = 0
        # Decorative fixtures.
        if (features.get("circular_light_count", 0) > 0 or features.get("bright_spot_count", 0) > 2):
            designer_residential_score += 1
        # High-quality, uniform illumination.
        if features.get("brightness_uniformity", 0) > 0.7:
            designer_residential_score += 1
        # Warm tones.
        if features.get("warm_ratio", 0) > 0.3:
            designer_residential_score += 1
        # Home-environment cues.
        if home_env_score > 1.5:
            designer_residential_score += 1

        if designer_residential_score >= 3 and home_env_score > 1.5:
            time_of_day = "indoor_designer_residential"
            confidence = 0.85
            diagnostics["special_case"] = "Designer residential lighting with decorative elements"

        # 2. Restaurant/bar: dim, warm yellow-orange lighting. If the inner
        # warmth check fails, the earlier classification is kept unchanged.
        elif avg_brightness < 150 and yellow_orange_ratio > 0.2:
            if features["warm_ratio"] > 0.4:
                time_of_day = "indoor_restaurant"
                confidence = 0.65 + yellow_orange_ratio / 4
                diagnostics["special_case"] = "Warm, yellow-orange lighting suggests restaurant/bar setting"

        # 3. Commercial space: bright with multiple structured light sources.
        elif avg_brightness > 120 and features["bright_spot_count"] > 4:
            commercial_score = 0
            # Many bright spots.
            commercial_score += min(1.0, features["bright_spot_count"] * 0.05)
            # Not likely a residence.
            if features.get("home_environment_pattern", 0) < 1.5:
                commercial_score += 0.5
            # Structured, even lighting layout.
            if features.get("light_distribution_uniformity", 0) > 0.6:
                commercial_score += 0.5

            if commercial_score > 0.6 and designer_residential_score < 3:
                time_of_day = "indoor_commercial"
                confidence = 0.7 + commercial_score / 5
                diagnostics["special_case"] = "Multiple structured light sources suggest commercial lighting"
    else:
        # --- Outdoor classification ---
        if avg_brightness < 90:  # lowered brightness threshold for night
            # Car lights / street lamps present?
            has_lights = features["bright_spot_count"] > 3

            if has_lights:
                time_of_day = "night"
                confidence = 0.8 + dark_pixel_ratio / 5
                diagnostics["reason"] = "Low brightness with light sources detected"

                # Neon scene: warm-toned and many bright spots.
                if yellow_orange_ratio > 0.15 and features["bright_spot_count"] > 5:
                    time_of_day = "neon_night"
                    confidence = 0.75 + yellow_orange_ratio / 3
                    diagnostics["special_case"] = "Multiple colorful light sources suggest neon lighting"
            else:
                time_of_day = "night"
                confidence = 0.7 + dark_pixel_ratio / 3
                diagnostics["reason"] = "Low brightness outdoor scene"
        elif avg_brightness < 130 and yellow_orange_ratio > 0.2:
            time_of_day = "sunset/sunrise"
            confidence = 0.7 + yellow_orange_ratio / 3
            diagnostics["reason"] = "Moderate brightness with yellow-orange tones"
        elif avg_brightness > 150 and blue_ratio > 0.15:
            time_of_day = "day_clear"
            confidence = 0.7 + blue_ratio / 3
            diagnostics["reason"] = "High brightness with blue tones (likely sky)"
        elif avg_brightness > 130:
            time_of_day = "day_cloudy"
            confidence = 0.7 + gray_ratio / 3
            diagnostics["reason"] = "Good brightness with higher gray tones"
        else:
            # Fallback: decide by dominant tone.
            if yellow_orange_ratio > gray_ratio:
                time_of_day = "sunset/sunrise"
                confidence = 0.6 + yellow_orange_ratio / 3
                diagnostics["reason"] = "Yellow-orange tones dominant"
            else:
                time_of_day = "day_cloudy"
                confidence = 0.6 + gray_ratio / 3
                diagnostics["reason"] = "Gray tones dominant"

        # Special outdoor case: bright, very uniform light => stadium lighting.
        if avg_brightness > 120 and features["brightness_uniformity"] > 0.8:
            time_of_day = "stadium_lighting"
            confidence = 0.7
            diagnostics["special_case"] = "Uniform bright lighting suggests stadium/sports lighting"

        # Mixed lighting: indoor/outdoor transition area.
        if 100 < avg_brightness < 150 and 0.1 < blue_ratio < 0.2:
            if features["gradient_ratio"] > 1.5:
                time_of_day = "mixed_lighting"
                confidence = 0.65
                diagnostics["special_case"] = "Features suggest indoor-outdoor transition area"

    # Clamp confidence to a sane range.
    confidence = min(0.95, max(0.5, confidence))

    # NOTE: an earlier revision built local `lightingType` description dicts
    # for the two residential types here, but never used or returned them;
    # that dead code was removed — behavior is unchanged. Display strings for
    # lighting conditions belong in the LIGHTING_CONDITIONS data module.

    return {
        "time_of_day": time_of_day,
        "confidence": confidence,
        "diagnostics": diagnostics
    }
789
+
790
+
791
def _get_default_config(self):
    """
    Return the default configuration for the lighting analyzer.

    Returns:
        Dict with:
            indoor_outdoor_weights (dict): per-feature weights used by the
                indoor/outdoor evidence fusion.
            include_diagnostics (bool): whether analysis results carry the
                diagnostics payload.
    """
    # Evidence weights for the indoor/outdoor classifier.
    indoor_outdoor_weights = {
        "blue_ratio": 0.6,
        "brightness_uniformity": 1.2,
        "gradient_ratio": 0.7,
        "bright_spots": 0.8,
        "color_tone": 0.5,
        "sky_brightness": 0.9,
        "brightness_variation": 0.7,
        "ceiling_features": 1.5,
        "light_features": 1.1,
        "boundary_features": 2.8,
        "street_features": 2.0,
        "building_features": 1.6,
    }
    return {
        "indoor_outdoor_weights": indoor_outdoor_weights,
        "include_diagnostics": True,
    }
lighting_conditions.py ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
# Descriptions for specialty lighting conditions. These previously sat at the
# TOP level of LIGHTING_CONDITIONS (outside "time_descriptions"), so lookups
# of the form LIGHTING_CONDITIONS["time_descriptions"][time_of_day] could not
# find them even though the analyzer emits these labels. They are now
# registered under "time_descriptions" as well; the legacy top-level copies
# are preserved for backward compatibility.
_SPECIAL_TIME_DESCRIPTIONS = {
    "indoor_commercial": {
        "general": "The scene is captured inside a commercial setting with retail-optimized lighting.",
        "bright": "The space is brightly illuminated with commercial display lighting to highlight merchandise.",
        "medium": "The commercial interior has standard retail lighting that balances visibility and ambiance.",
        "dim": "The commercial space has subdued lighting creating an upscale or intimate shopping atmosphere."
    },
    "indoor_restaurant": {
        "general": "The scene is captured inside a restaurant with characteristic dining lighting.",
        "bright": "The restaurant is well-lit with clear illumination emphasizing food presentation.",
        "medium": "The dining space has moderate lighting striking a balance between functionality and ambiance.",
        "dim": "The restaurant features soft, low lighting creating an intimate dining atmosphere."
    },
    "neon_night": {
        "general": "The scene is captured at night with colorful neon lighting typical of entertainment districts.",
        "bright": "The night scene is illuminated by vibrant neon signs creating a lively, colorful atmosphere.",
        "medium": "The evening setting features moderate neon lighting creating a characteristic urban nightlife scene.",
        "dim": "The night area has subtle neon accents against the darkness, creating a moody urban atmosphere."
    },
    "stadium_lighting": {
        "general": "The scene is captured under powerful stadium lights designed for spectator events.",
        "bright": "The venue is intensely illuminated by stadium floodlights creating daylight-like conditions.",
        "medium": "The sports facility has standard event lighting providing clear visibility across the venue.",
        "dim": "The stadium has reduced illumination typical of pre-event or post-event conditions."
    },
    "mixed_lighting": {
        "general": "The scene features a mix of indoor and outdoor lighting creating transitional illumination.",
        "bright": "The space blends bright natural and artificial light sources across indoor-outdoor boundaries.",
        "medium": "The area combines moderate indoor lighting with outdoor illumination in a balanced way.",
        "dim": "The transition space features subtle lighting gradients between indoor and outdoor zones."
    }
}

# Lighting-condition display data: per-condition scene descriptions (keyed by
# brightness level "general"/"bright"/"medium"/"dim"), short template
# modifiers, and activity adjectives.
LIGHTING_CONDITIONS = {
    "time_descriptions": {
        "day_clear": {
            "general": "The scene is captured during clear daylight hours with bright natural lighting.",
            "bright": "The scene is brightly lit with strong, clear daylight.",
            "medium": "The scene is illuminated with moderate daylight under clear conditions.",
            "dim": "The scene is captured in soft daylight on a clear day."
        },
        "day_cloudy": {
            "general": "The scene is captured during daytime under overcast conditions.",
            "bright": "The scene has the diffused bright lighting of an overcast day.",
            "medium": "The scene has even, soft lighting typical of a cloudy day.",
            "dim": "The scene has the muted lighting of a heavily overcast day."
        },
        "sunset/sunrise": {
            "general": "The scene is captured during golden hour with warm lighting.",
            "bright": "The scene is illuminated with bright golden hour light with long shadows.",
            "medium": "The scene has the warm orange-yellow glow typical of sunset or sunrise.",
            "dim": "The scene has soft, warm lighting characteristic of early sunrise or late sunset."
        },
        "night": {
            "general": "The scene is captured at night with limited natural lighting.",
            "bright": "The scene is captured at night but well-lit with artificial lighting.",
            "medium": "The scene is captured at night with moderate artificial lighting.",
            "dim": "The scene is captured in low-light night conditions with minimal illumination."
        },
        "indoor_bright": {
            "general": "The scene is captured indoors with ample lighting.",
            "bright": "The indoor space is brightly lit, possibly with natural light from windows.",
            "medium": "The indoor space has good lighting conditions.",
            "dim": "The indoor space has adequate lighting."
        },
        "indoor_moderate": {
            "general": "The scene is captured indoors with moderate lighting.",
            "bright": "The indoor space has comfortable, moderate lighting.",
            "medium": "The indoor space has standard interior lighting.",
            "dim": "The indoor space has somewhat subdued lighting."
        },
        "indoor_dim": {
            "general": "The scene is captured indoors with dim or mood lighting.",
            "bright": "The indoor space has dim but sufficient lighting.",
            "medium": "The indoor space has low, atmospheric lighting.",
            "dim": "The indoor space has very dim, possibly mood-oriented lighting."
        },
        # NOTE(review): the three keys below do not match the corresponding
        # "template_modifiers" keys ("beach_lighting", "sports_venue_lighting",
        # "professional_kitchen_lighting") — confirm which spelling the
        # consuming code uses before unifying.
        "beach_daylight": {
            "general": "The scene is captured during daytime at a beach with bright natural sunlight.",
            "bright": "The beach scene is intensely illuminated by direct sunlight.",
            "medium": "The coastal area has even natural daylight.",
            "dim": "The beach has softer lighting, possibly from a partially cloudy sky."
        },
        "sports_arena": {
            "general": "The scene is captured in a sports venue with specialized arena lighting.",
            "bright": "The sports facility is brightly illuminated with powerful overhead lights.",
            "medium": "The venue has standard sports event lighting providing clear visibility.",
            "dim": "The sports area has reduced illumination, possibly before or after an event."
        },
        "kitchen_working": {
            "general": "The scene is captured in a professional kitchen with task-oriented lighting.",
            "bright": "The kitchen is intensely illuminated with clear, functional lighting.",
            "medium": "The culinary space has standard working lights focused on preparation areas.",
            "dim": "The kitchen has reduced lighting, possibly during off-peak hours."
        },
        # Residential types produced by the lighting analyzer.
        "indoor_residential_natural": {
            "general": "The scene is captured in a residential space with ample natural light from windows.",
            "bright": "The residential space is brightly lit with natural daylight streaming through windows.",
            "medium": "The home environment has good natural lighting providing a warm, inviting atmosphere.",
            "dim": "The living space has soft natural light filtering through windows or openings."
        },
        "indoor_designer_residential": {
            "general": "The scene is captured in a residential space with carefully designed lighting elements.",
            "bright": "The home features professionally designed lighting with decorative fixtures creating a bright atmosphere.",
            "medium": "The residential interior showcases curated lighting design balancing form and function.",
            "dim": "The living space has thoughtfully placed designer lighting creating an intimate ambiance."
        },
        "unknown": {
            "general": "The lighting conditions in this scene are not easily determined."
        },
        # Specialty conditions shared with the legacy top-level entries.
        **_SPECIAL_TIME_DESCRIPTIONS
    },
    "template_modifiers": {
        "day_clear": "brightly-lit",
        "day_cloudy": "softly-lit",
        "sunset/sunrise": "warmly-lit",
        "night": "night-time",
        "indoor_bright": "well-lit indoor",
        "indoor_moderate": "indoor",
        "indoor_dim": "dimly-lit indoor",
        "indoor_commercial": "retail-lit",
        "indoor_restaurant": "atmospherically-lit",
        "indoor_residential_natural": "naturally-lit residential",
        "indoor_designer_residential": "designer-lit residential",
        "neon_night": "neon-illuminated",
        "stadium_lighting": "flood-lit",
        "mixed_lighting": "transitionally-lit",
        "beach_lighting": "sun-drenched",
        "sports_venue_lighting": "arena-lit",
        "professional_kitchen_lighting": "kitchen-task lit",
        "unknown": ""
    },
    "activity_modifiers": {
        "day_clear": ["active", "lively", "busy"],
        "day_cloudy": ["calm", "relaxed", "casual"],
        "sunset/sunrise": ["peaceful", "transitional", "atmospheric"],
        "night": ["quiet", "subdued", "nocturnal"],
        "indoor_bright": ["focused", "productive", "engaged"],
        "indoor_moderate": ["comfortable", "social", "casual"],
        "indoor_dim": ["intimate", "relaxed", "private"],
        "indoor_commercial": ["shopping", "browsing", "consumer-oriented"],
        "indoor_restaurant": ["dining", "social", "culinary"],
        "neon_night": ["vibrant", "energetic", "night-life"],
        "stadium_lighting": ["event-focused", "spectator-oriented", "performance-based"],
        "mixed_lighting": ["transitional", "adaptable", "variable"],
        "unknown": []
    },
    # Legacy top-level copies of the specialty descriptions (kept for
    # backward compatibility; prefer "time_descriptions").
    **_SPECIAL_TIME_DESCRIPTIONS
}
object_categories.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
# Mapping from semantic category name to the detector class IDs it covers.
# Contiguous ID runs are expressed as ranges for readability.
# NOTE(review): the integers look like 0-based COCO-80 class indices
# (e.g. 56 = chair, 62 = tv, 39 = bottle, 1 = bicycle) — confirm against the
# detection model's label map before relying on specific IDs.
OBJECT_CATEGORIES = {
    "furniture": list(range(56, 62)),       # IDs 56-61
    "electronics": list(range(62, 71)),     # IDs 62-70
    "kitchen_items": list(range(39, 46)),   # IDs 39-45
    "food": list(range(46, 56)),            # IDs 46-55
    "vehicles": list(range(1, 9)),          # IDs 1-8
    "personal_items": [24, 25, 26, 27, 28, 73, 78, 79],  # non-contiguous IDs
}
object_template_fillers.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ OBJECT_TEMPLATE_FILLERS = {
3
+ "furniture": ["designer chairs", "wooden dining table", "stylish seating", "upholstered armchairs", "elegant dining furniture"],
4
+ "design_elements": ["art pieces", "decorative wreaths", "statement lighting", "seasonal decorations", "sophisticated decor"],
5
+ "lighting": ["pendant lights", "decorative fixtures", "geometric lighting", "modern chandeliers", "ambient illumination"],
6
+ "table_setup": ["elegantly set table", "tabletop decorations", "seasonal centerpieces", "formal place settings", "floral arrangements"],
7
+ "seating": ["upholstered chairs", "accent armchairs", "mixed seating styles", "designer dining chairs", "comfortable dining seats"],
8
+ "table_description": ["solid wood table", "designer dining table", "expansive dining surface", "artisanal table", "statement dining table"],
9
+
10
+ "storefront_features": ["multi-story shops", "illuminated signs", "merchandise displays", "compact storefronts", "vertical retail spaces"],
11
+ "pedestrian_flow": ["people walking", "shoppers", "pedestrians", "locals and tourists", "urban foot traffic"],
12
+ "asian_elements": ["Asian language signage", "decorative lanterns", "local storefronts", "character-based text", "regional design elements"],
13
+ "cultural_elements": ["red lanterns", "local typography", "distinctive architecture", "cultural symbols", "traditional decorations"],
14
+ "signage": ["bright store signs", "multilingual text", "vertical signboards", "neon displays", "electronic advertisements"],
15
+ "street_activities": ["shopping", "commuting", "socializing", "vendor transactions", "urban navigation"],
16
+
17
+ "buildings": ["high-rise office buildings", "corporate towers", "skyscrapers", "financial institutions", "commercial headquarters"],
18
+ "traffic_elements": ["vehicle lights", "trams/street cars", "lane markers", "traffic signals", "urban transit"],
19
+ "skyscrapers": ["glass and steel buildings", "tall structures", "modern architecture", "office towers", "urban high-rises"],
20
+ "road_features": ["wide avenues", "tram tracks", "traffic lanes", "median dividers", "urban throughways"],
21
+ "architectural_elements": ["contemporary buildings", "urban design", "varied architectural styles", "corporate architecture", "city planning features"],
22
+ "city_landmarks": ["distant bridge", "skyline features", "iconic structures", "urban monuments", "signature buildings"],
23
+
24
+ "crossing_pattern": ["zebra crosswalks", "pedestrian walkways", "crosswalk markings", "intersection design", "safety stripes"],
25
+ "pedestrian_density": ["groups of people", "commuters", "diverse pedestrians", "urban crowds", "varying foot traffic"],
26
+ "pedestrian_behavior": ["walking in different directions", "crossing together", "waiting for signals", "navigating intersections", "following traffic rules"],
27
+ "traffic_pattern": ["four-way intersection", "crossroad", "junction", "multi-directional traffic", "regulated crossing"],
28
+ "pedestrian_flow": ["people crossing", "directional movement", "coordinated crossing", "timed pedestrian traffic", "intersection navigation"],
29
+
30
+ "transit_vehicles": ["buses", "trams", "trains", "taxis", "shuttles"],
31
+ "passenger_activity": ["boarding", "waiting", "exiting vehicles", "checking schedules", "navigating stations"],
32
+ "transportation_modes": ["public transit", "private vehicles", "ride services", "light rail", "bus systems"],
33
+ "passenger_needs": ["waiting areas", "information displays", "ticketing services", "transit connections", "seating"],
34
+ "transit_infrastructure": ["stations", "platforms", "boarding areas", "transit lanes", "signaling systems"],
35
+ "passenger_movement": ["transfers", "entrances and exits", "queueing", "platform access", "terminal navigation"],
36
+
37
+ "retail_elements": ["storefronts", "display windows", "shopping bags", "merchandise", "retail signage"],
38
+ "shopping_activity": ["browsing", "carrying purchases", "window shopping", "social shopping", "consumer activities"],
39
+ "store_types": ["boutiques", "brand stores", "local shops", "chain retailers", "specialty stores"],
40
+ "walkway_features": ["pedestrian paths", "shopping promenades", "retail corridors", "commercial walkways", "shopping streets"],
41
+ "commercial_signage": ["brand logos", "sale announcements", "store names", "advertising displays", "digital signage"],
42
+ "consumer_behavior": ["shopping in groups", "individual browsing", "carrying bags", "examining products", "moving between stores"],
43
+
44
+ "beach_equipment": ["beach umbrellas", "surfboards", "beach towels", "sun protection", "recreational equipment"],
45
+ "water_activities": ["water sports", "surfing", "beach recreation", "sun bathing", "coastal leisure"],
46
+ "sports_equipment": ["game balls", "professional equipment", "athletic gear", "sports apparatus", "competition items"],
47
+ "competitive_activities": ["team sports", "athletic contests", "competitive games", "sporting events", "professional matches"],
48
+ "kitchen_equipment": ["professional appliances", "cooking stations", "preparation surfaces", "culinary tools", "industrial equipment"],
49
+ "food_preparation": ["meal production", "culinary operations", "food service preparation", "commercial cooking", "kitchen workflow"],
50
+
51
+ "crossing_pattern": ["grid-like pedestrian crossings", "multi-directional crosswalks", "cross-shaped intersection design", "perpendicular crossing lanes", "zebra-striped crosswalks viewed from above"],
52
+ "pedestrian_pattern": ["scattered distribution of people", "organized flow of pedestrians", "clustered gatherings", "radial movement patterns", "linear procession of individuals"],
53
+ "commercial_layout": ["parallel shopping streets", "interconnected shopping blocks", "radial marketplace design", "grid-like retail arrangement", "meandering commercial pathways"],
54
+ "movement_pattern": ["circular crowd motion", "directional pedestrian flow", "scattered individual movement", "converging foot traffic", "diverging pedestrian patterns"],
55
+
56
+ "stall_elements": ["food vendors with steaming woks", "trinket sellers with colorful displays", "lantern-lit stalls", "bamboo-framed shops", "canvas-covered market stands"],
57
+ "asian_elements": ["hanging red lanterns", "character-based signage", "ornate temple decorations", "traditional paper decorations", "stylized gateway arches"],
58
+ "cultural_lighting": ["paper lantern illumination", "neon character signs", "strung festival lights", "hanging light chains", "colorful shop front lighting"],
59
+ "architectural_elements": ["tiered pagoda roofs", "ornate dragon sculptures", "stone guardian statues", "intricately carved railings", "traditional wooden beams"],
60
+ "cultural_symbols": ["dharma wheels", "lotus motifs", "yin-yang symbols", "zodiac animal representations", "traditional calligraphy"],
61
+ "architectural_style": ["Baroque facades", "Gothic spires", "Renaissance colonnades", "Neoclassical pediments", "Medieval archways"],
62
+ "european_features": ["cobblestone paving", "ornate fountains", "bronze statuary", "wrought iron lampposts", "cafe terraces"],
63
+
64
+ "lighting_effects": ["streetlamp pools of light", "neon sign glow", "illuminated window squares", "headlight streams", "traffic signal flashes"],
65
+ "illuminated_elements": ["lit storefront windows", "glowing traffic signals", "illuminated advertising", "headlight-lit streets", "backlit silhouettes"],
66
+ "neon_elements": ["colorful shop signs", "animated light displays", "illuminated brand logos", "glowing storefront outlines", "digital advertising screens"],
67
+ "illuminated_signage": ["bright LED displays", "glowing brand names", "projected light advertisements", "illuminated menu boards", "digital information screens"],
68
+ "colorful_lighting": ["multi-colored neon", "warm ambient illumination", "cool blue accent lights", "festive string lighting", "dynamic color-changing displays"],
69
+
70
+ "transitional_elements": ["retractable glass walls", "indoor-outdoor bar counters", "terraced seating areas", "threshold planters", "partial canopy coverage"],
71
+ "indoor_features": ["climate-controlled spaces", "soft seating arrangements", "interior decor accents", "mood lighting fixtures", "sound-dampened areas"],
72
+ "outdoor_setting": ["sidewalk tables", "patio seating", "garden furniture", "open-air counters", "courtyard arrangements"],
73
+ "seating_arrangement": ["tiered spectator stands", "premium viewing boxes", "courtside seating", "general admission benches", "stadium chair rows"],
74
+ "playing_surface": ["marked court boundaries", "manicured field turf", "running tracks", "competition equipment", "sports field markers"],
75
+ "construction_equipment": ["tower cranes", "excavators", "cement mixers", "scaffolding structures", "construction barriers"],
76
+ "medical_elements": ["examination furniture", "monitoring equipment", "sanitation stations", "privacy screens", "medical supply carts"],
77
+ "educational_furniture": ["student desks", "lecture podiums", "laboratory benches", "learning stations", "collaborative workspace tables"]
78
+ }
requirements.txt CHANGED
@@ -6,3 +6,4 @@ pillow>=9.4.0
6
  numpy>=1.23.5
7
  matplotlib>=3.7.0
8
  gradio>=3.32.0
 
 
6
  numpy>=1.23.5
7
  matplotlib>=3.7.0
8
  gradio>=3.32.0
9
+ git+https://github.com/openai/CLIP.git
room_02.jpg ADDED

Git LFS Details

  • SHA256: 1171134f1f68356aaa0639c029e1d9f2072452178b3ae714f269f969fb4e587e
  • Pointer size: 132 Bytes
  • Size of remote file: 2.36 MB
safety_templates.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ SAFETY_TEMPLATES = {
2
+ "general": "Pay attention to {safety_element}.",
3
+ "warning": "Be cautious of {hazard} in this environment.",
4
+ "notice": "Note the presence of {element_of_interest}."
5
+ }
scene_analyzer.py ADDED
@@ -0,0 +1,408 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import numpy as np
3
+ from typing import Dict, List, Tuple, Any, Optional
4
+
5
+ from spatial_analyzer import SpatialAnalyzer
6
+ from scene_description import SceneDescriptor
7
+ from enhance_scene_describer import EnhancedSceneDescriber
8
+ from clip_analyzer import CLIPAnalyzer
9
+ from scene_type import SCENE_TYPES
10
+ from object_categories import OBJECT_CATEGORIES
11
+
12
+ class SceneAnalyzer:
13
+ """
14
+ Core class for scene analysis and understanding based on object detection results.
15
+ Analyzes detected objects, their relationships, and infers the scene type.
16
+ """
17
+ def __init__(self, class_names: Dict[int, str] = None):
18
+ """
19
+ Initialize the scene analyzer with optional class name mappings.
20
+
21
+ Args:
22
+ class_names: Dictionary mapping class IDs to class names (optional)
23
+ """
24
+ self.class_names = class_names
25
+
26
+ # 加載場景類型和物體類別
27
+ self.SCENE_TYPES = SCENE_TYPES
28
+ self.OBJECT_CATEGORIES = OBJECT_CATEGORIES
29
+
30
+ # 初始化其他組件,將數據傳遞給 SceneDescriptor
31
+ self.spatial_analyzer = SpatialAnalyzer(class_names=class_names, object_categories=self.OBJECT_CATEGORIES)
32
+ self.descriptor = SceneDescriptor(scene_types=self.SCENE_TYPES, object_categories=self.OBJECT_CATEGORIES)
33
+ self.scene_describer = EnhancedSceneDescriber(scene_types=self.SCENE_TYPES)
34
+
35
+ # 初始化 CLIP 分析器(新增)
36
+ try:
37
+ self.clip_analyzer = CLIPAnalyzer()
38
+ self.use_clip = True
39
+ except Exception as e:
40
+ print(f"Warning: Could not initialize CLIP analyzer: {e}")
41
+ print("Scene analysis will proceed without CLIP. Install CLIP with 'pip install clip' for enhanced scene understanding.")
42
+ self.use_clip = False
43
+
44
+ def generate_scene_description(self,
45
+ scene_type,
46
+ detected_objects,
47
+ confidence,
48
+ lighting_info=None,
49
+ functional_zones=None):
50
+ """
51
+ 生成場景描述。
52
+
53
+ Args:
54
+ scene_type: 識別的場景類型
55
+ detected_objects: 檢測到的物體列表
56
+ confidence: 場景分類置信度
57
+ lighting_info: 照明條件信息(可選)
58
+ functional_zones: 功能區域信息(可選)
59
+
60
+ Returns:
61
+ str: 生成的場景描述
62
+ """
63
+ return self.scene_describer.generate_description(
64
+ scene_type,
65
+ detected_objects,
66
+ confidence,
67
+ lighting_info,
68
+ functional_zones
69
+ )
70
+
71
+ def _generate_scene_description(self, scene_type, detected_objects, confidence, lighting_info=None):
72
+ """
73
+ Use new implement
74
+ """
75
+ # 獲取功能區域信息(如果需要的話)
76
+ functional_zones = self.spatial_analyzer._identify_functional_zones(detected_objects, scene_type)
77
+
78
+ # 使用增強的場景描述生成器
79
+ return self.generate_scene_description(
80
+ scene_type,
81
+ detected_objects,
82
+ confidence,
83
+ lighting_info,
84
+ functional_zones
85
+ )
86
+
87
+ def _define_image_regions(self):
88
+ """Define regions of the image for spatial analysis (3x3 grid)"""
89
+ self.regions = {
90
+ "top_left": (0, 0, 1/3, 1/3),
91
+ "top_center": (1/3, 0, 2/3, 1/3),
92
+ "top_right": (2/3, 0, 1, 1/3),
93
+ "middle_left": (0, 1/3, 1/3, 2/3),
94
+ "middle_center": (1/3, 1/3, 2/3, 2/3),
95
+ "middle_right": (2/3, 1/3, 1, 2/3),
96
+ "bottom_left": (0, 2/3, 1/3, 1),
97
+ "bottom_center": (1/3, 2/3, 2/3, 1),
98
+ "bottom_right": (2/3, 2/3, 1, 1)
99
+ }
100
+
101
+
102
+ def analyze(self, detection_result: Any, lighting_info: Optional[Dict] = None, class_confidence_threshold: float = 0.35, scene_confidence_threshold: float = 0.6) -> Dict:
103
+ """
104
+ Analyze detection results to determine scene type and provide understanding.
105
+
106
+ Args:
107
+ detection_result: Detection result from YOLOv8
108
+ lighting_info: Optional lighting condition analysis results
109
+ class_confidence_threshold: Minimum confidence to consider an object
110
+ scene_confidence_threshold: Minimum confidence to determine a scene
111
+
112
+ Returns:
113
+ Dictionary with scene analysis results
114
+ """
115
+ # If no result or no detections, return empty analysis
116
+ if detection_result is None or len(detection_result.boxes) == 0:
117
+ return {
118
+ "scene_type": "unknown",
119
+ "confidence": 0,
120
+ "description": "No objects detected in the image.",
121
+ "objects_present": [],
122
+ "object_count": 0,
123
+ "regions": {},
124
+ "possible_activities": [],
125
+ "safety_concerns": [],
126
+ "lighting_conditions": lighting_info or {"time_of_day": "unknown", "confidence": 0.0}
127
+ }
128
+
129
+ # Get class names from detection result if not already set
130
+ if self.class_names is None:
131
+ self.class_names = detection_result.names
132
+ # Also update class names in spatial analyzer
133
+ self.spatial_analyzer.class_names = self.class_names
134
+
135
+ # Extract detected objects with confidence above threshold
136
+ detected_objects = self.spatial_analyzer._extract_detected_objects(
137
+ detection_result,
138
+ confidence_threshold=class_confidence_threshold
139
+ )
140
+
141
+ # No objects above confidence threshold
142
+ if not detected_objects:
143
+ return {
144
+ "scene_type": "unknown",
145
+ "confidence": 0.0,
146
+ "description": "No objects with sufficient confidence detected.",
147
+ "objects_present": [],
148
+ "object_count": 0,
149
+ "regions": {},
150
+ "possible_activities": [],
151
+ "safety_concerns": [],
152
+ "lighting_conditions": lighting_info or {"time_of_day": "unknown", "confidence": 0.0}
153
+ }
154
+
155
+ # Analyze object distribution in regions
156
+ region_analysis = self.spatial_analyzer._analyze_regions(detected_objects)
157
+
158
+ # Compute scene type scores based on object detection
159
+ yolo_scene_scores = self._compute_scene_scores(detected_objects)
160
+
161
+ # 使用 CLIP 分析圖像
162
+ clip_scene_scores = {}
163
+ clip_analysis = None
164
+ if self.use_clip:
165
+ try:
166
+ # 獲取原始圖像
167
+ original_image = detection_result.orig_img
168
+
169
+ # Use CLIP analyze image
170
+ clip_analysis = self.clip_analyzer.analyze_image(original_image)
171
+
172
+ # get CLIP's score
173
+ clip_scene_scores = clip_analysis.get("scene_scores", {})
174
+
175
+ if "asian_commercial_street" in clip_scene_scores and clip_scene_scores["asian_commercial_street"] > 0.2:
176
+ # 使用對比提示進一步區分室內/室外
177
+ comparative_results = self.clip_analyzer.calculate_similarity(
178
+ original_image,
179
+ self.clip_analyzer.comparative_prompts["indoor_vs_outdoor"]
180
+ )
181
+
182
+ # 分析對比結果
183
+ indoor_score = sum(s for p, s in comparative_results.items() if "indoor" in p or "enclosed" in p)
184
+ outdoor_score = sum(s for p, s in comparative_results.items() if "outdoor" in p or "open-air" in p)
185
+
186
+ # 如果 CLIP 認為這是室外場景,且光照分析認為是室內
187
+ if outdoor_score > indoor_score and lighting_info and lighting_info.get("is_indoor", False):
188
+ # 修正光照分析結果
189
+ print(f"CLIP indicates outdoor commercial street (score: {outdoor_score:.2f} vs {indoor_score:.2f}), adjusting lighting analysis")
190
+ lighting_info["is_indoor"] = False
191
+ lighting_info["indoor_probability"] = 0.3
192
+ # 把CLIP 分析結果加到光照診斷
193
+ if "diagnostics" not in lighting_info:
194
+ lighting_info["diagnostics"] = {}
195
+ lighting_info["diagnostics"]["clip_override"] = {
196
+ "reason": "CLIP detected outdoor commercial street",
197
+ "outdoor_score": float(outdoor_score),
198
+ "indoor_score": float(indoor_score)
199
+ }
200
+
201
+ # 如果 CLIP 檢測到了光照條件但沒有提供 lighting_info
202
+ if not lighting_info and "lighting_condition" in clip_analysis:
203
+ lighting_type, lighting_conf = clip_analysis["lighting_condition"]
204
+ lighting_info = {
205
+ "time_of_day": lighting_type,
206
+ "confidence": lighting_conf
207
+ }
208
+ except Exception as e:
209
+ print(f"Error in CLIP analysis: {e}")
210
+
211
+ # 融合 YOLO 和 CLIP 的場景分數
212
+ scene_scores = self._fuse_scene_scores(yolo_scene_scores, clip_scene_scores)
213
+
214
+ # Determine best matching scene type
215
+ best_scene, scene_confidence = self._determine_scene_type(scene_scores)
216
+
217
+ # Generate possible activities based on scene
218
+ activities = self.descriptor._infer_possible_activities(best_scene, detected_objects)
219
+
220
+ # Identify potential safety concerns
221
+ safety_concerns = self.descriptor._identify_safety_concerns(detected_objects, best_scene)
222
+
223
+ # Calculate functional zones
224
+ functional_zones = self.spatial_analyzer._identify_functional_zones(detected_objects, best_scene)
225
+
226
+ # Generate scene description
227
+ scene_description = self.generate_scene_description(
228
+ best_scene,
229
+ detected_objects,
230
+ scene_confidence,
231
+ lighting_info=lighting_info,
232
+ functional_zones=functional_zones
233
+ )
234
+
235
+ # Return comprehensive analysis
236
+ result = {
237
+ "scene_type": best_scene if scene_confidence >= scene_confidence_threshold else "unknown",
238
+ "scene_name": self.SCENE_TYPES.get(best_scene, {}).get("name", "Unknown")
239
+ if scene_confidence >= scene_confidence_threshold else "Unknown Scene",
240
+ "confidence": scene_confidence,
241
+ "description": scene_description,
242
+ "objects_present": [
243
+ {"class_id": obj["class_id"],
244
+ "class_name": obj["class_name"],
245
+ "confidence": obj["confidence"]}
246
+ for obj in detected_objects
247
+ ],
248
+ "object_count": len(detected_objects),
249
+ "regions": region_analysis,
250
+ "possible_activities": activities,
251
+ "safety_concerns": safety_concerns,
252
+ "functional_zones": functional_zones,
253
+ "alternative_scenes": self.descriptor._get_alternative_scenes(scene_scores, scene_confidence_threshold, top_k=2),
254
+ "lighting_conditions": lighting_info or {"time_of_day": "unknown", "confidence": 0}
255
+ }
256
+
257
+ # 添加 CLIP 特定的結果(新增)
258
+ if clip_analysis and "error" not in clip_analysis:
259
+ result["clip_analysis"] = {
260
+ "top_scene": clip_analysis.get("top_scene", ("unknown", 0.0)),
261
+ "cultural_analysis": clip_analysis.get("cultural_analysis", {})
262
+ }
263
+
264
+ return result
265
+
266
+ def _compute_scene_scores(self, detected_objects: List[Dict]) -> Dict[str, float]:
267
+ """
268
+ Compute confidence scores for each scene type based on detected objects.
269
+
270
+ Args:
271
+ detected_objects: List of detected objects
272
+
273
+ Returns:
274
+ Dictionary mapping scene types to confidence scores
275
+ """
276
+ scene_scores = {}
277
+ detected_class_ids = [obj["class_id"] for obj in detected_objects]
278
+ detected_classes_set = set(detected_class_ids)
279
+
280
+ # Count occurrence of each class
281
+ class_counts = {}
282
+ for obj in detected_objects:
283
+ class_id = obj["class_id"]
284
+ if class_id not in class_counts:
285
+ class_counts[class_id] = 0
286
+ class_counts[class_id] += 1
287
+
288
+ # Evaluate each scene type
289
+ for scene_type, scene_def in self.SCENE_TYPES.items():
290
+ # Count required objects present
291
+ required_objects = set(scene_def["required_objects"])
292
+ required_present = required_objects.intersection(detected_classes_set)
293
+
294
+ # Count optional objects present
295
+ optional_objects = set(scene_def["optional_objects"])
296
+ optional_present = optional_objects.intersection(detected_classes_set)
297
+
298
+ # Skip if minimum required objects aren't present
299
+ if len(required_present) < scene_def["minimum_required"]:
300
+ scene_scores[scene_type] = 0
301
+ continue
302
+
303
+ # Base score from required objects
304
+ required_ratio = len(required_present) / max(1, len(required_objects))
305
+ required_score = required_ratio * 0.7 # 70% of score from required objects
306
+
307
+ # Additional score from optional objects
308
+ optional_ratio = len(optional_present) / max(1, len(optional_objects))
309
+ optional_score = optional_ratio * 0.3 # 30% of score from optional objects
310
+
311
+ # Bonus for having multiple instances of key objects
312
+ multiple_bonus = 0.0
313
+ for class_id in required_present:
314
+ if class_counts.get(class_id, 0) > 1:
315
+ multiple_bonus += 0.05 # 5% bonus per additional key object type
316
+
317
+ # Cap the bonus at 15%
318
+ multiple_bonus = min(0.15, multiple_bonus)
319
+
320
+ # Calculate final score
321
+ final_score = required_score + optional_score + multiple_bonus
322
+
323
+ if "priority" in scene_def:
324
+ final_score *= scene_def["priority"]
325
+
326
+ # Normalize to 0-1 range
327
+ scene_scores[scene_type] = min(1.0, final_score)
328
+
329
+ return scene_scores
330
+
331
+ def _determine_scene_type(self, scene_scores: Dict[str, float]) -> Tuple[str, float]:
332
+ """
333
+ Determine the most likely scene type based on scores.
334
+
335
+ Args:
336
+ scene_scores: Dictionary mapping scene types to confidence scores
337
+
338
+ Returns:
339
+ Tuple of (best_scene_type, confidence)
340
+ """
341
+ if not scene_scores:
342
+ return "unknown", 0
343
+
344
+ # Find scene with highest score
345
+ best_scene = max(scene_scores, key=scene_scores.get)
346
+ best_score = scene_scores[best_scene]
347
+
348
+ return best_scene, best_score
349
+
350
+
351
+ def _fuse_scene_scores(self, yolo_scene_scores: Dict[str, float], clip_scene_scores: Dict[str, float]) -> Dict[str, float]:
352
+ """
353
+ 融合基於 YOLO 物體檢測和 CLIP 分析的場景分數。
354
+
355
+ Args:
356
+ yolo_scene_scores: 基於 YOLO 物體檢測的場景分數
357
+ clip_scene_scores: 基於 CLIP 分析的場景分數
358
+
359
+ Returns:
360
+ Dict: 融合後的場景分數
361
+ """
362
+ # 如果沒有 CLIP 分數,直接返回 YOLO 分數
363
+ if not clip_scene_scores:
364
+ return yolo_scene_scores
365
+
366
+ # 如果沒有 YOLO 分數,直接返回 CLIP 分數
367
+ if not yolo_scene_scores:
368
+ return clip_scene_scores
369
+
370
+ # 融合分數
371
+ fused_scores = {}
372
+
373
+ # 獲取所有場景類型
374
+ all_scene_types = set(list(yolo_scene_scores.keys()) + list(clip_scene_scores.keys()))
375
+
376
+ for scene_type in all_scene_types:
377
+ # 獲取兩個模型的分數
378
+ yolo_score = yolo_scene_scores.get(scene_type, 0.0)
379
+ clip_score = clip_scene_scores.get(scene_type, 0.0)
380
+
381
+ # 設置基本權重
382
+ yolo_weight = 0.7 # YOLO 提供更詳細的物體資訊
383
+ clip_weight = 0.3 # CLIP 提供更好的整體場景理解
384
+
385
+ # 對特定類型場景調整權重
386
+ # 文化特定場景或具有特殊布局的場景,CLIP 可能有優勢
387
+ if any(keyword in scene_type for keyword in ["asian", "cultural", "aerial"]):
388
+ yolo_weight = 0.3
389
+ clip_weight = 0.7
390
+
391
+ # 對室內家居場景,物體檢測通常更準確
392
+ elif any(keyword in scene_type for keyword in ["room", "kitchen", "office", "bedroom"]):
393
+ yolo_weight = 0.8
394
+ clip_weight = 0.2
395
+ elif scene_type == "beach_water_recreation":
396
+ yolo_weight = 0.8 # 衝浪板等特定物品的檢測非常重要
397
+ clip_weight = 0.2
398
+ elif scene_type == "sports_venue":
399
+ yolo_weight = 0.7
400
+ clip_weight = 0.3
401
+ elif scene_type == "professional_kitchen":
402
+ yolo_weight = 0.8 # 廚房用具的檢測非常重要
403
+ clip_weight = 0.2
404
+
405
+ # 計算加權分數
406
+ fused_scores[scene_type] = (yolo_score * yolo_weight) + (clip_score * clip_weight)
407
+
408
+ return fused_scores
scene_description.py ADDED
@@ -0,0 +1,401 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ from typing import Dict, List, Tuple, Any, Optional
4
+
5
+ from scene_type import SCENE_TYPES
6
+ from scene_detail_templates import SCENE_DETAIL_TEMPLATES
7
+ from object_template_fillers import OBJECT_TEMPLATE_FILLERS
8
+ from activity_templates import ACTIVITY_TEMPLATES
9
+ from safety_templates import SAFETY_TEMPLATES
10
+ from confifence_templates import CONFIDENCE_TEMPLATES
11
+
12
+ class SceneDescriptor:
13
+ """
14
+ Generates natural language descriptions of scenes.
15
+ Handles scene descriptions, activity inference, and safety concerns identification.
16
+ """
17
+
18
+ def __init__(self, scene_types=None, object_categories=None):
19
+ """
20
+ Initialize the scene descriptor
21
+
22
+ Args:
23
+ scene_types: Dictionary of scene type definitions
24
+ """
25
+ self.scene_types = scene_types or {}
26
+ self.SCENE_TYPES = scene_types or {}
27
+
28
+ if object_categories:
29
+ self.OBJECT_CATEGORIES = object_categories
30
+ else:
31
+ # 從 JSON 加載或使用默認值
32
+ self.OBJECT_CATEGORIES = self._load_json_data("object_categories") or {
33
+ "furniture": [56, 57, 58, 59, 60, 61],
34
+ "electronics": [62, 63, 64, 65, 66, 67, 68, 69, 70],
35
+ "kitchen_items": [39, 40, 41, 42, 43, 44, 45],
36
+ "food": [46, 47, 48, 49, 50, 51, 52, 53, 54, 55],
37
+ "vehicles": [1, 2, 3, 4, 5, 6, 7, 8],
38
+ "personal_items": [24, 25, 26, 27, 28, 73, 78, 79]
39
+ }
40
+
41
+ # 加載所有模板數據
42
+ self._load_templates()
43
+
44
+ def _load_templates(self):
45
+ """Load all template data from script or fallback to imported defaults"""
46
+ self.confidence_templates = CONFIDENCE_TEMPLATES
47
+ self.scene_detail_templates = SCENE_DETAIL_TEMPLATES
48
+ self.object_template_fillers = OBJECT_TEMPLATE_FILLERS
49
+ self.safety_templates = SAFETY_TEMPLATES
50
+ self.activity_templates = ACTIVITY_TEMPLATES
51
+
52
+
53
+ def _initialize_fallback_templates(self):
54
+ """Initialize fallback templates when no external data is available"""
55
+ # 只在無法從文件或導入加載時使用
56
+ self.confidence_templates = {
57
+ "high": "{description} {details}",
58
+ "medium": "This appears to be {description} {details}",
59
+ "low": "This might be {description}, but the confidence is low. {details}"
60
+ }
61
+
62
+ # 僅提供最基本的模板作為後備
63
+ self.scene_detail_templates = {
64
+ "default": ["A space with various objects."]
65
+ }
66
+
67
+ self.object_template_fillers = {
68
+ "default": ["various items"]
69
+ }
70
+
71
+ self.safety_templates = {
72
+ "general": "Pay attention to {safety_element}."
73
+ }
74
+
75
+ self.activity_templates = {
76
+ "default": ["General activity"]
77
+ }
78
+
79
+ def _get_alternative_scenes(self, scene_scores: Dict[str, float],
80
+ threshold: float, top_k: int = 2) -> List[Dict]:
81
+ """
82
+ Get alternative scene interpretations with their scores.
83
+
84
+ Args:
85
+ scene_scores: Dictionary of scene type scores
86
+ threshold: Minimum confidence threshold
87
+ top_k: Number of alternatives to return
88
+
89
+ Returns:
90
+ List of dictionaries with alternative scenes
91
+ """
92
+ # Sort scenes by score in descending order
93
+ sorted_scenes = sorted(scene_scores.items(), key=lambda x: x[1], reverse=True)
94
+
95
+ # Skip the first one (best match) and take the next top_k
96
+ alternatives = []
97
+ for scene_type, score in sorted_scenes[1:1+top_k]:
98
+ if score >= threshold:
99
+ alternatives.append({
100
+ "type": scene_type,
101
+ "name": self.SCENE_TYPES.get(scene_type, {}).get("name", "Unknown"),
102
+ "confidence": score
103
+ })
104
+
105
+ return alternatives
106
+
107
+
108
+ def _infer_possible_activities(self, scene_type: str, detected_objects: List[Dict]) -> List[str]:
109
+ """
110
+ Infer possible activities based on scene type and detected objects.
111
+
112
+ Args:
113
+ scene_type: Identified scene type
114
+ detected_objects: List of detected objects
115
+
116
+ Returns:
117
+ List of possible activities
118
+ """
119
+ activities = []
120
+
121
+ if scene_type.startswith("aerial_view_"):
122
+ if scene_type == "aerial_view_intersection":
123
+ # 使用預定義的十字路口活動
124
+ activities.extend(self.activity_templates.get("aerial_view_intersection", []))
125
+
126
+ # 添加與行人和車輛相關的特定活動
127
+ pedestrians = [obj for obj in detected_objects if obj["class_id"] == 0]
128
+ vehicles = [obj for obj in detected_objects if obj["class_id"] in [2, 5, 7]] # Car, bus, truck
129
+
130
+ if pedestrians and vehicles:
131
+ activities.append("Waiting for an opportunity to cross the street")
132
+ activities.append("Obeying traffic signals")
133
+
134
+ elif scene_type == "aerial_view_commercial_area":
135
+ activities.extend(self.activity_templates.get("aerial_view_commercial_area", []))
136
+
137
+ elif scene_type == "aerial_view_plaza":
138
+ activities.extend(self.activity_templates.get("aerial_view_plaza", []))
139
+
140
+ else:
141
+ # 處理其他未明確定義的空中視角場景
142
+ aerial_activities = [
143
+ "Street crossing",
144
+ "Waiting for signals",
145
+ "Following traffic rules",
146
+ "Pedestrian movement"
147
+ ]
148
+ activities.extend(aerial_activities)
149
+
150
+ if scene_type in self.activity_templates:
151
+ activities.extend(self.activity_templates[scene_type])
152
+ elif "default" in self.activity_templates:
153
+ activities.extend(self.activity_templates["default"])
154
+
155
+ detected_class_ids = [obj["class_id"] for obj in detected_objects]
156
+
157
+ # Add activities based on specific object combinations
158
+ if 62 in detected_class_ids and 57 in detected_class_ids: # TV and sofa
159
+ activities.append("Watching shows or movies")
160
+
161
+ if 63 in detected_class_ids: # laptop
162
+ activities.append("Using a computer/laptop")
163
+
164
+ if 67 in detected_class_ids: # cell phone
165
+ activities.append("Using a mobile phone")
166
+
167
+ if 73 in detected_class_ids: # book
168
+ activities.append("Reading")
169
+
170
+ if any(food_id in detected_class_ids for food_id in [46, 47, 48, 49, 50, 51, 52, 53, 54, 55]):
171
+ activities.append("Eating or preparing food")
172
+
173
+ # Person-specific activities
174
+ if 0 in detected_class_ids: # Person
175
+ if any(vehicle in detected_class_ids for vehicle in [1, 2, 3, 5, 7]): # Vehicles
176
+ activities.append("Commuting or traveling")
177
+
178
+ if 16 in detected_class_ids: # Dog
179
+ activities.append("Walking a dog")
180
+
181
+ if 24 in detected_class_ids or 26 in detected_class_ids: # Backpack or handbag
182
+ activities.append("Carrying personal items")
183
+
184
+ # Remove duplicates
185
+ return list(set(activities))
186
+
187
+ def _identify_safety_concerns(self, detected_objects: List[Dict], scene_type: str) -> List[str]:
188
+ """
189
+ Identify potential safety concerns based on objects and scene type.
190
+
191
+ Args:
192
+ detected_objects: List of detected objects
193
+ scene_type: Identified scene type
194
+
195
+ Returns:
196
+ List of potential safety concerns
197
+ """
198
+ concerns = []
199
+ detected_class_ids = [obj["class_id"] for obj in detected_objects]
200
+
201
+ # ORIGINAL SAFETY CONCERNS LOGIC
202
+
203
+ # General safety concerns
204
+ if 42 in detected_class_ids or 43 in detected_class_ids: # Fork or knife
205
+ concerns.append("Sharp utensils present")
206
+
207
+ if 76 in detected_class_ids: # Scissors
208
+ concerns.append("Cutting tools present")
209
+
210
+ # Traffic-related concerns
211
+ if scene_type in ["city_street", "parking_lot"]:
212
+ if 0 in detected_class_ids: # Person
213
+ if any(vehicle in detected_class_ids for vehicle in [2, 3, 5, 7, 8]): # Vehicles
214
+ concerns.append("Pedestrians near vehicles")
215
+
216
+ if 9 in detected_class_ids: # Traffic light
217
+ concerns.append("Monitor traffic signals")
218
+
219
+ # Identify crowded scenes
220
+ person_count = detected_class_ids.count(0)
221
+ if person_count > 5:
222
+ concerns.append(f"Crowded area with multiple people ({person_count})")
223
+
224
+ # Scene-specific concerns
225
+ if scene_type == "kitchen":
226
+ if 68 in detected_class_ids or 69 in detected_class_ids: # Microwave or oven
227
+ concerns.append("Hot cooking equipment")
228
+
229
+ # Potentially unstable objects
230
+ for obj in detected_objects:
231
+ if obj["class_id"] in [39, 40, 41, 45]: # Bottle, wine glass, cup, bowl
232
+ if obj["region"] in ["top_left", "top_center", "top_right"] and obj["normalized_area"] > 0.05:
233
+ concerns.append(f"Elevated {obj['class_name']} might be unstable")
234
+
235
+ # NEW SAFETY CONCERNS LOGIC FOR ADDITIONAL SCENE TYPES
236
+
237
+ # Upscale dining safety concerns
238
+ if scene_type == "upscale_dining":
239
+ # Check for fragile items
240
+ if 40 in detected_class_ids: # Wine glass
241
+ concerns.append("Fragile glassware present")
242
+
243
+ # Check for lit candles (can't directly detect but can infer from context)
244
+ # Look for small bright spots that might be candles
245
+ if any(obj["class_id"] == 41 for obj in detected_objects): # Cup (which might include candle holders)
246
+ # We can't reliably detect candles, but if the scene appears to be formal dining,
247
+ # we can suggest this as a possibility
248
+ concerns.append("Possible lit candles or decorative items requiring care")
249
+
250
+ # Check for overcrowded table
251
+ table_objs = [obj for obj in detected_objects if obj["class_id"] == 60] # Dining table
252
+ if table_objs:
253
+ table_region = table_objs[0]["region"]
254
+ items_on_table = 0
255
+
256
+ for obj in detected_objects:
257
+ if obj["class_id"] in [39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55]:
258
+ if obj["region"] == table_region:
259
+ items_on_table += 1
260
+
261
+ if items_on_table > 8:
262
+ concerns.append("Dining table has multiple items which should be handled with care")
263
+
264
+ # Asian commercial street safety concerns
265
+ elif scene_type == "asian_commercial_street":
266
+ # Check for crowded walkways
267
+ if 0 in detected_class_ids: # Person
268
+ person_count = detected_class_ids.count(0)
269
+ if person_count > 3:
270
+ # Calculate person density (simplified)
271
+ person_positions = []
272
+ for obj in detected_objects:
273
+ if obj["class_id"] == 0:
274
+ person_positions.append(obj["normalized_center"])
275
+
276
+ if len(person_positions) >= 2:
277
+ # Calculate average distance between people
278
+ total_distance = 0
279
+ count = 0
280
+ for i in range(len(person_positions)):
281
+ for j in range(i+1, len(person_positions)):
282
+ p1 = person_positions[i]
283
+ p2 = person_positions[j]
284
+ distance = ((p2[0] - p1[0])**2 + (p2[1] - p1[1])**2)**0.5
285
+ total_distance += distance
286
+ count += 1
287
+
288
+ if count > 0:
289
+ avg_distance = total_distance / count
290
+ if avg_distance < 0.1: # Close proximity
291
+ concerns.append("Crowded walkway with limited personal space")
292
+
293
+ # Check for motorcycles/bicycles near pedestrians
294
+ if (1 in detected_class_ids or 3 in detected_class_ids) and 0 in detected_class_ids: # Bicycle/motorcycle and person
295
+ concerns.append("Two-wheeled vehicles in pedestrian areas")
296
+
297
+ # Check for potential trip hazards
298
+ # We can't directly detect this, but can infer from context
299
+ if scene_type == "asian_commercial_street" and "bottom" in " ".join([obj["region"] for obj in detected_objects if obj["class_id"] == 0]):
300
+ # If people are in bottom regions, they might be walking on uneven surfaces
301
+ concerns.append("Potential uneven walking surfaces in commercial area")
302
+
303
+ # Financial district safety concerns
304
+ elif scene_type == "financial_district":
305
+ # Check for heavy traffic conditions
306
+ vehicle_count = sum(1 for obj_id in detected_class_ids if obj_id in [2, 5, 7]) # Car, bus, truck
307
+ if vehicle_count > 5:
308
+ concerns.append("Heavy vehicle traffic in urban area")
309
+
310
+ # Check for pedestrians crossing busy streets
311
+ if 0 in detected_class_ids: # Person
312
+ person_count = detected_class_ids.count(0)
313
+ vehicle_nearby = any(vehicle in detected_class_ids for vehicle in [2, 3, 5, 7])
314
+
315
+ if person_count > 0 and vehicle_nearby:
316
+ concerns.append("Pedestrians navigating busy urban traffic")
317
+
318
+ # Check for traffic signals
319
+ if 9 in detected_class_ids: # Traffic light
320
+ concerns.append("Observe traffic signals when navigating this area")
321
+ else:
322
+ # If no traffic lights detected but it's a busy area, it's worth noting
323
+ if vehicle_count > 3:
324
+ concerns.append("Busy traffic area potentially without visible traffic signals in view")
325
+
326
+ # Time of day considerations
327
+ # We don't have direct time data, but can infer from vehicle lights
328
+ vehicle_objs = [obj for obj in detected_objects if obj["class_id"] in [2, 5, 7]]
329
+ if vehicle_objs and any("lighting_conditions" in obj for obj in detected_objects):
330
+ # If vehicles are present and it might be evening/night
331
+ concerns.append("Reduced visibility conditions during evening commute")
332
+
333
+ # Urban intersection safety concerns
334
+ elif scene_type == "urban_intersection":
335
+ # Check for pedestrians in crosswalks
336
+ pedestrian_objs = [obj for obj in detected_objects if obj["class_id"] == 0]
337
+ vehicle_objs = [obj for obj in detected_objects if obj["class_id"] in [2, 3, 5, 7]]
338
+
339
+ if pedestrian_objs:
340
+ # Calculate distribution of pedestrians to see if they're crossing
341
+ pedestrian_positions = [obj["normalized_center"] for obj in pedestrian_objs]
342
+
343
+ # Simplified check for pedestrians in crossing pattern
344
+ if len(pedestrian_positions) >= 3:
345
+ # Check if pedestrians are distributed across different regions
346
+ pedestrian_regions = set(obj["region"] for obj in pedestrian_objs)
347
+ if len(pedestrian_regions) >= 2:
348
+ concerns.append("Multiple pedestrians crossing the intersection")
349
+
350
+ # Check for traffic signal observation
351
+ if 9 in detected_class_ids: # Traffic light
352
+ concerns.append("Observe traffic signals when crossing")
353
+
354
+ # Check for busy intersection
355
+ if len(vehicle_objs) > 3:
356
+ concerns.append("Busy intersection with multiple vehicles")
357
+
358
+ # Check for pedestrians potentially jay-walking
359
+ if pedestrian_objs and not 9 in detected_class_ids: # People but no traffic lights
360
+ concerns.append("Pedestrians should use designated crosswalks")
361
+
362
+ # Visibility concerns based on lighting
363
+ # This would be better with actual lighting data
364
+ pedestrian_count = len(pedestrian_objs)
365
+ if pedestrian_count > 5:
366
+ concerns.append("High pedestrian density at crossing points")
367
+
368
+ # Transit hub safety concerns
369
+ elif scene_type == "transit_hub":
370
+ # These would be for transit areas like train stations or bus terminals
371
+ if 0 in detected_class_ids: # Person
372
+ person_count = detected_class_ids.count(0)
373
+ if person_count > 8:
374
+ concerns.append("Crowded transit area requiring careful navigation")
375
+
376
+ # Check for luggage/bags that could be trip hazards
377
+ if 24 in detected_class_ids or 28 in detected_class_ids: # Backpack or suitcase
378
+ concerns.append("Luggage and personal items may create obstacles")
379
+
380
+ # Public transportation vehicles
381
+ if any(vehicle in detected_class_ids for vehicle in [5, 6, 7]): # Bus, train, truck
382
+ concerns.append("Stay clear of arriving and departing transit vehicles")
383
+
384
+ # Shopping district safety concerns
385
+ elif scene_type == "shopping_district":
386
+ # Check for crowded shopping areas
387
+ if 0 in detected_class_ids: # Person
388
+ person_count = detected_class_ids.count(0)
389
+ if person_count > 5:
390
+ concerns.append("Crowded shopping area with multiple people")
391
+
392
+ # Check for shopping bags and personal items
393
+ if 24 in detected_class_ids or 26 in detected_class_ids: # Backpack or handbag
394
+ concerns.append("Mind personal belongings in busy retail environment")
395
+
396
+ # Check for store entrances/exits which might have automatic doors
397
+ # We can't directly detect this, but can infer from context
398
+ if scene_type == "shopping_district" and 0 in detected_class_ids:
399
+ concerns.append("Be aware of store entrances and exits with potential automatic doors")
400
+
401
+ return concerns
scene_detail_templates.py ADDED
@@ -0,0 +1,203 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
# Per-scene description templates. Each scene-type key maps to exactly three
# alternative sentence templates; the placeholders in braces (e.g. {furniture})
# are filled in later via str.format with scene-specific fragments.
SCENE_DETAIL_TEMPLATES = {
    # -- Indoor home / work scenes --
    "living_room": [
        "The space is arranged for relaxation with {furniture}.",
        "There is {electronics} for entertainment.",
        "The room has a seating area with {seating}.",
    ],
    "bedroom": [
        "The room contains {bed_type} in the {bed_location}.",
        "This sleeping area has {bed_description}.",
        "A personal space with {bed_type} and {extras}.",
    ],
    "dining_area": [
        "A space set up for meals with {table_setup}.",
        "The dining area contains {table_description}.",
        "A place for eating with {dining_items}.",
    ],
    "kitchen": [
        "A food preparation area with {appliances}.",
        "The kitchen contains {kitchen_items}.",
        "A cooking space equipped with {cooking_equipment}.",
    ],
    "office_workspace": [
        "A work environment with {office_equipment}.",
        "A space designed for productivity with {desk_setup}.",
        "A workspace containing {computer_equipment}.",
    ],
    # -- Outdoor / urban scenes --
    "city_street": [
        "An urban thoroughfare with {traffic_description}.",
        "A street scene with {people_and_vehicles}.",
        "A city path with {street_elements}.",
    ],
    "park_area": [
        "An outdoor recreational space with {park_features}.",
        "A leisure area featuring {outdoor_elements}.",
        "A public outdoor space with {park_description}.",
    ],
    "retail_store": [
        "A shopping environment with {store_elements}.",
        "A commercial space where {shopping_activity}.",
        "A retail area containing {store_items}.",
    ],
    "upscale_dining": [
        "The space features {furniture} with {design_elements} for an elegant dining experience.",
        "This sophisticated dining area includes {lighting} illuminating {table_setup}.",
        "A stylish dining environment with {seating} arranged around {table_description}.",
    ],
    "asian_commercial_street": [
        "A vibrant street lined with {storefront_features} and filled with {pedestrian_flow}.",
        "This urban commercial area displays {asian_elements} with {cultural_elements}.",
        "A lively shopping street characterized by {signage} and busy with {street_activities}.",
    ],
    "financial_district": [
        "A canyon of {buildings} with {traffic_elements} moving through the urban landscape.",
        "This business district features {skyscrapers} along {road_features}.",
        "A downtown corridor with {architectural_elements} framing views of {city_landmarks}.",
    ],
    "urban_intersection": [
        "A busy crossroad with {crossing_pattern} where {pedestrian_behavior} is observed.",
        "This urban junction features {pedestrian_density} navigating the {traffic_pattern}.",
        "A well-marked intersection designed for {pedestrian_flow} across multiple directions.",
    ],
    "transit_hub": [
        "A transportation nexus where {transit_vehicles} arrive and depart amid {passenger_activity}.",
        "This transit center accommodates {transportation_modes} with facilities for {passenger_needs}.",
        "A busy transport hub featuring {transit_infrastructure} and areas for {passenger_movement}.",
    ],
    "shopping_district": [
        "A commercial zone filled with {retail_elements} and {shopping_activity}.",
        "This shopping area features {store_types} along {walkway_features}.",
        "A retail district characterized by {commercial_signage} and {consumer_behavior}.",
    ],
    # -- Transit scenes --
    "bus_stop": [
        "Passengers waiting at a roadside stop served by {transit_vehicles}.",
        "A designated bus stop with shelters and {passenger_activity}.",
        "Commuters boarding or alighting from {transit_vehicles} at the curb.",
    ],
    "bus_station": [
        "Multiple buses parked in a terminal where {passenger_activity}.",
        "A busy station hub featuring {transit_vehicles} and traveler luggage.",
        "A transit center with waiting areas and various {transportation_modes}.",
    ],
    # -- Recreation scenes (these use fixed text with no placeholders) --
    "zoo": [
        "Enclosures showcasing elephants, zebras, and giraffes with visitors observing.",
        "A wildlife exhibit area where families watch animal displays.",
        "A recreational space featuring large animal exhibits and strolling guests.",
    ],
    "harbor": [
        "Boats docked along the waterfront with nearby vehicular traffic.",
        "A maritime area where vessels anchor beside roads busy with cars and motorcycles.",
        "A coastal dock featuring moored boats and passing traffic elements.",
    ],
    "playground": [
        "An open play area equipped with balls and recreational structures.",
        "People engaging in games and sports in a communal space.",
        "A leisure area featuring playground equipment and active participants.",
    ],
    "sports_field": [
        "An athletic field marked for various ball games and matches.",
        "Players using equipment like bats, gloves, and rackets on a grassy pitch.",
        "A designated sports area with goalposts or markings for competitive play.",
    ],
    # -- Street-level commercial scenes --
    "narrow_commercial_alley": [
        "A tight alley lined with {storefront_features} and light vehicles.",
        "Pedestrians navigate a confined lane flanked by shops and {street_activities}.",
        "An urban passage featuring {storefront_features} with {people_and_vehicles}.",
    ],
    "daytime_shopping_street": [
        "A bustling street during daytime with {storefront_features} and {pedestrian_flow}.",
        "Shoppers and vehicles move along a retail strip marked by {signage}.",
        "An open commercial avenue filled with {people_and_vehicles} amid shops.",
    ],
    "urban_pedestrian_crossing": [
        "A marked crosswalk with {crossing_pattern} under {lighting_modifier} sky.",
        "Pedestrians use designated crossing with {traffic_pattern} at the intersection.",
        "People waiting at a signal-controlled crossing next to {street_elements}.",
    ],
    # -- Aerial viewpoints --
    "aerial_view_intersection": [
        "The crossing pattern shows {crossing_pattern} with {pedestrian_flow} across multiple directions.",
        "From above, this intersection reveals {traffic_pattern} with {pedestrian_density} navigating through defined paths.",
        "This bird's-eye view shows {street_elements} converging at a junction where {pedestrian_behavior} is visible.",
    ],
    "aerial_view_commercial_area": [
        "From above, this commercial zone shows {storefront_features} with {pedestrian_flow} moving between establishments.",
        "This overhead view reveals {shopping_activity} amid {walkway_features} connecting different businesses.",
        "The aerial perspective captures {retail_elements} organized along {commercial_layout} with visible customer activity.",
    ],
    "aerial_view_plaza": [
        "This overhead view of the plaza shows {pedestrian_pattern} across an open public space.",
        "From above, the plaza reveals {gathering_features} where people congregate in {movement_pattern}.",
        "The aerial perspective captures {urban_elements} arranged around a central area where {public_activity} occurs.",
    ],
    # -- Cultural / regional scenes --
    "asian_night_market": [
        "This bustling night market features {stall_elements} illuminated by {lighting_features} with crowds enjoying {food_elements}.",
        "Rows of {vendor_stalls} line this vibrant market where {nighttime_activity} continues under {cultural_lighting}.",
        "The market atmosphere is created by {asian_elements} and {night_market_sounds} amid {evening_crowd_behavior}.",
    ],
    "asian_temple_area": [
        "This sacred space features {architectural_elements} displaying {cultural_symbols} with visitors engaging in {ritual_activities}.",
        "The temple area contains {religious_structures} adorned with {decorative_features} where people practice {cultural_practices}.",
        "Traditional {temple_architecture} creates a spiritual atmosphere enhanced by {sensory_elements} and {visitor_activities}.",
    ],
    "european_plaza": [
        "This historic plaza is framed by {architectural_style} surrounding an open space where {public_activities} take place.",
        "The European square features {historic_elements} and {urban_design} creating a space for {social_behaviors}.",
        "Classical {european_features} define this public space where {tourist_activities} blend with {local_customs}.",
    ],
    # -- Night scenes --
    "nighttime_street": [
        "The night transforms this street with {lighting_effects} casting {shadow_patterns} across {urban_features}.",
        "After dark, this urban corridor is defined by {illuminated_elements} with {evening_activities} visible in the artificial light.",
        "The nocturnal street scene captures {light_sources} creating contrast between {lit_areas} and {shadowed_zones}.",
    ],
    "nighttime_commercial_district": [
        "After sunset, this commercial area comes alive with {illuminated_signage} and {evening_activities} under {colorful_lighting}.",
        "The district's nighttime character is defined by {neon_elements} highlighting {storefront_features} amid {night_crowd_behavior}.",
        "Evening transforms this zone through {light_displays} that accentuate {building_features} and frame {nightlife_activities}.",
    ],
    # -- Mixed indoor/outdoor and specialized venues --
    "indoor_outdoor_cafe": [
        "This cafe blends indoor comfort with outdoor atmosphere through {transitional_elements} connecting {indoor_features} with {outdoor_setting}.",
        "Customers enjoy both {interior_amenities} and {exterior_features} in this space that bridges indoor comfort and outdoor ambiance.",
        "The cafe design creates flow between {inside_elements} and {outside_spaces} allowing patrons to experience {dual_environment_benefits}.",
    ],
    "transit_station_platform": [
        "This transit platform combines covered areas with open sections where {passenger_activities} occur while awaiting {transportation_types}.",
        "The station design balances {sheltered_elements} with {exposed_areas} for passengers engaged in {waiting_behaviors}.",
        "Commuters navigate between {indoor_facilities} and {platform_features} while {transit_routines} unfold around arriving vehicles.",
    ],
    "sports_stadium": [
        "This athletic venue features {seating_arrangement} surrounding {playing_surface} where {sporting_activities} take place.",
        "The stadium design incorporates {spectator_facilities} overlooking {competition_space} designed for {sports_events}.",
        "Fans occupy {viewing_areas} arranged to maximize visibility of {field_elements} where athletes engage in {game_activities}.",
    ],
    "construction_site": [
        "This development area shows {construction_equipment} amid {building_materials} where workers conduct {construction_activities}.",
        "The construction process is visible through {work_elements} positioned around {structural_components} in various stages of completion.",
        "Workers utilize {site_equipment} to transform {raw_materials} following {construction_process} stages.",
    ],
    "medical_facility": [
        "This healthcare environment features {medical_elements} arranged to support {clinical_activities} in a {facility_design}.",
        "The medical space incorporates {healthcare_features} where {patient_interactions} occur in a controlled environment.",
        "Professional medical staff utilize {equipment_types} while conducting {care_procedures} in specialized {treatment_spaces}.",
    ],
    "educational_setting": [
        "This learning environment contains {educational_furniture} arranged to facilitate {learning_activities} through {instructional_design}.",
        "The educational space features {classroom_elements} organized for {teaching_methods} and {student_engagement}.",
        "Students and educators interact within {learning_spaces} equipped with {educational_tools} supporting {knowledge_transfer}.",
    ],
    "beach_water_recreation": [
        "A coastal recreation area with {beach_equipment} and people enjoying {water_activities}.",
        "This shoreline space features {beach_equipment} where visitors engage in {water_activities}.",
        "An outdoor water recreation zone with {beach_equipment} set up for {water_activities}.",
    ],
    "sports_venue": [
        "A professional sports facility with {sports_equipment} arranged for {competitive_activities}.",
        "This athletics venue features {sports_equipment} with spaces designated for {competitive_activities}.",
        "A specialized sports arena containing {sports_equipment} designed for {competitive_activities}.",
    ],
    "professional_kitchen": [
        "A commercial cooking space with {kitchen_equipment} organized for {food_preparation}.",
        "This professional culinary area contains {kitchen_equipment} arranged in stations for {food_preparation}.",
        "An industrial kitchen featuring {kitchen_equipment} designed for efficient {food_preparation}.",
    ],
}
scene_type.py ADDED
@@ -0,0 +1,394 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ SCENE_TYPES = {
3
+ "living_room": {
4
+ "name": "Living Room",
5
+ "required_objects": [57, 62], # couch, tv
6
+ "optional_objects": [56, 60, 73, 75], # chair, dining table, book, vase
7
+ "minimum_required": 2,
8
+ "description": "A living room area with furniture for relaxation and entertainment"
9
+ },
10
+ "bedroom": {
11
+ "name": "Bedroom",
12
+ "required_objects": [59], # bed
13
+ "optional_objects": [56, 60, 73, 74, 75], # chair, dining table, book, clock, vase
14
+ "minimum_required": 1,
15
+ "description": "A bedroom with sleeping furniture"
16
+ },
17
+ "dining_area": {
18
+ "name": "Dining Area",
19
+ "required_objects": [60], # dining table
20
+ "optional_objects": [56, 39, 41, 42, 43, 44, 45], # chair, bottle, cup, fork, knife, spoon, bowl
21
+ "minimum_required": 1,
22
+ "description": "A dining area for meals"
23
+ },
24
+ "kitchen": {
25
+ "name": "Kitchen",
26
+ "required_objects": [72, 68, 69, 71], # refrigerator, microwave, oven, sink
27
+ "optional_objects": [39, 41, 42, 43, 44, 45], # bottle, cup, fork, knife, spoon, bowl
28
+ "minimum_required": 1,
29
+ "description": "A kitchen area for food preparation"
30
+ },
31
+ "office_workspace": {
32
+ "name": "Office Workspace",
33
+ "required_objects": [56, 63, 66, 64, 73], # chair, laptop, keyboard, mouse, book
34
+ "optional_objects": [60, 74, 75, 67], # dining table, clock, vase, cell phone
35
+ "minimum_required": 2,
36
+ "description": "A workspace with computer equipment for office work"
37
+ },
38
+ "meeting_room": {
39
+ "name": "Meeting Room",
40
+ "required_objects": [56, 60], # chair, dining table
41
+ "optional_objects": [63, 62, 67], # laptop, tv, cell phone
42
+ "minimum_required": 2,
43
+ "description": "A room set up for meetings with multiple seating"
44
+ },
45
+ "city_street": {
46
+ "name": "City Street",
47
+ "required_objects": [0, 1, 2, 3, 5, 7, 9], # person, bicycle, car, motorcycle, bus, truck, traffic light
48
+ "optional_objects": [10, 11, 12, 24, 25, 26, 28], # fire hydrant, stop sign, parking meter, backpack, umbrella, handbag, suitcase
49
+ "minimum_required": 2,
50
+ "description": "A city street with traffic and pedestrians"
51
+ },
52
+ "parking_lot": {
53
+ "name": "Parking Lot",
54
+ "required_objects": [2, 3, 5, 7], # car, motorcycle, bus, truck
55
+ "optional_objects": [0, 11, 12], # person, stop sign, parking meter
56
+ "minimum_required": 3,
57
+ "description": "A parking area with multiple vehicles"
58
+ },
59
+ "park_area": {
60
+ "name": "Park or Recreation Area",
61
+ "required_objects": [0, 13], # person, bench
62
+ "optional_objects": [1, 14, 16, 25, 33], # bicycle, bird, dog, umbrella, kite
63
+ "minimum_required": 2,
64
+ "description": "An outdoor recreational area for leisure activities"
65
+ },
66
+ "retail_store": {
67
+ "name": "Retail Store",
68
+ "required_objects": [0, 24, 26, 28], # person, backpack, handbag, suitcase
69
+ "optional_objects": [39, 45, 67], # bottle, bowl, cell phone
70
+ "minimum_required": 2,
71
+ "description": "A retail environment with shoppers and merchandise"
72
+ },
73
+ "supermarket": {
74
+ "name": "Supermarket",
75
+ "required_objects": [0, 24, 39, 46, 47, 49], # person, backpack, bottle, banana, apple, orange
76
+ "optional_objects": [26, 37, 45, 48, 51, 52, 53, 54, 55], # handbag, surfboard, bowl, sandwich, carrot, hot dog, pizza, donut, cake
77
+ "minimum_required": 3,
78
+ "description": "A supermarket with food items and shoppers"
79
+ },
80
+ "classroom": {
81
+ "name": "Classroom",
82
+ "required_objects": [56, 60, 73], # chair, dining table, book
83
+ "optional_objects": [63, 66, 67], # laptop, keyboard, cell phone
84
+ "minimum_required": 2,
85
+ "description": "A classroom environment set up for educational activities"
86
+ },
87
+ "conference_room": {
88
+ "name": "Conference Room",
89
+ "required_objects": [56, 60, 63], # chair, dining table, laptop
90
+ "optional_objects": [62, 67, 73], # tv, cell phone, book
91
+ "minimum_required": 2,
92
+ "description": "A conference room designed for meetings and presentations"
93
+ },
94
+ "cafe": {
95
+ "name": "Cafe",
96
+ "required_objects": [56, 60, 41], # chair, dining table, cup
97
+ "optional_objects": [39, 40, 63, 67, 73], # bottle, wine glass, laptop, cell phone, book
98
+ "minimum_required": 2,
99
+ "description": "A cafe setting with seating and beverages"
100
+ },
101
+ "library": {
102
+ "name": "Library",
103
+ "required_objects": [56, 60, 73], # chair, dining table, book
104
+ "optional_objects": [63, 67, 75], # laptop, cell phone, vase
105
+ "minimum_required": 2,
106
+ "description": "A library with books and reading areas"
107
+ },
108
+ "gym": {
109
+ "name": "Gym",
110
+ "required_objects": [0, 32], # person, sports ball
111
+ "optional_objects": [24, 25, 28, 38], # backpack, umbrella, suitcase, tennis racket
112
+ "minimum_required": 1,
113
+ "description": "A gym or fitness area for physical activities"
114
+ },
115
+ "beach": {
116
+ "name": "Beach",
117
+ "required_objects": [0, 25, 29, 33, 37], # person, umbrella, frisbee, kite, surfboard
118
+ "optional_objects": [1, 24, 26, 38], # bicycle, backpack, handbag, tennis racket
119
+ "minimum_required": 2,
120
+ "description": "A beach area with people and recreational items"
121
+ },
122
+ "restaurant": {
123
+ "name": "Restaurant",
124
+ "required_objects": [56, 60, 41, 42, 43, 44, 45], # chair, dining table, cup, fork, knife, spoon, bowl
125
+ "optional_objects": [39, 40, 48, 49, 50, 51, 52, 53, 54, 55], # bottle, wine glass, sandwich, orange, broccoli, carrot, hot dog, pizza, donut, cake
126
+ "minimum_required": 3,
127
+ "description": "A restaurant setting for dining with tables and eating utensils"
128
+ },
129
+ "train_station": {
130
+ "name": "Train Station",
131
+ "required_objects": [0, 6], # person, train
132
+ "optional_objects": [1, 2, 24, 28, 67], # bicycle, car, backpack, suitcase, cell phone
133
+ "minimum_required": 1,
134
+ "description": "A train station with train and passengers"
135
+ },
136
+ "airport": {
137
+ "name": "Airport",
138
+ "required_objects": [0, 4, 28], # person, airplane, suitcase
139
+ "optional_objects": [24, 25, 26, 67], # backpack, umbrella, handbag, cell phone
140
+ "minimum_required": 2,
141
+ "description": "An airport with planes and travelers carrying luggage"
142
+ },
143
+ "upscale_dining": {
144
+ "name": "Upscale Dining Area",
145
+ "required_objects": [56, 60, 40, 41], # chair, dining table, wine glass, cup
146
+ "optional_objects": [39, 42, 43, 44, 45, 62, 75], # bottle, fork, knife, spoon, bowl, tv, vase
147
+ "minimum_required": 2,
148
+ "description": "An elegantly designed dining space with refined furniture and decorative elements"
149
+ },
150
+ "asian_commercial_street": {
151
+ "name": "Asian Commercial Street",
152
+ "required_objects": [0, 67], # person, cell phone
153
+ "optional_objects": [1, 2, 3, 24, 25, 26, 28], # bicycle, car, motorcycle, backpack, umbrella, handbag, suitcase
154
+ "minimum_required": 1,
155
+ "description": "A bustling commercial street with shops, signage, and pedestrians in an Asian urban setting"
156
+ },
157
+ "financial_district": {
158
+ "name": "Financial District",
159
+ "required_objects": [2, 5, 7, 9], # car, bus, truck, traffic light
160
+ "optional_objects": [0, 1, 3, 8], # person, bicycle, motorcycle, boat
161
+ "minimum_required": 2,
162
+ "description": "A major thoroughfare in a business district with high-rise buildings and traffic"
163
+ },
164
+ "urban_intersection": {
165
+ "name": "Urban Intersection",
166
+ "required_objects": [0, 9], # person, traffic light
167
+ "optional_objects": [1, 2, 3, 5, 7], # bicycle, car, motorcycle, bus, truck
168
+ "minimum_required": 1,
169
+ "description": "A busy urban crossroad with pedestrian crossings and multiple traffic flows"
170
+ },
171
+ "transit_hub": {
172
+ "name": "Transit Hub",
173
+ "required_objects": [0, 5, 6, 7], # person, bus, train, truck
174
+ "optional_objects": [1, 2, 3, 9, 24, 28], # bicycle, car, motorcycle, traffic light, backpack, suitcase
175
+ "minimum_required": 2,
176
+ "description": "A transportation center where multiple modes of transit converge"
177
+ },
178
+ "shopping_district": {
179
+ "name": "Shopping District",
180
+ "required_objects": [0, 24, 26], # person, backpack, handbag
181
+ "optional_objects": [1, 2, 3, 25, 27, 28, 39, 67], # bicycle, car, motorcycle, umbrella, tie, suitcase, bottle, cell phone
182
+ "minimum_required": 2,
183
+ "description": "A retail-focused area with shops, pedestrians, and commercial activity"
184
+ },
185
+ "bus_stop": {
186
+ "name": "Bus Stop",
187
+ "required_objects": [0, 5], # person, bus
188
+ "optional_objects": [1, 2, 7, 24], # bicycle, car, truck, backpack
189
+ "minimum_required": 2,
190
+ "description": "A roadside bus stop with waiting passengers and buses"
191
+ },
192
+ "bus_station": {
193
+ "name": "Bus Station",
194
+ "required_objects": [0, 5, 7], # person, bus, truck
195
+ "optional_objects": [24, 28, 67], # backpack, suitcase, cell phone
196
+ "minimum_required": 2,
197
+ "description": "A bus terminal with multiple buses and travelers"
198
+ },
199
+ "zoo": {
200
+ "name": "Zoo",
201
+ "required_objects": [20, 22, 23], # elephant, zebra, giraffe
202
+ "optional_objects": [0, 14, 16], # person, bird, dog
203
+ "minimum_required": 2,
204
+ "description": "A zoo environment featuring large animal exhibits and visitors"
205
+ },
206
+ "harbor": {
207
+ "name": "Harbor",
208
+ "required_objects": [8], # boat
209
+ "optional_objects": [0, 2, 3, 39], # person, car, motorcycle, bottle
210
+ "minimum_required": 1,
211
+ "description": "A harbor area with boats docked and surrounding traffic"
212
+ },
213
+ "playground": {
214
+ "name": "Playground",
215
+ "required_objects": [0, 32], # person, sports ball
216
+ "optional_objects": [33, 24, 1], # kite, backpack, bicycle
217
+ "minimum_required": 1,
218
+ "description": "An outdoor playground with people playing sports and games"
219
+ },
220
+ "sports_field": {
221
+ "name": "Sports Field",
222
+ "required_objects": [32], # sports ball
223
+ "optional_objects": [38, 34, 35], # tennis racket, baseball bat, baseball glove
224
+ "minimum_required": 1,
225
+ "description": "A sports field set up for various ball games"
226
+ },
227
+ "narrow_commercial_alley": {
228
+ "name": "Narrow Commercial Alley",
229
+ "required_objects": [0, 3], # person, motorcycle
230
+ "optional_objects": [2, 7, 24, 26], # car, truck, backpack, handbag
231
+ "minimum_required": 2,
232
+ "description": "A tight urban alley lined with shops, with pedestrians and light vehicles"
233
+ },
234
+ "daytime_shopping_street": {
235
+ "name": "Daytime Shopping Street",
236
+ "required_objects": [0, 2], # person, car
237
+ "optional_objects": [1, 3, 24, 26], # bicycle, motorcycle, backpack, handbag
238
+ "minimum_required": 2,
239
+ "description": "A busy pedestrian street during daytime, featuring shops, vehicles, and shoppers"
240
+ },
241
+ "urban_pedestrian_crossing": {
242
+ "name": "Urban Pedestrian Crossing",
243
+ "required_objects": [0, 9], # person, traffic light
244
+ "optional_objects": [2, 3, 5], # car, motorcycle, bus
245
+ "minimum_required": 1,
246
+ "description": "A city street crossing with pedestrians and traffic signals"
247
+ },
248
+ "aerial_view_intersection": {
249
+ "name": "Aerial View Intersection",
250
+ "required_objects": [0, 9], # person, traffic light
251
+ "optional_objects": [1, 2, 3, 5, 7], # bicycle, car, motorcycle, bus, truck
252
+ "minimum_required": 1,
253
+ "description": "An intersection viewed from above, showing crossing patterns and pedestrian movement"
254
+ },
255
+ "aerial_view_commercial_area": {
256
+ "name": "Aerial View Commercial Area",
257
+ "required_objects": [0, 2], # person, car
258
+ "optional_objects": [1, 3, 5, 7, 24, 26], # bicycle, motorcycle, bus, truck, backpack, handbag
259
+ "minimum_required": 2,
260
+ "description": "A commercial or shopping area viewed from above showing pedestrians and urban layout"
261
+ },
262
+ "aerial_view_plaza": {
263
+ "name": "Aerial View Plaza",
264
+ "required_objects": [0], # person
265
+ "optional_objects": [1, 2, 24, 25, 26], # bicycle, car, backpack, umbrella, handbag
266
+ "minimum_required": 1,
267
+ "description": "An urban plaza or public square viewed from above with pedestrian activity"
268
+ },
269
+
270
+ # specific cultural item
271
+ "asian_night_market": {
272
+ "name": "Asian Night Market",
273
+ "required_objects": [0, 67], # person, cell phone
274
+ "optional_objects": [1, 3, 24, 26, 39, 41], # bicycle, motorcycle, backpack, handbag, bottle, cup
275
+ "minimum_required": 1,
276
+ "description": "A vibrant night market scene typical in Asian cities with food stalls and crowds"
277
+ },
278
+ "asian_temple_area": {
279
+ "name": "Asian Temple Area",
280
+ "required_objects": [0], # person
281
+ "optional_objects": [24, 25, 26, 67, 75], # backpack, umbrella, handbag, cell phone, vase
282
+ "minimum_required": 1,
283
+ "description": "A traditional Asian temple complex with visitors and cultural elements"
284
+ },
285
+ "european_plaza": {
286
+ "name": "European Plaza",
287
+ "required_objects": [0], # person
288
+ "optional_objects": [1, 2, 4, 9, 24, 26, 67], # bicycle, car, airplane, traffic light, backpack, handbag, cell phone
289
+ "minimum_required": 1,
290
+ "description": "A European-style city plaza with historic architecture and pedestrian activity"
291
+ },
292
+
293
+ # specific time item
294
+ "nighttime_street": {
295
+ "name": "Nighttime Street",
296
+ "required_objects": [0, 9], # person, traffic light
297
+ "optional_objects": [1, 2, 3, 5, 7, 67], # bicycle, car, motorcycle, bus, truck, cell phone
298
+ "minimum_required": 1,
299
+ "description": "An urban street at night with artificial lighting and nighttime activity"
300
+ },
301
+ "nighttime_commercial_district": {
302
+ "name": "Nighttime Commercial District",
303
+ "required_objects": [0, 67], # person, cell phone
304
+ "optional_objects": [1, 2, 3, 24, 26], # bicycle, car, motorcycle, backpack, handbag
305
+ "minimum_required": 1,
306
+ "description": "A commercial district illuminated at night with neon signs and evening activity"
307
+ },
308
+
309
+ # mixed-environment scene types
310
+ "indoor_outdoor_cafe": {
311
+ "name": "Indoor-Outdoor Cafe",
312
+ "required_objects": [56, 60, 41], # chair, dining table, cup
313
+ "optional_objects": [39, 40, 63, 67, 73], # bottle, wine glass, laptop, cell phone, book
314
+ "minimum_required": 2,
315
+ "description": "A cafe setting with both indoor elements and outdoor patio or sidewalk seating"
316
+ },
317
+ "transit_station_platform": {
318
+ "name": "Transit Station Platform",
319
+ "required_objects": [0], # person
320
+ "optional_objects": [5, 6, 7, 24, 28, 67], # bus, train, truck, backpack, suitcase, cell phone
321
+ "minimum_required": 1,
322
+ "description": "A transit platform with waiting passengers and arriving/departing vehicles"
323
+ },
324
+ "sports_stadium": {
325
+ "name": "Sports Stadium",
326
+ "required_objects": [0, 32], # person, sports ball
327
+ "optional_objects": [24, 38, 39, 41, 67], # backpack, tennis racket, bottle, cup, cell phone
328
+ "minimum_required": 1,
329
+ "description": "A sports stadium or arena with spectators and athletic activities"
330
+ },
331
+ "construction_site": {
332
+ "name": "Construction Site",
333
+ "required_objects": [0, 7], # person, truck
334
+ "optional_objects": [2, 3, 11, 76, 77, 78], # car, motorcycle, fire hydrant, scissors, teddy bear, hair drier
335
+ "minimum_required": 1,
336
+ "description": "A construction site with workers, equipment, and building materials"
337
+ },
338
+ "medical_facility": {
339
+ "name": "Medical Facility",
340
+ "required_objects": [0, 56, 60], # person, chair, dining table
341
+ "optional_objects": [63, 64, 66, 67, 73], # laptop, mouse, keyboard, cell phone, book
342
+ "minimum_required": 2,
343
+ "description": "A medical facility such as hospital, clinic or doctor's office with medical staff and patients"
344
+ },
345
+ "educational_setting": {
346
+ "name": "Educational Setting",
347
+ "required_objects": [0, 56, 60, 73], # person, chair, dining table, book
348
+ "optional_objects": [63, 64, 66, 67, 74], # laptop, mouse, keyboard, cell phone, clock
349
+ "minimum_required": 2,
350
+ "description": "An educational environment such as classroom, lecture hall or study area"
351
+ },
352
+ "aerial_view_intersection": {
353
+ "name": "Aerial View Intersection",
354
+ "required_objects": [0, 9], # person, traffic light
355
+ "optional_objects": [1, 2, 3, 5, 7], # bicycle, car, motorcycle, bus, truck
356
+ "minimum_required": 1,
357
+ "description": "An intersection viewed from above, showing crossing patterns and pedestrian movement",
358
+ "viewpoint_indicator": "aerial", # view side
359
+ "key_features": ["crosswalk_pattern", "pedestrian_flow", "intersection_layout"], # key feature
360
+ "detection_priority": 10 # priority
361
+ },
362
+ "perpendicular_crosswalk_intersection": {
363
+ "name": "Perpendicular Crosswalk Intersection",
364
+ "required_objects": [0], # person
365
+ "optional_objects": [1, 2, 3, 5, 7, 9], # bicycle, car, motorcycle, bus, truck, traffic light
366
+ "minimum_required": 1,
367
+ "description": "An intersection with perpendicular crosswalks where pedestrians cross in multiple directions",
368
+ "viewpoint_indicator": "aerial",
369
+ "key_features": ["perpendicular_crosswalks", "pedestrian_crossing", "multi_directional_movement"],
370
+ "pattern_detection": True, # specific pattern
371
+ "detection_priority": 15 #
372
+ },
373
+ "beach_water_recreation": {
374
+ "name": "Beach/Water Recreation Area",
375
+ "required_objects": [0, 37], # person, surfboard
376
+ "optional_objects": [25, 33, 1, 8, 29, 24, 26, 39, 41], # umbrella, kite, bicycle, boat, frisbee, backpack, handbag, bottle, cup
377
+ "minimum_required": 2,
378
+ "description": "A beach or water recreation area with water sports equipment and beach accessories"
379
+ },
380
+ "sports_venue": {
381
+ "name": "Sports Venue",
382
+ "required_objects": [0, 32], # person, sports ball
383
+ "optional_objects": [34, 35, 38, 25, 24, 26, 39, 41], # baseball bat, baseball glove, tennis racket, umbrella, backpack, handbag, bottle, cup
384
+ "minimum_required": 2,
385
+ "description": "A professional sports venue with specialized sports equipment and spectator areas"
386
+ },
387
+ "professional_kitchen": {
388
+ "name": "Professional Kitchen",
389
+ "required_objects": [43, 44, 45], # knife, spoon, bowl
390
+ "optional_objects": [42, 39, 41, 68, 69, 71, 72, 0], # fork, bottle, cup, microwave, oven, sink, refrigerator, person
391
+ "minimum_required": 3,
392
+ "description": "A commercial kitchen with professional cooking equipment and food preparation areas"
393
+ },
394
+ }
spatial_analyzer.py ADDED
@@ -0,0 +1,1444 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import os
3
+ import numpy as np
4
+ from typing import Dict, List, Tuple, Any, Optional
5
+
6
+ from scene_type import SCENE_TYPES
7
+ from enhance_descriptor import EnhancedSceneDescriber
8
+
9
+ class SpatialAnalyzer:
10
+ """
11
+ Analyzes spatial relationships between objects in an image.
12
+ Handles region assignment, object positioning, and functional zone identification.
13
+ """
14
+
15
+ def __init__(self, class_names: Dict[int, str] = None, object_categories=None):
16
+ """Initialize the spatial analyzer with image regions"""
17
+ # Define regions of the image (3x3 grid)
18
+ self.regions = {
19
+ "top_left": (0, 0, 1/3, 1/3),
20
+ "top_center": (1/3, 0, 2/3, 1/3),
21
+ "top_right": (2/3, 0, 1, 1/3),
22
+ "middle_left": (0, 1/3, 1/3, 2/3),
23
+ "middle_center": (1/3, 1/3, 2/3, 2/3),
24
+ "middle_right": (2/3, 1/3, 1, 2/3),
25
+ "bottom_left": (0, 2/3, 1/3, 1),
26
+ "bottom_center": (1/3, 2/3, 2/3, 1),
27
+ "bottom_right": (2/3, 2/3, 1, 1)
28
+ }
29
+
30
+ self.class_names = class_names
31
+ self.OBJECT_CATEGORIES = object_categories or {}
32
+ self.enhance_descriptor = EnhancedSceneDescriber(scene_types=SCENE_TYPES)
33
+
34
+ # Distances thresholds for proximity analysis (normalized)
35
+ self.proximity_threshold = 0.2
36
+
37
+
38
+ def _determine_region(self, x: float, y: float) -> str:
39
+ """
40
+ Determine which region a point falls into.
41
+
42
+ Args:
43
+ x: Normalized x-coordinate (0-1)
44
+ y: Normalized y-coordinate (0-1)
45
+
46
+ Returns:
47
+ Region name
48
+ """
49
+ for region_name, (x1, y1, x2, y2) in self.regions.items():
50
+ if x1 <= x < x2 and y1 <= y < y2:
51
+ return region_name
52
+
53
+ return "unknown"
54
+
55
+ def _analyze_regions(self, detected_objects: List[Dict]) -> Dict:
56
+ """
57
+ Analyze object distribution across image regions.
58
+
59
+ Args:
60
+ detected_objects: List of detected objects with position information
61
+
62
+ Returns:
63
+ Dictionary with region analysis
64
+ """
65
+ # Count objects in each region
66
+ region_counts = {region: 0 for region in self.regions.keys()}
67
+ region_objects = {region: [] for region in self.regions.keys()}
68
+
69
+ for obj in detected_objects:
70
+ region = obj["region"]
71
+ if region in region_counts:
72
+ region_counts[region] += 1
73
+ region_objects[region].append({
74
+ "class_id": obj["class_id"],
75
+ "class_name": obj["class_name"]
76
+ })
77
+
78
+ # Determine main focus regions (top 1-2 regions by object count)
79
+ sorted_regions = sorted(region_counts.items(), key=lambda x: x[1], reverse=True)
80
+ main_regions = [region for region, count in sorted_regions if count > 0][:2]
81
+
82
+ return {
83
+ "counts": region_counts,
84
+ "main_focus": main_regions,
85
+ "objects_by_region": region_objects
86
+ }
87
+
88
+ def _extract_detected_objects(self, detection_result: Any, confidence_threshold: float = 0.25) -> List[Dict]:
89
+ """
90
+ Extract detected objects from detection result with position information.
91
+
92
+ Args:
93
+ detection_result: Detection result from YOLOv8
94
+ confidence_threshold: Minimum confidence threshold
95
+
96
+ Returns:
97
+ List of dictionaries with detected object information
98
+ """
99
+ boxes = detection_result.boxes.xyxy.cpu().numpy()
100
+ classes = detection_result.boxes.cls.cpu().numpy().astype(int)
101
+ confidences = detection_result.boxes.conf.cpu().numpy()
102
+
103
+ # Image dimensions
104
+ img_height, img_width = detection_result.orig_shape[:2]
105
+
106
+ detected_objects = []
107
+ for box, class_id, confidence in zip(boxes, classes, confidences):
108
+ # Skip objects with confidence below threshold
109
+ if confidence < confidence_threshold:
110
+ continue
111
+
112
+ x1, y1, x2, y2 = box
113
+ width = x2 - x1
114
+ height = y2 - y1
115
+
116
+ # Center point
117
+ center_x = (x1 + x2) / 2
118
+ center_y = (y1 + y2) / 2
119
+
120
+ # Normalized positions (0-1)
121
+ norm_x = center_x / img_width
122
+ norm_y = center_y / img_height
123
+ norm_width = width / img_width
124
+ norm_height = height / img_height
125
+
126
+ # Area calculation
127
+ area = width * height
128
+ norm_area = area / (img_width * img_height)
129
+
130
+ # Region determination
131
+ object_region = self._determine_region(norm_x, norm_y)
132
+
133
+ detected_objects.append({
134
+ "class_id": int(class_id),
135
+ "class_name": self.class_names[int(class_id)],
136
+ "confidence": float(confidence),
137
+ "box": [float(x1), float(y1), float(x2), float(y2)],
138
+ "center": [float(center_x), float(center_y)],
139
+ "normalized_center": [float(norm_x), float(norm_y)],
140
+ "size": [float(width), float(height)],
141
+ "normalized_size": [float(norm_width), float(norm_height)],
142
+ "area": float(area),
143
+ "normalized_area": float(norm_area),
144
+ "region": object_region
145
+ })
146
+
147
+ return detected_objects
148
+
149
+
150
+ def _detect_scene_viewpoint(self, detected_objects: List[Dict]) -> Dict:
151
+ """
152
+ 檢測場景視角並識別特殊場景模式。
153
+
154
+ Args:
155
+ detected_objects: 檢測到的物體列表
156
+
157
+ Returns:
158
+ Dict: 包含視角和場景模式信息的字典
159
+ """
160
+ if not detected_objects:
161
+ return {"viewpoint": "eye_level", "patterns": []}
162
+
163
+ # 從物體位置中提取信息
164
+ patterns = []
165
+
166
+ # 檢測行人位置模式
167
+ pedestrian_objs = [obj for obj in detected_objects if obj["class_id"] == 0]
168
+
169
+ # 檢查是否有足夠的行人來識別模式
170
+ if len(pedestrian_objs) >= 4:
171
+ pedestrian_positions = [obj["normalized_center"] for obj in pedestrian_objs]
172
+
173
+ # 檢測十字交叉模式
174
+ if self._detect_cross_pattern(pedestrian_positions):
175
+ patterns.append("crosswalk_intersection")
176
+
177
+ # 檢測多方向行人流
178
+ directions = self._analyze_movement_directions(pedestrian_positions)
179
+ if len(directions) >= 2:
180
+ patterns.append("multi_directional_movement")
181
+
182
+ # 檢查物體的大小一致性 - 在空中俯視圖中,物體大小通常更一致
183
+ if len(detected_objects) >= 5:
184
+ sizes = [obj.get("normalized_area", 0) for obj in detected_objects]
185
+ size_variance = np.var(sizes) / (np.mean(sizes) ** 2) # 標準化變異數,不會受到平均值影響
186
+
187
+ if size_variance < 0.3: # 低變異表示大小一致
188
+ patterns.append("consistent_object_size")
189
+
190
+ # 基本視角檢測
191
+ viewpoint = self.enhance_descriptor._detect_viewpoint(detected_objects)
192
+
193
+ # 根據檢測到的模式增強視角判斷
194
+ if "crosswalk_intersection" in patterns and viewpoint != "aerial":
195
+ # 如果檢測到斑馬線交叉但視角判斷不是空中視角,優先採用模式判斷
196
+ viewpoint = "aerial"
197
+
198
+ return {
199
+ "viewpoint": viewpoint,
200
+ "patterns": patterns
201
+ }
202
+
203
+ def _detect_cross_pattern(self, positions):
204
+ """
205
+ 檢測位置中的十字交叉模式
206
+
207
+ Args:
208
+ positions: 位置列表 [[x1, y1], [x2, y2], ...]
209
+
210
+ Returns:
211
+ bool: 是否檢測到十字交叉模式
212
+ """
213
+ if len(positions) < 8: # 需要足夠多的點
214
+ return False
215
+
216
+ # 提取 x 和 y 坐標
217
+ x_coords = [pos[0] for pos in positions]
218
+ y_coords = [pos[1] for pos in positions]
219
+
220
+ # 檢測 x 和 y 方向的聚類
221
+ x_clusters = []
222
+ y_clusters = []
223
+
224
+ # 簡化的聚類分析
225
+ x_mean = np.mean(x_coords)
226
+ y_mean = np.mean(y_coords)
227
+
228
+ # 計算在中心線附近的點
229
+ near_x_center = sum(1 for x in x_coords if abs(x - x_mean) < 0.1)
230
+ near_y_center = sum(1 for y in y_coords if abs(y - y_mean) < 0.1)
231
+
232
+ # 如果有足夠的點在中心線附近,可能是十字交叉
233
+ return near_x_center >= 3 and near_y_center >= 3
234
+
235
+ def _analyze_movement_directions(self, positions):
236
+ """
237
+ 分析位置中的移動方向
238
+
239
+ Args:
240
+ positions: 位置列表 [[x1, y1], [x2, y2], ...]
241
+
242
+ Returns:
243
+ list: 檢測到的主要方向
244
+ """
245
+ if len(positions) < 6:
246
+ return []
247
+
248
+ # extract x 和 y 坐標
249
+ x_coords = [pos[0] for pos in positions]
250
+ y_coords = [pos[1] for pos in positions]
251
+
252
+ directions = []
253
+
254
+ # horizontal move (left --> right)
255
+ x_std = np.std(x_coords)
256
+ x_range = max(x_coords) - min(x_coords)
257
+
258
+ # vertical move(up --> down)
259
+ y_std = np.std(y_coords)
260
+ y_range = max(y_coords) - min(y_coords)
261
+
262
+ # 足夠大的範圍表示該方向有運動
263
+ if x_range > 0.4:
264
+ directions.append("horizontal")
265
+ if y_range > 0.4:
266
+ directions.append("vertical")
267
+
268
+ return directions
269
+
270
+ def _identify_functional_zones(self, detected_objects: List[Dict], scene_type: str) -> Dict:
271
+ """
272
+ Identify functional zones within the scene with improved detection for different viewpoints
273
+ and cultural contexts.
274
+
275
+ Args:
276
+ detected_objects: List of detected objects
277
+ scene_type: Identified scene type
278
+
279
+ Returns:
280
+ Dictionary of functional zones with their descriptions
281
+ """
282
+ # Group objects by category and region
283
+ category_regions = {}
284
+
285
+ for obj in detected_objects:
286
+ # Find object category
287
+ category = "other"
288
+ for cat_name, cat_ids in self.OBJECT_CATEGORIES.items():
289
+ if obj["class_id"] in cat_ids:
290
+ category = cat_name
291
+ break
292
+
293
+ # Add to category-region mapping
294
+ if category not in category_regions:
295
+ category_regions[category] = {}
296
+
297
+ region = obj["region"]
298
+ if region not in category_regions[category]:
299
+ category_regions[category][region] = []
300
+
301
+ category_regions[category][region].append(obj)
302
+
303
+ # Identify zones based on object groupings
304
+ zones = {}
305
+
306
+ # Detect viewpoint to adjust zone identification strategy
307
+ viewpoint = self._detect_scene_viewpoint(detected_objects)
308
+
309
+ # Choose appropriate zone identification strategy based on scene type and viewpoint
310
+ if scene_type in ["living_room", "bedroom", "dining_area", "kitchen", "office_workspace", "meeting_room"]:
311
+ # Indoor scenes
312
+ zones.update(self._identify_indoor_zones(category_regions, detected_objects, scene_type))
313
+ elif scene_type in ["city_street", "parking_lot", "park_area"]:
314
+ # Outdoor general scenes
315
+ zones.update(self._identify_outdoor_general_zones(category_regions, detected_objects, scene_type))
316
+ elif "aerial" in scene_type or viewpoint == "aerial":
317
+ # Aerial viewpoint scenes
318
+ zones.update(self._identify_aerial_view_zones(category_regions, detected_objects, scene_type))
319
+ elif "asian" in scene_type:
320
+ # Asian cultural context scenes
321
+ zones.update(self._identify_asian_cultural_zones(category_regions, detected_objects, scene_type))
322
+ elif scene_type == "urban_intersection":
323
+ # Specific urban intersection logic
324
+ zones.update(self._identify_intersection_zones(category_regions, detected_objects, viewpoint))
325
+ elif scene_type == "financial_district":
326
+ # Financial district specific logic
327
+ zones.update(self._identify_financial_district_zones(category_regions, detected_objects))
328
+ elif scene_type == "upscale_dining":
329
+ # Upscale dining specific logic
330
+ zones.update(self._identify_upscale_dining_zones(category_regions, detected_objects))
331
+ else:
332
+ # Default zone identification for other scene types
333
+ zones.update(self._identify_default_zones(category_regions, detected_objects))
334
+
335
+ # If no zones were identified, try the default approach
336
+ if not zones:
337
+ zones.update(self._identify_default_zones(category_regions, detected_objects))
338
+
339
+ return zones
340
+
341
    def _identify_indoor_zones(self, category_regions: Dict, detected_objects: List[Dict], scene_type: str) -> Dict:
        """
        Identify functional zones for indoor scenes.

        Builds up to six zone entries (social, entertainment, dining, workspace,
        sleeping, kitchen appliance), each anchored on the grid region that
        holds the most supporting objects.

        Args:
            category_regions: Objects grouped by category and region
            detected_objects: List of detected objects
            scene_type: Specific indoor scene type

        Returns:
            Dict: Indoor functional zones keyed by zone name, each with
            "region", "objects" and a human-readable "description".
        """
        zones = {}

        # Seating/social zone: anchored on the region with the most furniture.
        if "furniture" in category_regions:
            furniture_regions = category_regions["furniture"]
            main_furniture_region = max(furniture_regions.items(),
                                    key=lambda x: len(x[1]),
                                    default=(None, []))

            # Require at least two furniture pieces before calling it social.
            if main_furniture_region[0] is not None and len(main_furniture_region[1]) >= 2:
                zone_objects = [obj["class_name"] for obj in main_furniture_region[1]]
                zones["social_zone"] = {
                    "region": main_furniture_region[0],
                    "objects": zone_objects,
                    "description": f"Social or seating area with {', '.join(zone_objects)}"
                }

        # Entertainment zone: any electronics, wherever they appear.
        if "electronics" in category_regions:
            electronics_items = []
            for region_objects in category_regions["electronics"].values():
                electronics_items.extend([obj["class_name"] for obj in region_objects])

            if electronics_items:
                zones["entertainment_zone"] = {
                    # _find_main_region (defined elsewhere in this class) picks
                    # the region holding the most electronics.
                    "region": self._find_main_region(category_regions.get("electronics", {})),
                    "objects": electronics_items,
                    "description": f"Entertainment or media area with {', '.join(electronics_items)}"
                }

        # Dining/food zone: merge kitchen items and food, keyed by region.
        food_zone_categories = ["kitchen_items", "food"]
        food_items = []
        food_regions = {}

        for category in food_zone_categories:
            if category in category_regions:
                for region, objects in category_regions[category].items():
                    if region not in food_regions:
                        food_regions[region] = []
                    food_regions[region].extend(objects)
                    food_items.extend([obj["class_name"] for obj in objects])

        if food_items:
            main_food_region = max(food_regions.items(),
                                key=lambda x: len(x[1]),
                                default=(None, []))

            if main_food_region[0] is not None:
                # De-duplicate item names; only the first three are shown in
                # the description (set order is arbitrary).
                zones["dining_zone"] = {
                    "region": main_food_region[0],
                    "objects": list(set(food_items)),
                    "description": f"Dining or food preparation area with {', '.join(list(set(food_items))[:3])}"
                }

        # Work/study zone - enhanced to detect even when scene_type is not explicitly office
        work_items = []
        work_regions = {}

        for obj in detected_objects:
            if obj["class_id"] in [56, 60, 63, 64, 66, 73]:  # chair, table, laptop, mouse, keyboard, book
                region = obj["region"]
                if region not in work_regions:
                    work_regions[region] = []
                work_regions[region].append(obj)
                work_items.append(obj["class_name"])

        # Check for laptop and table/chair combinations that suggest a workspace
        has_laptop = any(obj["class_id"] == 63 for obj in detected_objects)
        has_keyboard = any(obj["class_id"] == 66 for obj in detected_objects)
        has_table = any(obj["class_id"] == 60 for obj in detected_objects)
        has_chair = any(obj["class_id"] == 56 for obj in detected_objects)

        # If we have electronics with furniture in the same region, likely a workspace
        # NOTE(review): this flag only checks co-occurrence anywhere in the
        # scene, not actually "in the same region" — confirm intent.
        workspace_detected = (has_laptop or has_keyboard) and (has_table or has_chair)

        if (workspace_detected or scene_type in ["office_workspace", "meeting_room"]) and work_items:
            main_work_region = max(work_regions.items(),
                                key=lambda x: len(x[1]),
                                default=(None, []))

            if main_work_region[0] is not None:
                zones["workspace_zone"] = {
                    "region": main_work_region[0],
                    "objects": list(set(work_items)),
                    "description": f"Work or study area with {', '.join(list(set(work_items))[:3])}"
                }

        # Bedroom-specific zones: a single bed (class 59) defines the sleeping area.
        if scene_type == "bedroom":
            bed_objects = [obj for obj in detected_objects if obj["class_id"] == 59]  # Bed
            if bed_objects:
                # Only the first detected bed anchors the zone.
                bed_region = bed_objects[0]["region"]
                zones["sleeping_zone"] = {
                    "region": bed_region,
                    "objects": ["bed"],
                    "description": "Sleeping area with bed"
                }

        # Kitchen-specific zones
        if scene_type == "kitchen":
            # Look for appliances (refrigerator, oven, microwave, sink)
            appliance_ids = [68, 69, 71, 72]  # microwave, oven, sink, refrigerator
            appliance_objects = [obj for obj in detected_objects if obj["class_id"] in appliance_ids]

            if appliance_objects:
                # Group appliances by region, then anchor on the densest region.
                appliance_regions = {}
                for obj in appliance_objects:
                    region = obj["region"]
                    if region not in appliance_regions:
                        appliance_regions[region] = []
                    appliance_regions[region].append(obj)

                if appliance_regions:
                    main_appliance_region = max(appliance_regions.items(),
                                            key=lambda x: len(x[1]),
                                            default=(None, []))

                    if main_appliance_region[0] is not None:
                        appliance_names = [obj["class_name"] for obj in main_appliance_region[1]]
                        zones["kitchen_appliance_zone"] = {
                            "region": main_appliance_region[0],
                            "objects": appliance_names,
                            "description": f"Kitchen appliance area with {', '.join(appliance_names)}"
                        }

        return zones
480
+
481
    def _identify_intersection_zones(self, category_regions: Dict, detected_objects: List[Dict], viewpoint: str) -> Dict:
        """
        Identify functional zones for urban intersections with enhanced spatial awareness.

        Delegates pedestrian-crossing and vehicle-traffic analysis to helpers,
        then adds one traffic-control zone per grid region containing signals.

        Args:
            category_regions: Objects grouped by category and region
            detected_objects: List of detected objects
            viewpoint: Detected viewpoint (currently unused in this body)

        Returns:
            Dict: Refined intersection functional zones
        """
        zones = {}

        # Get pedestrians, vehicles and traffic signals by COCO class id.
        pedestrian_objs = [obj for obj in detected_objects if obj["class_id"] == 0]
        vehicle_objs = [obj for obj in detected_objects if obj["class_id"] in [1, 2, 3, 5, 7]]  # bicycle, car, motorcycle, bus, truck
        traffic_light_objs = [obj for obj in detected_objects if obj["class_id"] == 9]

        # Create distribution maps for better spatial understanding
        # (helper defined elsewhere in this class).
        regions_distribution = self._create_distribution_map(detected_objects)

        # Analyze pedestrian crossing patterns
        crossing_zones = self._analyze_crossing_patterns(pedestrian_objs, traffic_light_objs, regions_distribution)
        zones.update(crossing_zones)

        # Analyze vehicle traffic zones with directional awareness
        traffic_zones = self._analyze_traffic_zones(vehicle_objs, regions_distribution)
        zones.update(traffic_zones)

        # Identify traffic control zones based on signal placement
        if traffic_light_objs:
            # Group traffic lights by region for better organization
            signal_regions = {}
            for obj in traffic_light_objs:
                region = obj["region"]
                if region not in signal_regions:
                    signal_regions[region] = []
                signal_regions[region].append(obj)

            # Create traffic control zones for each region with signals
            for idx, (region, signals) in enumerate(signal_regions.items()):
                # Check if this region has a directional name (e.g. "north").
                direction = self._get_directional_description(region)

                zones[f"traffic_control_zone_{idx+1}"] = {
                    "region": region,
                    "objects": ["traffic light"] * len(signals),
                    "description": f"Traffic control area with {len(signals)} traffic signals" +
                                  (f" in {direction} area" if direction else "")
                }

        return zones
534
+
535
def _analyze_crossing_patterns(self, pedestrians: List[Dict], traffic_lights: List[Dict],
                            region_distribution: Dict) -> Dict:
    """
    Analyze pedestrian clustering to identify likely crosswalk zones.

    Args:
        pedestrians: Detected pedestrian objects (each a dict with "region").
        traffic_lights: Detected traffic light objects.
        region_distribution: Distribution of objects by region (unused here,
            kept for interface compatibility).

    Returns:
        Dict: Mapping of zone name -> zone info for up to two crossing zones.
    """
    zones = {}
    if not pedestrians:
        return zones

    # Bucket pedestrians by the grid region they occupy
    by_region = {}
    for ped in pedestrians:
        by_region.setdefault(ped["region"], []).append(ped)

    # The busiest regions are the likeliest crossing areas
    ranked = sorted(by_region.items(), key=lambda item: len(item[1]), reverse=True)

    # Only the two most populated regions become crossing zones
    for idx, (region, group) in enumerate(ranked[:2]):
        # A traffic light in the same region strongly suggests a crosswalk
        signal_nearby = any(light["region"] == region for light in traffic_lights)

        direction = self._get_directional_description(region)

        # Assemble the human-readable description piece by piece
        description = f"Pedestrian crossing area with {len(group)} "
        description += "person" if len(group) == 1 else "people"
        if direction:
            description += f" in {direction} direction"
        if signal_nearby:
            description += " near traffic signals"

        zones[f"crossing_zone_{idx+1}"] = {
            "region": region,
            "objects": ["pedestrian"] * len(group),
            "description": description
        }

    return zones
def _analyze_traffic_zones(self, vehicles: List[Dict], region_distribution: Dict) -> Dict:
    """
    Analyze vehicle distribution to identify traffic zones with directional awareness.

    Args:
        vehicles: Detected vehicle objects (each a dict with "region"/"class_name").
        region_distribution: Distribution of objects by region (unused here,
            kept for interface compatibility).

    Returns:
        Dict: Main (and optionally secondary) vehicle traffic zones.
    """
    zones = {}
    if not vehicles:
        return zones

    # Bucket vehicles by the grid region they occupy
    grouped = {}
    for vehicle in vehicles:
        grouped.setdefault(vehicle["region"], []).append(vehicle)

    # Primary zone: the region holding the most vehicles
    primary_region, primary_group = max(grouped.items(),
                                        key=lambda item: len(item[1]),
                                        default=(None, []))
    if primary_region is not None:
        type_names = [v["class_name"] for v in primary_group]
        distinct = list(set(type_names))
        direction = self._get_directional_description(primary_region)
        suffix = f" in {direction} area" if direction else ""
        zones["vehicle_zone"] = {
            "region": primary_region,
            "objects": type_names,
            "description": f"Vehicle traffic area with {', '.join(distinct[:3])}" + suffix
        }

    # Secondary zone: second-busiest region, when vehicles span several regions
    if len(grouped) > 1:
        ranked = sorted(grouped.items(), key=lambda item: len(item[1]), reverse=True)
        if len(ranked) > 1:
            second_region, second_group = ranked[1]
            direction = self._get_directional_description(second_region)
            type_names = [v["class_name"] for v in second_group]
            distinct = list(set(type_names))
            suffix = f" in {direction} direction" if direction else ""
            zones["secondary_vehicle_zone"] = {
                "region": second_region,
                "objects": type_names,
                "description": f"Secondary traffic area with {', '.join(distinct[:2])}" + suffix
            }

    return zones
+ def _get_directional_description(self, region: str) -> str:
654
+ """
655
+ Convert region name to a directional description.
656
+
657
+ Args:
658
+ region: Region name from the grid
659
+
660
+ Returns:
661
+ str: Directional description
662
+ """
663
+ if "top" in region and "left" in region:
664
+ return "northwest"
665
+ elif "top" in region and "right" in region:
666
+ return "northeast"
667
+ elif "bottom" in region and "left" in region:
668
+ return "southwest"
669
+ elif "bottom" in region and "right" in region:
670
+ return "southeast"
671
+ elif "top" in region:
672
+ return "north"
673
+ elif "bottom" in region:
674
+ return "south"
675
+ elif "left" in region:
676
+ return "west"
677
+ elif "right" in region:
678
+ return "east"
679
+ else:
680
+ return "central"
681
+
682
+ def _create_distribution_map(self, detected_objects: List[Dict]) -> Dict:
683
+ """
684
+ Create a distribution map of objects across regions for spatial analysis.
685
+
686
+ Args:
687
+ detected_objects: List of detected objects
688
+
689
+ Returns:
690
+ Dict: Distribution map of objects by region and class
691
+ """
692
+ distribution = {}
693
+
694
+ # Initialize all regions
695
+ for region in self.regions.keys():
696
+ distribution[region] = {
697
+ "total": 0,
698
+ "objects": {},
699
+ "density": 0
700
+ }
701
+
702
+ # Populate the distribution
703
+ for obj in detected_objects:
704
+ region = obj["region"]
705
+ class_id = obj["class_id"]
706
+ class_name = obj["class_name"]
707
+
708
+ distribution[region]["total"] += 1
709
+
710
+ if class_id not in distribution[region]["objects"]:
711
+ distribution[region]["objects"][class_id] = {
712
+ "name": class_name,
713
+ "count": 0,
714
+ "positions": []
715
+ }
716
+
717
+ distribution[region]["objects"][class_id]["count"] += 1
718
+
719
+ # Store position for spatial relationship analysis
720
+ if "normalized_center" in obj:
721
+ distribution[region]["objects"][class_id]["positions"].append(obj["normalized_center"])
722
+
723
+ # Calculate object density for each region
724
+ for region, data in distribution.items():
725
+ # Assuming all regions are equal size in the grid
726
+ data["density"] = data["total"] / 1
727
+
728
+ return distribution
729
+
730
+ def _identify_asian_cultural_zones(self, category_regions: Dict, detected_objects: List[Dict], scene_type: str) -> Dict:
731
+ """
732
+ Identify functional zones for scenes with Asian cultural context.
733
+
734
+ Args:
735
+ category_regions: Objects grouped by category and region
736
+ detected_objects: List of detected objects
737
+ scene_type: Specific scene type
738
+
739
+ Returns:
740
+ Dict: Asian cultural functional zones
741
+ """
742
+ zones = {}
743
+
744
+ # Identify storefront zone
745
+ storefront_items = []
746
+ storefront_regions = {}
747
+
748
+ # Since storefronts aren't directly detectable, infer from context
749
+ # For example, look for regions with signs, people, and smaller objects
750
+ sign_regions = set()
751
+ for obj in detected_objects:
752
+ if obj["class_id"] == 0: # Person
753
+ region = obj["region"]
754
+ if region not in storefront_regions:
755
+ storefront_regions[region] = []
756
+ storefront_regions[region].append(obj)
757
+
758
+ # Add regions with people as potential storefront areas
759
+ sign_regions.add(region)
760
+
761
+ # Use the areas with most people as storefront zones
762
+ if storefront_regions:
763
+ main_storefront_regions = sorted(storefront_regions.items(),
764
+ key=lambda x: len(x[1]),
765
+ reverse=True)[:2] # Top 2 regions
766
+
767
+ for idx, (region, objs) in enumerate(main_storefront_regions):
768
+ zones[f"commercial_zone_{idx+1}"] = {
769
+ "region": region,
770
+ "objects": [obj["class_name"] for obj in objs],
771
+ "description": f"Asian commercial storefront with pedestrian activity"
772
+ }
773
+
774
+ # Identify pedestrian pathway - enhanced to better detect linear pathways
775
+ pathway_items = []
776
+ pathway_regions = {}
777
+
778
+ # Extract people for pathway analysis
779
+ people_objs = [obj for obj in detected_objects if obj["class_id"] == 0]
780
+
781
+ # Analyze if people form a line (typical of shopping streets)
782
+ people_positions = [obj["normalized_center"] for obj in people_objs]
783
+
784
+ structured_path = False
785
+ if len(people_positions) >= 3:
786
+ # Check if people are arranged along a similar y-coordinate (horizontal path)
787
+ y_coords = [pos[1] for pos in people_positions]
788
+ y_mean = sum(y_coords) / len(y_coords)
789
+ y_variance = sum((y - y_mean)**2 for y in y_coords) / len(y_coords)
790
+
791
+ horizontal_path = y_variance < 0.05 # Low variance indicates horizontal alignment
792
+
793
+ # Check if people are arranged along a similar x-coordinate (vertical path)
794
+ x_coords = [pos[0] for pos in people_positions]
795
+ x_mean = sum(x_coords) / len(x_coords)
796
+ x_variance = sum((x - x_mean)**2 for x in x_coords) / len(x_coords)
797
+
798
+ vertical_path = x_variance < 0.05 # Low variance indicates vertical alignment
799
+
800
+ structured_path = horizontal_path or vertical_path
801
+ path_direction = "horizontal" if horizontal_path else "vertical" if vertical_path else "meandering"
802
+
803
+ # Collect pathway objects (people, bicycles, motorcycles in middle area)
804
+ for obj in detected_objects:
805
+ if obj["class_id"] in [0, 1, 3]: # Person, bicycle, motorcycle
806
+ y_pos = obj["normalized_center"][1]
807
+ # Group by vertical position (middle of image likely pathway)
808
+ if 0.25 <= y_pos <= 0.75:
809
+ region = obj["region"]
810
+ if region not in pathway_regions:
811
+ pathway_regions[region] = []
812
+ pathway_regions[region].append(obj)
813
+ pathway_items.append(obj["class_name"])
814
+
815
+ if pathway_items:
816
+ path_desc = "Pedestrian walkway with people moving through the commercial area"
817
+ if structured_path:
818
+ path_desc = f"{path_direction.capitalize()} pedestrian walkway with organized foot traffic"
819
+
820
+ zones["pedestrian_pathway"] = {
821
+ "region": "middle_center", # Assumption: pathway often in middle
822
+ "objects": list(set(pathway_items)),
823
+ "description": path_desc
824
+ }
825
+
826
+ # Identify vendor zone (small stalls/shops - inferred from context)
827
+ has_small_objects = any(obj["class_id"] in [24, 26, 39, 41] for obj in detected_objects) # bags, bottles, cups
828
+ has_people = any(obj["class_id"] == 0 for obj in detected_objects)
829
+
830
+ if has_small_objects and has_people:
831
+ # Likely vendor areas are where people and small objects cluster
832
+ small_obj_regions = {}
833
+
834
+ for obj in detected_objects:
835
+ if obj["class_id"] in [24, 26, 39, 41, 67]: # bags, bottles, cups, phones
836
+ region = obj["region"]
837
+ if region not in small_obj_regions:
838
+ small_obj_regions[region] = []
839
+ small_obj_regions[region].append(obj)
840
+
841
+ if small_obj_regions:
842
+ main_vendor_region = max(small_obj_regions.items(),
843
+ key=lambda x: len(x[1]),
844
+ default=(None, []))
845
+
846
+ if main_vendor_region[0] is not None:
847
+ vendor_items = [obj["class_name"] for obj in main_vendor_region[1]]
848
+ zones["vendor_zone"] = {
849
+ "region": main_vendor_region[0],
850
+ "objects": list(set(vendor_items)),
851
+ "description": "Vendor or market stall area with small merchandise"
852
+ }
853
+
854
+ # For night markets, identify illuminated zones
855
+ if scene_type == "asian_night_market":
856
+ # Night markets typically have bright spots for food stalls
857
+ # This would be enhanced with lighting analysis integration
858
+ zones["food_stall_zone"] = {
859
+ "region": "middle_center",
860
+ "objects": ["inferred food stalls"],
861
+ "description": "Food stall area typical of Asian night markets"
862
+ }
863
+
864
+ return zones
865
+
866
+ def _identify_upscale_dining_zones(self, category_regions: Dict, detected_objects: List[Dict]) -> Dict:
867
+ """
868
+ Identify functional zones for upscale dining settings.
869
+
870
+ Args:
871
+ category_regions: Objects grouped by category and region
872
+ detected_objects: List of detected objects
873
+
874
+ Returns:
875
+ Dict: Upscale dining functional zones
876
+ """
877
+ zones = {}
878
+
879
+ # Identify dining table zone
880
+ dining_items = []
881
+ dining_regions = {}
882
+
883
+ for obj in detected_objects:
884
+ if obj["class_id"] in [40, 41, 42, 43, 44, 45, 60]: # Wine glass, cup, fork, knife, spoon, bowl, table
885
+ region = obj["region"]
886
+ if region not in dining_regions:
887
+ dining_regions[region] = []
888
+ dining_regions[region].append(obj)
889
+ dining_items.append(obj["class_name"])
890
+
891
+ if dining_items:
892
+ main_dining_region = max(dining_regions.items(),
893
+ key=lambda x: len(x[1]),
894
+ default=(None, []))
895
+
896
+ if main_dining_region[0] is not None:
897
+ zones["formal_dining_zone"] = {
898
+ "region": main_dining_region[0],
899
+ "objects": list(set(dining_items)),
900
+ "description": f"Formal dining area with {', '.join(list(set(dining_items))[:3])}"
901
+ }
902
+
903
+ # Identify decorative zone with enhanced detection
904
+ decor_items = []
905
+ decor_regions = {}
906
+
907
+ # Look for decorative elements (vases, wine glasses, unused dishes)
908
+ for obj in detected_objects:
909
+ if obj["class_id"] in [75, 40]: # Vase, wine glass
910
+ region = obj["region"]
911
+ if region not in decor_regions:
912
+ decor_regions[region] = []
913
+ decor_regions[region].append(obj)
914
+ decor_items.append(obj["class_name"])
915
+
916
+ if decor_items:
917
+ main_decor_region = max(decor_regions.items(),
918
+ key=lambda x: len(x[1]),
919
+ default=(None, []))
920
+
921
+ if main_decor_region[0] is not None:
922
+ zones["decorative_zone"] = {
923
+ "region": main_decor_region[0],
924
+ "objects": list(set(decor_items)),
925
+ "description": f"Decorative area with {', '.join(list(set(decor_items)))}"
926
+ }
927
+
928
+ # Identify seating arrangement zone
929
+ chairs = [obj for obj in detected_objects if obj["class_id"] == 56] # chairs
930
+ if len(chairs) >= 2:
931
+ chair_regions = {}
932
+ for obj in chairs:
933
+ region = obj["region"]
934
+ if region not in chair_regions:
935
+ chair_regions[region] = []
936
+ chair_regions[region].append(obj)
937
+
938
+ if chair_regions:
939
+ main_seating_region = max(chair_regions.items(),
940
+ key=lambda x: len(x[1]),
941
+ default=(None, []))
942
+
943
+ if main_seating_region[0] is not None:
944
+ zones["dining_seating_zone"] = {
945
+ "region": main_seating_region[0],
946
+ "objects": ["chair"] * len(main_seating_region[1]),
947
+ "description": f"Formal dining seating arrangement with {len(main_seating_region[1])} chairs"
948
+ }
949
+
950
+ # Identify serving area (if different from dining area)
951
+ serving_items = []
952
+ serving_regions = {}
953
+
954
+ # Serving areas might have bottles, bowls, containers
955
+ for obj in detected_objects:
956
+ if obj["class_id"] in [39, 45]: # Bottle, bowl
957
+ # Check if it's in a different region from the main dining table
958
+ if "formal_dining_zone" in zones and obj["region"] != zones["formal_dining_zone"]["region"]:
959
+ region = obj["region"]
960
+ if region not in serving_regions:
961
+ serving_regions[region] = []
962
+ serving_regions[region].append(obj)
963
+ serving_items.append(obj["class_name"])
964
+
965
+ if serving_items:
966
+ main_serving_region = max(serving_regions.items(),
967
+ key=lambda x: len(x[1]),
968
+ default=(None, []))
969
+
970
+ if main_serving_region[0] is not None:
971
+ zones["serving_zone"] = {
972
+ "region": main_serving_region[0],
973
+ "objects": list(set(serving_items)),
974
+ "description": f"Serving or sideboard area with {', '.join(list(set(serving_items)))}"
975
+ }
976
+
977
+ return zones
978
+
979
+ def _identify_financial_district_zones(self, category_regions: Dict, detected_objects: List[Dict]) -> Dict:
980
+ """
981
+ Identify functional zones for financial district scenes.
982
+
983
+ Args:
984
+ category_regions: Objects grouped by category and region
985
+ detected_objects: List of detected objects
986
+
987
+ Returns:
988
+ Dict: Financial district functional zones
989
+ """
990
+ zones = {}
991
+
992
+ # Identify traffic zone
993
+ traffic_items = []
994
+ traffic_regions = {}
995
+
996
+ for obj in detected_objects:
997
+ if obj["class_id"] in [1, 2, 3, 5, 6, 7, 9]: # Various vehicles and traffic lights
998
+ region = obj["region"]
999
+ if region not in traffic_regions:
1000
+ traffic_regions[region] = []
1001
+ traffic_regions[region].append(obj)
1002
+ traffic_items.append(obj["class_name"])
1003
+
1004
+ if traffic_items:
1005
+ main_traffic_region = max(traffic_regions.items(),
1006
+ key=lambda x: len(x[1]),
1007
+ default=(None, []))
1008
+
1009
+ if main_traffic_region[0] is not None:
1010
+ zones["traffic_zone"] = {
1011
+ "region": main_traffic_region[0],
1012
+ "objects": list(set(traffic_items)),
1013
+ "description": f"Urban traffic area with {', '.join(list(set(traffic_items))[:3])}"
1014
+ }
1015
+
1016
+ # Building zones on the sides (inferred from scene context)
1017
+ # Enhanced to check if there are actual regions that might contain buildings
1018
+ # Check for regions without vehicles or pedestrians - likely building areas
1019
+ left_side_regions = ["top_left", "middle_left", "bottom_left"]
1020
+ right_side_regions = ["top_right", "middle_right", "bottom_right"]
1021
+
1022
+ # Check left side
1023
+ left_building_evidence = True
1024
+ for region in left_side_regions:
1025
+ # If many vehicles or people in this region, less likely to be buildings
1026
+ vehicle_in_region = any(obj["region"] == region and obj["class_id"] in [1, 2, 3, 5, 7]
1027
+ for obj in detected_objects)
1028
+ people_in_region = any(obj["region"] == region and obj["class_id"] == 0
1029
+ for obj in detected_objects)
1030
+
1031
+ if vehicle_in_region or people_in_region:
1032
+ left_building_evidence = False
1033
+ break
1034
+
1035
+ # Check right side
1036
+ right_building_evidence = True
1037
+ for region in right_side_regions:
1038
+ # If many vehicles or people in this region, less likely to be buildings
1039
+ vehicle_in_region = any(obj["region"] == region and obj["class_id"] in [1, 2, 3, 5, 7]
1040
+ for obj in detected_objects)
1041
+ people_in_region = any(obj["region"] == region and obj["class_id"] == 0
1042
+ for obj in detected_objects)
1043
+
1044
+ if vehicle_in_region or people_in_region:
1045
+ right_building_evidence = False
1046
+ break
1047
+
1048
+ # Add building zones if evidence supports them
1049
+ if left_building_evidence:
1050
+ zones["building_zone_left"] = {
1051
+ "region": "middle_left",
1052
+ "objects": ["building"], # Inferred
1053
+ "description": "Tall buildings line the left side of the street"
1054
+ }
1055
+
1056
+ if right_building_evidence:
1057
+ zones["building_zone_right"] = {
1058
+ "region": "middle_right",
1059
+ "objects": ["building"], # Inferred
1060
+ "description": "Tall buildings line the right side of the street"
1061
+ }
1062
+
1063
+ # Identify pedestrian zone if people are present
1064
+ people_objs = [obj for obj in detected_objects if obj["class_id"] == 0]
1065
+ if people_objs:
1066
+ people_regions = {}
1067
+ for obj in people_objs:
1068
+ region = obj["region"]
1069
+ if region not in people_regions:
1070
+ people_regions[region] = []
1071
+ people_regions[region].append(obj)
1072
+
1073
+ if people_regions:
1074
+ main_pedestrian_region = max(people_regions.items(),
1075
+ key=lambda x: len(x[1]),
1076
+ default=(None, []))
1077
+
1078
+ if main_pedestrian_region[0] is not None:
1079
+ zones["pedestrian_zone"] = {
1080
+ "region": main_pedestrian_region[0],
1081
+ "objects": ["person"] * len(main_pedestrian_region[1]),
1082
+ "description": f"Pedestrian area with {len(main_pedestrian_region[1])} people navigating the financial district"
1083
+ }
1084
+
1085
+ return zones
1086
+
1087
+ def _identify_aerial_view_zones(self, category_regions: Dict, detected_objects: List[Dict], scene_type: str) -> Dict:
1088
+ """
1089
+ Identify functional zones for scenes viewed from an aerial perspective.
1090
+
1091
+ Args:
1092
+ category_regions: Objects grouped by category and region
1093
+ detected_objects: List of detected objects
1094
+ scene_type: Specific scene type
1095
+
1096
+ Returns:
1097
+ Dict: Aerial view functional zones
1098
+ """
1099
+ zones = {}
1100
+
1101
+ # For aerial views, we focus on patterns and flows rather than specific zones
1102
+
1103
+ # Identify pedestrian patterns
1104
+ people_objs = [obj for obj in detected_objects if obj["class_id"] == 0]
1105
+ if people_objs:
1106
+ # Convert positions to arrays for pattern analysis
1107
+ positions = np.array([obj["normalized_center"] for obj in people_objs])
1108
+
1109
+ if len(positions) >= 3:
1110
+ # Calculate distribution metrics
1111
+ x_coords = positions[:, 0]
1112
+ y_coords = positions[:, 1]
1113
+
1114
+ x_mean = np.mean(x_coords)
1115
+ y_mean = np.mean(y_coords)
1116
+ x_std = np.std(x_coords)
1117
+ y_std = np.std(y_coords)
1118
+
1119
+ # Determine if people are organized in a linear pattern
1120
+ if x_std < 0.1 or y_std < 0.1:
1121
+ # Linear distribution along one axis
1122
+ pattern_direction = "vertical" if x_std < y_std else "horizontal"
1123
+
1124
+ zones["pedestrian_pattern"] = {
1125
+ "region": "central",
1126
+ "objects": ["person"] * len(people_objs),
1127
+ "description": f"Aerial view shows a {pattern_direction} pedestrian movement pattern"
1128
+ }
1129
+ else:
1130
+ # More dispersed pattern
1131
+ zones["pedestrian_distribution"] = {
1132
+ "region": "wide",
1133
+ "objects": ["person"] * len(people_objs),
1134
+ "description": f"Aerial view shows pedestrians distributed across the area"
1135
+ }
1136
+
1137
+ # Identify vehicle patterns for traffic analysis
1138
+ vehicle_objs = [obj for obj in detected_objects if obj["class_id"] in [1, 2, 3, 5, 6, 7]]
1139
+ if vehicle_objs:
1140
+ # Convert positions to arrays for pattern analysis
1141
+ positions = np.array([obj["normalized_center"] for obj in vehicle_objs])
1142
+
1143
+ if len(positions) >= 2:
1144
+ # Calculate distribution metrics
1145
+ x_coords = positions[:, 0]
1146
+ y_coords = positions[:, 1]
1147
+
1148
+ x_mean = np.mean(x_coords)
1149
+ y_mean = np.mean(y_coords)
1150
+ x_std = np.std(x_coords)
1151
+ y_std = np.std(y_coords)
1152
+
1153
+ # Determine if vehicles are organized in lanes
1154
+ if x_std < y_std * 0.5:
1155
+ # Vehicles aligned vertically - indicates north-south traffic
1156
+ zones["vertical_traffic_flow"] = {
1157
+ "region": "central_vertical",
1158
+ "objects": [obj["class_name"] for obj in vehicle_objs[:5]],
1159
+ "description": "North-south traffic flow visible from aerial view"
1160
+ }
1161
+ elif y_std < x_std * 0.5:
1162
+ # Vehicles aligned horizontally - indicates east-west traffic
1163
+ zones["horizontal_traffic_flow"] = {
1164
+ "region": "central_horizontal",
1165
+ "objects": [obj["class_name"] for obj in vehicle_objs[:5]],
1166
+ "description": "East-west traffic flow visible from aerial view"
1167
+ }
1168
+ else:
1169
+ # Vehicles in multiple directions - indicates intersection
1170
+ zones["intersection_traffic"] = {
1171
+ "region": "central",
1172
+ "objects": [obj["class_name"] for obj in vehicle_objs[:5]],
1173
+ "description": "Multi-directional traffic at intersection visible from aerial view"
1174
+ }
1175
+
1176
+ # For intersection specific aerial views, identify crossing patterns
1177
+ if "intersection" in scene_type:
1178
+ # Check for traffic signals
1179
+ traffic_light_objs = [obj for obj in detected_objects if obj["class_id"] == 9]
1180
+ if traffic_light_objs:
1181
+ zones["traffic_control_pattern"] = {
1182
+ "region": "intersection",
1183
+ "objects": ["traffic light"] * len(traffic_light_objs),
1184
+ "description": f"Intersection traffic control with {len(traffic_light_objs)} signals visible from above"
1185
+ }
1186
+
1187
+ # Crosswalks are inferred from context in aerial views
1188
+ zones["crossing_pattern"] = {
1189
+ "region": "central",
1190
+ "objects": ["inferred crosswalk"],
1191
+ "description": "Crossing pattern visible from aerial perspective"
1192
+ }
1193
+
1194
+ # For plaza aerial views, identify gathering patterns
1195
+ if "plaza" in scene_type:
1196
+ # Plazas typically have central open area with people
1197
+ if people_objs:
1198
+ # Check if people are clustered in central region
1199
+ central_people = [obj for obj in people_objs
1200
+ if "middle" in obj["region"]]
1201
+
1202
+ if central_people:
1203
+ zones["central_gathering"] = {
1204
+ "region": "middle_center",
1205
+ "objects": ["person"] * len(central_people),
1206
+ "description": f"Central plaza gathering area with {len(central_people)} people viewed from above"
1207
+ }
1208
+
1209
+ return zones
1210
+
1211
+ def _identify_outdoor_general_zones(self, category_regions: Dict, detected_objects: List[Dict], scene_type: str) -> Dict:
1212
+ """
1213
+ Identify functional zones for general outdoor scenes.
1214
+
1215
+ Args:
1216
+ category_regions: Objects grouped by category and region
1217
+ detected_objects: List of detected objects
1218
+ scene_type: Specific outdoor scene type
1219
+
1220
+ Returns:
1221
+ Dict: Outdoor functional zones
1222
+ """
1223
+ zones = {}
1224
+
1225
+ # Identify pedestrian zones
1226
+ people_objs = [obj for obj in detected_objects if obj["class_id"] == 0]
1227
+ if people_objs:
1228
+ people_regions = {}
1229
+ for obj in people_objs:
1230
+ region = obj["region"]
1231
+ if region not in people_regions:
1232
+ people_regions[region] = []
1233
+ people_regions[region].append(obj)
1234
+
1235
+ if people_regions:
1236
+ # Find main pedestrian areas
1237
+ main_people_regions = sorted(people_regions.items(),
1238
+ key=lambda x: len(x[1]),
1239
+ reverse=True)[:2] # Top 2 regions
1240
+
1241
+ for idx, (region, objs) in enumerate(main_people_regions):
1242
+ if len(objs) > 0:
1243
+ zones[f"pedestrian_zone_{idx+1}"] = {
1244
+ "region": region,
1245
+ "objects": ["person"] * len(objs),
1246
+ "description": f"Pedestrian area with {len(objs)} {'people' if len(objs) > 1 else 'person'}"
1247
+ }
1248
+
1249
+ # Identify vehicle zones for streets and parking lots
1250
+ vehicle_objs = [obj for obj in detected_objects if obj["class_id"] in [1, 2, 3, 5, 6, 7]]
1251
+ if vehicle_objs:
1252
+ vehicle_regions = {}
1253
+ for obj in vehicle_objs:
1254
+ region = obj["region"]
1255
+ if region not in vehicle_regions:
1256
+ vehicle_regions[region] = []
1257
+ vehicle_regions[region].append(obj)
1258
+
1259
+ if vehicle_regions:
1260
+ main_vehicle_region = max(vehicle_regions.items(),
1261
+ key=lambda x: len(x[1]),
1262
+ default=(None, []))
1263
+
1264
+ if main_vehicle_region[0] is not None:
1265
+ vehicle_types = [obj["class_name"] for obj in main_vehicle_region[1]]
1266
+ zones["vehicle_zone"] = {
1267
+ "region": main_vehicle_region[0],
1268
+ "objects": vehicle_types,
1269
+ "description": f"Traffic area with {', '.join(list(set(vehicle_types))[:3])}"
1270
+ }
1271
+
1272
+ # For park areas, identify recreational zones
1273
+ if scene_type == "park_area":
1274
+ # Look for recreational objects (sports balls, kites, etc.)
1275
+ rec_items = []
1276
+ rec_regions = {}
1277
+
1278
+ for obj in detected_objects:
1279
+ if obj["class_id"] in [32, 33, 34, 35, 38]: # sports ball, kite, baseball bat, glove, tennis racket
1280
+ region = obj["region"]
1281
+ if region not in rec_regions:
1282
+ rec_regions[region] = []
1283
+ rec_regions[region].append(obj)
1284
+ rec_items.append(obj["class_name"])
1285
+
1286
+ if rec_items:
1287
+ main_rec_region = max(rec_regions.items(),
1288
+ key=lambda x: len(x[1]),
1289
+ default=(None, []))
1290
+
1291
+ if main_rec_region[0] is not None:
1292
+ zones["recreational_zone"] = {
1293
+ "region": main_rec_region[0],
1294
+ "objects": list(set(rec_items)),
1295
+ "description": f"Recreational area with {', '.join(list(set(rec_items)))}"
1296
+ }
1297
+
1298
+ # For parking lots, identify parking zones
1299
+ if scene_type == "parking_lot":
1300
+ # Look for parked cars with consistent spacing
1301
+ car_objs = [obj for obj in detected_objects if obj["class_id"] == 2] # cars
1302
+
1303
+ if len(car_objs) >= 3:
1304
+ # Check if cars are arranged in patterns (simplified)
1305
+ car_positions = [obj["normalized_center"] for obj in car_objs]
1306
+
1307
+ # Check for row patterns by analyzing vertical positions
1308
+ y_coords = [pos[1] for pos in car_positions]
1309
+ y_clusters = {}
1310
+
1311
+ # Simplified clustering - group cars by similar y-coordinates
1312
+ for i, y in enumerate(y_coords):
1313
+ assigned = False
1314
+ for cluster_y in y_clusters.keys():
1315
+ if abs(y - cluster_y) < 0.1: # Within 10% of image height
1316
+ y_clusters[cluster_y].append(i)
1317
+ assigned = True
1318
+ break
1319
+
1320
+ if not assigned:
1321
+ y_clusters[y] = [i]
1322
+
1323
+ # If we have row patterns
1324
+ if max(len(indices) for indices in y_clusters.values()) >= 2:
1325
+ zones["parking_row"] = {
1326
+ "region": "central",
1327
+ "objects": ["car"] * len(car_objs),
1328
+ "description": f"Organized parking area with vehicles arranged in rows"
1329
+ }
1330
+ else:
1331
+ zones["parking_area"] = {
1332
+ "region": "wide",
1333
+ "objects": ["car"] * len(car_objs),
1334
+ "description": f"Parking area with {len(car_objs)} vehicles"
1335
+ }
1336
+
1337
+ return zones
1338
+
1339
+ def _identify_default_zones(self, category_regions: Dict, detected_objects: List[Dict]) -> Dict:
1340
+ """
1341
+ Identify general functional zones when no specific scene type is matched.
1342
+
1343
+ Args:
1344
+ category_regions: Objects grouped by category and region
1345
+ detected_objects: List of detected objects
1346
+
1347
+ Returns:
1348
+ Dict: Default functional zones
1349
+ """
1350
+ zones = {}
1351
+
1352
+ # Group objects by category and find main concentrations
1353
+ for category, regions in category_regions.items():
1354
+ if not regions:
1355
+ continue
1356
+
1357
+ # Find region with most objects in this category
1358
+ main_region = max(regions.items(),
1359
+ key=lambda x: len(x[1]),
1360
+ default=(None, []))
1361
+
1362
+ if main_region[0] is None or len(main_region[1]) < 2:
1363
+ continue
1364
+
1365
+ # Create zone based on object category
1366
+ zone_objects = [obj["class_name"] for obj in main_region[1]]
1367
+
1368
+ # Skip if too few objects
1369
+ if len(zone_objects) < 2:
1370
+ continue
1371
+
1372
+ # Create appropriate zone name and description based on category
1373
+ if category == "furniture":
1374
+ zones["furniture_zone"] = {
1375
+ "region": main_region[0],
1376
+ "objects": zone_objects,
1377
+ "description": f"Area with furniture including {', '.join(zone_objects[:3])}"
1378
+ }
1379
+ elif category == "electronics":
1380
+ zones["electronics_zone"] = {
1381
+ "region": main_region[0],
1382
+ "objects": zone_objects,
1383
+ "description": f"Area with electronic devices including {', '.join(zone_objects[:3])}"
1384
+ }
1385
+ elif category == "kitchen_items":
1386
+ zones["dining_zone"] = {
1387
+ "region": main_region[0],
1388
+ "objects": zone_objects,
1389
+ "description": f"Dining or food area with {', '.join(zone_objects[:3])}"
1390
+ }
1391
+ elif category == "vehicles":
1392
+ zones["vehicle_zone"] = {
1393
+ "region": main_region[0],
1394
+ "objects": zone_objects,
1395
+ "description": f"Area with vehicles including {', '.join(zone_objects[:3])}"
1396
+ }
1397
+ elif category == "personal_items":
1398
+ zones["personal_items_zone"] = {
1399
+ "region": main_region[0],
1400
+ "objects": zone_objects,
1401
+ "description": f"Area with personal items including {', '.join(zone_objects[:3])}"
1402
+ }
1403
+
1404
+ # Check for people groups
1405
+ people_objs = [obj for obj in detected_objects if obj["class_id"] == 0]
1406
+ if len(people_objs) >= 2:
1407
+ people_regions = {}
1408
+ for obj in people_objs:
1409
+ region = obj["region"]
1410
+ if region not in people_regions:
1411
+ people_regions[region] = []
1412
+ people_regions[region].append(obj)
1413
+
1414
+ if people_regions:
1415
+ main_people_region = max(people_regions.items(),
1416
+ key=lambda x: len(x[1]),
1417
+ default=(None, []))
1418
+
1419
+ if main_people_region[0] is not None:
1420
+ zones["people_zone"] = {
1421
+ "region": main_people_region[0],
1422
+ "objects": ["person"] * len(main_people_region[1]),
1423
+ "description": f"Area with {len(main_people_region[1])} people"
1424
+ }
1425
+
1426
+ return zones
1427
+
1428
+ def _find_main_region(self, region_objects_dict: Dict) -> str:
1429
+ """Find the main region with the most objects"""
1430
+ if not region_objects_dict:
1431
+ return "unknown"
1432
+
1433
+ return max(region_objects_dict.items(),
1434
+ key=lambda x: len(x[1]),
1435
+ default=("unknown", []))[0]
1436
+
1437
+ def _find_main_region(self, region_objects_dict: Dict) -> str:
1438
+ """Find the main region with the most objects"""
1439
+ if not region_objects_dict:
1440
+ return "unknown"
1441
+
1442
+ return max(region_objects_dict.items(),
1443
+ key=lambda x: len(x[1]),
1444
+ default=("unknown", []))[0]
street_04.jpg ADDED

Git LFS Details

  • SHA256: 1eb06464cdb80a96171d511f985b57b79c32df0f6cae38dfbc08e5cd4fb0acec
  • Pointer size: 132 Bytes
  • Size of remote file: 5.61 MB
style.py CHANGED
@@ -1,7 +1,9 @@
 
1
  class Style:
 
2
  @staticmethod
3
  def get_css():
4
- """Return the application's CSS styles with improved aesthetics"""
5
  css = """
6
  /* Base styles and typography */
7
  body {
@@ -13,20 +15,20 @@ class Style:
13
  justify-content: center;
14
  min-height: 100vh;
15
  }
16
-
17
  /* Typography improvements */
18
  h1, h2, h3, h4, h5, h6, p, span, div, label, button {
19
  font-family: Arial, sans-serif;
20
  }
21
-
22
  /* Container styling */
23
  .gradio-container {
24
  max-width: 1200px !important;
25
- margin: 0 auto;
26
  padding: 1rem;
27
  width: 100%;
28
  }
29
-
30
  /* Header area styling with gradient background */
31
  .app-header {
32
  text-align: center;
@@ -37,7 +39,7 @@ class Style:
37
  box-shadow: 0 2px 10px rgba(0, 0, 0, 0.05);
38
  width: 100%;
39
  }
40
-
41
  .app-title {
42
  color: #2D3748;
43
  font-size: 2.5rem;
@@ -47,21 +49,21 @@ class Style:
47
  -webkit-text-fill-color: transparent;
48
  font-weight: bold;
49
  }
50
-
51
  .app-subtitle {
52
  color: #4A5568;
53
  font-size: 1.2rem;
54
  font-weight: normal;
55
  margin-top: 0.25rem;
56
  }
57
-
58
  .app-divider {
59
  width: 80px;
60
  height: 3px;
61
  background: linear-gradient(90deg, #38b2ac, #4299e1);
62
  margin: 1rem auto;
63
  }
64
-
65
  /* Panel styling - gradient background */
66
  .input-panel, .output-panel {
67
  background: white;
@@ -70,20 +72,20 @@ class Style:
70
  box-shadow: 0 2px 8px rgba(0, 0, 0, 0.08);
71
  margin: 0 auto 1rem auto;
72
  }
73
-
74
- /* Section heading styling with gradient background */
75
- .section-heading {
76
- font-size: 1.25rem;
77
- font-weight: 600;
78
- color: #2D3748;
79
- margin-bottom: 1rem;
80
- margin-top: 0.5rem;
81
- text-align: center;
82
- padding: 0.8rem;
83
- background: linear-gradient(to right, #e6f3fc, #f0f9ff);
84
- border-radius: 8px;
85
  }
86
-
 
 
 
 
 
87
  /* How-to-use section with gradient background */
88
  .how-to-use {
89
  background: linear-gradient(135deg, #f8fafc, #e8f4fd);
@@ -93,7 +95,7 @@ class Style:
93
  box-shadow: 0 2px 8px rgba(0, 0, 0, 0.05);
94
  color: #2d3748;
95
  }
96
-
97
  /* Detection button styling */
98
  .detect-btn {
99
  background: linear-gradient(90deg, #38b2ac, #4299e1) !important;
@@ -108,41 +110,40 @@ class Style:
108
  margin: 1rem auto !important;
109
  font-family: Arial, sans-serif !important;
110
  }
111
-
112
  .detect-btn:hover {
113
  transform: translateY(-2px) !important;
114
  box-shadow: 0 4px 8px rgba(0, 0, 0, 0.2) !important;
115
  }
116
-
117
  .detect-btn:active {
118
  transform: translateY(1px) !important;
119
  box-shadow: 0 2px 4px rgba(0, 0, 0, 0.2) !important;
120
  }
121
-
122
  /* JSON display improvements */
123
- .json-display pre {
124
- background: #f8fafc;
125
- border-radius: 6px;
126
- padding: 1rem;
127
- font-family: 'Consolas', 'Monaco', monospace;
128
- white-space: pre-wrap;
129
- max-height: 500px;
130
- overflow-y: auto;
131
- box-shadow: inset 0 0 4px rgba(0, 0, 0, 0.1);
132
  }
133
-
134
  .json-key {
135
  color: #e53e3e;
136
  }
137
-
138
  .json-value {
139
  color: #2b6cb0;
140
  }
141
-
142
  .json-string {
143
  color: #38a169;
144
  }
145
-
146
  /* Chart/plot styling improvements */
147
  .plot-container {
148
  background: white;
@@ -150,32 +151,39 @@ class Style:
150
  padding: 0.5rem;
151
  box-shadow: 0 2px 6px rgba(0, 0, 0, 0.05);
152
  }
153
-
154
  /* Larger font for plots */
155
  .plot-container text {
156
  font-family: Arial, sans-serif !important;
157
  font-size: 14px !important;
158
  }
159
-
160
  /* Title styling for charts */
161
  .plot-title {
162
  font-family: Arial, sans-serif !important;
163
  font-size: 16px !important;
164
  font-weight: bold !important;
165
  }
166
-
167
  /* Tab styling with subtle gradient */
168
  .tabs {
169
  width: 100%;
170
  display: flex;
171
  justify-content: center;
172
  }
173
-
174
  .tabs > div:first-child {
175
  background: linear-gradient(to right, #f8fafc, #e8f4fd) !important;
176
  border-radius: 8px 8px 0 0;
177
  }
178
-
 
 
 
 
 
 
 
179
  /* Footer styling with gradient background */
180
  .footer {
181
  text-align: center;
@@ -188,7 +196,7 @@ class Style:
188
  box-shadow: 0 2px 8px rgba(0, 0, 0, 0.05);
189
  width: 100%;
190
  }
191
-
192
  /* Ensure centering works for all elements */
193
  .container, .gr-container, .gr-row, .gr-col {
194
  display: flex;
@@ -197,86 +205,175 @@ class Style:
197
  justify-content: center;
198
  width: 100%;
199
  }
200
-
201
- /* 結果文本框的改進樣式 */
202
- #detection-details, .wide-result-text {
203
  width: 100% !important;
204
  max-width: 100% !important;
 
205
  box-sizing: border-box !important;
206
  }
207
-
208
- .wide-result-text textarea {
 
209
  width: 100% !important;
210
- min-width: 600px !important;
 
 
 
211
  font-family: 'Arial', sans-serif !important;
212
  font-size: 14px !important;
213
- line-height: 1.5 !important; /* 減少行間距 */
 
 
 
 
 
 
 
 
 
 
214
  padding: 16px !important;
 
215
  white-space: pre-wrap !important;
216
- background-color: #f8f9fa !important;
217
  border-radius: 8px !important;
218
- min-height: 300px !important;
219
- resize: none !important;
220
  overflow-y: auto !important;
221
  border: 1px solid #e2e8f0 !important;
 
222
  display: block !important;
 
 
 
223
  }
224
-
225
- /* 結果詳情面板樣式 - 加入漸層背景 */
226
- .result-details-box {
227
- width: 100% !important;
228
- margin-top: 1.5rem;
229
- background: linear-gradient(135deg, #f8fafc, #e8f4fd);
230
- border-radius: 10px;
231
- padding: 1rem;
232
- box-shadow: 0 2px 8px rgba(0, 0, 0, 0.05);
233
- }
234
-
235
- /* 確保結果詳情面板內的元素寬度可以適應面板 */
236
- .result-details-box > * {
237
  width: 100% !important;
238
  max-width: 100% !important;
 
 
 
 
 
 
 
 
 
 
 
 
 
239
  }
240
-
241
- /* 確保文本區域不會被限制寬度 */
242
- .result-details-box .gr-text-input {
243
  width: 100% !important;
244
- max-width: none !important;
 
 
 
 
 
245
  }
246
-
247
- /* 輸出面板內容的布局調整 */
248
- .output-panel {
249
- display: flex;
250
- flex-direction: column;
251
- width: 100%;
252
  padding: 0 !important;
 
253
  }
254
-
255
- /* 確保結果面板內的元素寬度可以適應面板 */
256
- .output-panel > * {
257
- width: 100%;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
258
  }
259
-
260
- /* 改善統計面板列佈局 */
261
  .plot-column, .stats-column {
262
  display: flex;
263
  flex-direction: column;
264
  padding: 1rem;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
265
  }
266
-
267
- /* Responsive adjustments */
 
 
 
 
 
 
 
268
  @media (max-width: 768px) {
269
  .app-title {
270
  font-size: 2rem;
271
  }
272
-
273
  .app-subtitle {
274
  font-size: 1rem;
275
  }
276
-
277
  .gradio-container {
278
  padding: 0.5rem;
279
  }
 
 
 
 
 
280
  }
281
  """
282
  return css
 
1
+
2
  class Style:
3
+
4
  @staticmethod
5
  def get_css():
6
+
7
  css = """
8
  /* Base styles and typography */
9
  body {
 
15
  justify-content: center;
16
  min-height: 100vh;
17
  }
18
+
19
  /* Typography improvements */
20
  h1, h2, h3, h4, h5, h6, p, span, div, label, button {
21
  font-family: Arial, sans-serif;
22
  }
23
+
24
  /* Container styling */
25
  .gradio-container {
26
  max-width: 1200px !important;
27
+ margin: auto !important;
28
  padding: 1rem;
29
  width: 100%;
30
  }
31
+
32
  /* Header area styling with gradient background */
33
  .app-header {
34
  text-align: center;
 
39
  box-shadow: 0 2px 10px rgba(0, 0, 0, 0.05);
40
  width: 100%;
41
  }
42
+
43
  .app-title {
44
  color: #2D3748;
45
  font-size: 2.5rem;
 
49
  -webkit-text-fill-color: transparent;
50
  font-weight: bold;
51
  }
52
+
53
  .app-subtitle {
54
  color: #4A5568;
55
  font-size: 1.2rem;
56
  font-weight: normal;
57
  margin-top: 0.25rem;
58
  }
59
+
60
  .app-divider {
61
  width: 80px;
62
  height: 3px;
63
  background: linear-gradient(90deg, #38b2ac, #4299e1);
64
  margin: 1rem auto;
65
  }
66
+
67
  /* Panel styling - gradient background */
68
  .input-panel, .output-panel {
69
  background: white;
 
72
  box-shadow: 0 2px 8px rgba(0, 0, 0, 0.08);
73
  margin: 0 auto 1rem auto;
74
  }
75
+
76
+ /* 修改輸出面板確保內容能夠完整顯示 */
77
+ .output-panel {
78
+ display: flex;
79
+ flex-direction: column;
80
+ width: 100%;
81
+ padding: 0 !important;
 
 
 
 
 
82
  }
83
+
84
+ /* 確保輸出面板內的元素寬度可以適應面板 */
85
+ .output-panel > * {
86
+ width: 100%;
87
+ }
88
+
89
  /* How-to-use section with gradient background */
90
  .how-to-use {
91
  background: linear-gradient(135deg, #f8fafc, #e8f4fd);
 
95
  box-shadow: 0 2px 8px rgba(0, 0, 0, 0.05);
96
  color: #2d3748;
97
  }
98
+
99
  /* Detection button styling */
100
  .detect-btn {
101
  background: linear-gradient(90deg, #38b2ac, #4299e1) !important;
 
110
  margin: 1rem auto !important;
111
  font-family: Arial, sans-serif !important;
112
  }
113
+
114
  .detect-btn:hover {
115
  transform: translateY(-2px) !important;
116
  box-shadow: 0 4px 8px rgba(0, 0, 0, 0.2) !important;
117
  }
118
+
119
  .detect-btn:active {
120
  transform: translateY(1px) !important;
121
  box-shadow: 0 2px 4px rgba(0, 0, 0, 0.2) !important;
122
  }
123
+
124
  /* JSON display improvements */
125
+ .json-display {
126
+ width: 98% !important;
127
+ margin: 0.5rem auto 1.5rem auto !important;
128
+ padding: 1rem !important;
129
+ border-radius: 8px !important;
130
+ background-color: white !important;
131
+ border: 1px solid #E2E8F0 !important;
132
+ box-shadow: inset 0 1px 2px rgba(0, 0, 0, 0.05) !important;
 
133
  }
134
+
135
  .json-key {
136
  color: #e53e3e;
137
  }
138
+
139
  .json-value {
140
  color: #2b6cb0;
141
  }
142
+
143
  .json-string {
144
  color: #38a169;
145
  }
146
+
147
  /* Chart/plot styling improvements */
148
  .plot-container {
149
  background: white;
 
151
  padding: 0.5rem;
152
  box-shadow: 0 2px 6px rgba(0, 0, 0, 0.05);
153
  }
154
+
155
  /* Larger font for plots */
156
  .plot-container text {
157
  font-family: Arial, sans-serif !important;
158
  font-size: 14px !important;
159
  }
160
+
161
  /* Title styling for charts */
162
  .plot-title {
163
  font-family: Arial, sans-serif !important;
164
  font-size: 16px !important;
165
  font-weight: bold !important;
166
  }
167
+
168
  /* Tab styling with subtle gradient */
169
  .tabs {
170
  width: 100%;
171
  display: flex;
172
  justify-content: center;
173
  }
174
+
175
  .tabs > div:first-child {
176
  background: linear-gradient(to right, #f8fafc, #e8f4fd) !important;
177
  border-radius: 8px 8px 0 0;
178
  }
179
+
180
+ /* Tab content styling - 確保內容區域有足夠寬度 */
181
+ .tab-content {
182
+ width: 100% !important;
183
+ box-sizing: border-box !important;
184
+ padding: 0 !important;
185
+ }
186
+
187
  /* Footer styling with gradient background */
188
  .footer {
189
  text-align: center;
 
196
  box-shadow: 0 2px 8px rgba(0, 0, 0, 0.05);
197
  width: 100%;
198
  }
199
+
200
  /* Ensure centering works for all elements */
201
  .container, .gr-container, .gr-row, .gr-col {
202
  display: flex;
 
205
  justify-content: center;
206
  width: 100%;
207
  }
208
+
209
+ /* 統一文本框樣式,確保寬度一致 */
210
+ .gr-textbox, .gr-textarea, .gr-text-input {
211
  width: 100% !important;
212
  max-width: 100% !important;
213
+ min-width: 100% !important;
214
  box-sizing: border-box !important;
215
  }
216
+
217
+ /* 確保文本區域可以適應容器寬度 */
218
+ textarea.gr-textarea, .gr-textbox textarea, .gr-text-input textarea {
219
  width: 100% !important;
220
+ max-width: 100% !important;
221
+ min-width: 100% !important;
222
+ box-sizing: border-box !important;
223
+ padding: 16px !important;
224
  font-family: 'Arial', sans-serif !important;
225
  font-size: 14px !important;
226
+ line-height: 1.6 !important;
227
+ white-space: pre-wrap !important;
228
+ word-wrap: break-word !important;
229
+ word-break: normal !important;
230
+ }
231
+
232
+ /* 特別針對場景描述文本框樣式增強 */
233
+ #scene-description-text, #detection-details {
234
+ width: 100% !important;
235
+ min-width: 100% !important;
236
+ box-sizing: border-box !important;
237
  padding: 16px !important;
238
+ line-height: 1.8 !important;
239
  white-space: pre-wrap !important;
240
+ word-wrap: break-word !important;
241
  border-radius: 8px !important;
242
+ min-height: 250px !important;
 
243
  overflow-y: auto !important;
244
  border: 1px solid #e2e8f0 !important;
245
+ background-color: white !important;
246
  display: block !important;
247
+ font-family: 'Arial', sans-serif !important;
248
+ font-size: 14px !important;
249
+ margin: 0 !important;
250
  }
251
+
252
+ /* 針對場景描述容器的樣式 */
253
+ .scene-description-container {
 
 
 
 
 
 
 
 
 
 
254
  width: 100% !important;
255
  max-width: 100% !important;
256
+ box-sizing: border-box !important;
257
+ padding: 0 !important;
258
+ margin: 0 !important;
259
+ }
260
+
261
+ /* Scene Understanding Tab 特定樣式 */
262
+ .scene-understanding-tab .result-details-box {
263
+ display: flex !important;
264
+ flex-direction: column !important;
265
+ align-items: stretch !important;
266
+ width: 100% !important;
267
+ box-sizing: border-box !important;
268
+ padding: 0 !important;
269
  }
270
+
271
+ /* 結果容器樣式 */
272
+ .result-container {
273
  width: 100% !important;
274
+ padding: 1rem !important;
275
+ border-radius: 8px !important;
276
+ border: 1px solid #E2E8F0 !important;
277
+ margin-bottom: 1.5rem !important;
278
+ background-color: #F8FAFC !important;
279
+ box-sizing: border-box !important;
280
  }
281
+
282
+ /* 結果文本框的樣式 */
283
+ .wide-result-text {
284
+ width: 100% !important;
285
+ min-width: 100% !important;
286
+ box-sizing: border-box !important;
287
  padding: 0 !important;
288
+ margin: 0 !important;
289
  }
290
+
291
+ /* 片段標題樣式 */
292
+ .section-heading {
293
+ font-size: 1.25rem !important;
294
+ font-weight: 600 !important;
295
+ color: #2D3748 !important;
296
+ margin: 1rem auto !important;
297
+ padding: 0.75rem 1rem !important;
298
+ background: linear-gradient(to right, #e6f3fc, #f0f9ff) !important;
299
+ border-radius: 8px !important;
300
+ width: 98% !important;
301
+ display: inline-block !important;
302
+ box-sizing: border-box !important;
303
+ text-align: center !important;
304
+ overflow: visible !important;
305
+ line-height: 1.5 !important;
306
+ box-shadow: 0 1px 3px rgba(0, 0, 0, 0.1) !important;
307
+ }
308
+
309
+ /* JSON 顯示區域樣式 */
310
+ .json-box {
311
+ width: 100% !important;
312
+ min-height: 200px !important;
313
+ overflow-y: auto !important;
314
+ background: white !important;
315
+ padding: 1rem !important;
316
+ border-radius: 8px !important;
317
+ box-shadow: inset 0 0 6px rgba(0, 0, 0, 0.1) !important;
318
+ font-family: monospace !important;
319
+ box-sizing: border-box !important;
320
  }
321
+
322
+ /* 欄佈局調整 */
323
  .plot-column, .stats-column {
324
  display: flex;
325
  flex-direction: column;
326
  padding: 1rem;
327
+ box-sizing: border-box !important;
328
+ width: 100% !important;
329
+ }
330
+
331
+ /* statistics plot */
332
+ .large-plot-container {
333
+ width: 100% !important;
334
+ min-height: 400px !important;
335
+ box-sizing: border-box !important;
336
+ }
337
+
338
+ /* 增強 JSON 顯示 */
339
+ .enhanced-json-display {
340
+ background: white !important;
341
+ border-radius: 8px !important;
342
+ padding: 1rem !important;
343
+ box-shadow: inset 0 0 6px rgba(0, 0, 0, 0.1) !important;
344
+ width: 100% !important;
345
+ min-height: 300px !important;
346
+ max-height: 500px !important;
347
+ overflow-y: auto !important;
348
+ font-family: monospace !important;
349
+ box-sizing: border-box !important;
350
  }
351
+
352
+ /* 確保全寬元素真正占滿整個寬度 */
353
+ .full-width-element {
354
+ width: 100% !important;
355
+ max-width: 100% !important;
356
+ box-sizing: border-box !important;
357
+ }
358
+
359
+ /* 響應式調整 */
360
  @media (max-width: 768px) {
361
  .app-title {
362
  font-size: 2rem;
363
  }
364
+
365
  .app-subtitle {
366
  font-size: 1rem;
367
  }
368
+
369
  .gradio-container {
370
  padding: 0.5rem;
371
  }
372
+
373
+ /* 在小螢幕上調整文本區域的高度 */
374
+ #scene-description-text, #detection-details {
375
+ min-height: 150px !important;
376
+ }
377
  }
378
  """
379
  return css
viewpoint_templates.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ VIEWPOINT_TEMPLATES = {
3
+ "eye_level": {
4
+ "prefix": "From a standard eye-level perspective, ",
5
+ "observation": "the scene shows {scene_elements} arranged in a typical front-facing view."
6
+ },
7
+ "aerial": {
8
+ "prefix": "From an aerial perspective, ",
9
+ "observation": "the scene shows {scene_elements} as viewed from above, revealing the spatial layout."
10
+ },
11
+ "elevated": {
12
+ "prefix": "From an elevated viewpoint, ",
13
+ "observation": "the scene presents {scene_elements} with a slight downward angle."
14
+ },
15
+ "low_angle": {
16
+ "prefix": "From a low angle, ",
17
+ "observation": "the scene depicts {scene_elements} from below, emphasizing vertical elements."
18
+ }
19
+ }
visualization_helper.py CHANGED
@@ -74,7 +74,7 @@ class VisualizationHelper:
74
  for box, cls, conf in zip(boxes, classes, confs):
75
  x1, y1, x2, y2 = box
76
  cls_id = int(cls)
77
-
78
  if filter_classes and cls_id not in filter_classes:
79
  continue
80
 
 
74
  for box, cls, conf in zip(boxes, classes, confs):
75
  x1, y1, x2, y2 = box
76
  cls_id = int(cls)
77
+
78
  if filter_classes and cls_id not in filter_classes:
79
  continue
80