Spaces:
Running
on
Zero
Running
on
Zero
Upload 27 files
Browse files- .gitattributes +2 -0
- activity_templates.py +324 -0
- app.py +186 -53
- clip_analyzer.py +389 -0
- clip_prompts.py +265 -0
- color_mapper.py +47 -46
- confifence_templates.py +6 -0
- cultural_templates.py +19 -0
- detection_model.py +26 -26
- enhance_scene_describer.py +1314 -0
- image_processor.py +140 -64
- lighting_analyzer.py +811 -0
- lighting_conditions.py +131 -0
- object_categories.py +8 -0
- object_template_fillers.py +78 -0
- requirements.txt +1 -0
- room_02.jpg +3 -0
- safety_templates.py +5 -0
- scene_analyzer.py +408 -0
- scene_description.py +401 -0
- scene_detail_templates.py +203 -0
- scene_type.py +394 -0
- spatial_analyzer.py +1444 -0
- street_04.jpg +3 -0
- style.py +185 -88
- viewpoint_templates.py +19 -0
- visualization_helper.py +1 -1
.gitattributes
CHANGED
@@ -37,3 +37,5 @@ room_01.jpg filter=lfs diff=lfs merge=lfs -text
|
|
37 |
street_01.jpg filter=lfs diff=lfs merge=lfs -text
|
38 |
street_02.jpg filter=lfs diff=lfs merge=lfs -text
|
39 |
street_03.jpg filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
37 |
street_01.jpg filter=lfs diff=lfs merge=lfs -text
|
38 |
street_02.jpg filter=lfs diff=lfs merge=lfs -text
|
39 |
street_03.jpg filter=lfs diff=lfs merge=lfs -text
|
40 |
+
room_02.jpg filter=lfs diff=lfs merge=lfs -text
|
41 |
+
street_04.jpg filter=lfs diff=lfs merge=lfs -text
|
activity_templates.py
ADDED
@@ -0,0 +1,324 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
# Mapping from scene-type identifier to a list of human-readable activities
# that are plausible in that scene. Consumed by the scene-analysis pipeline
# to populate the "possible_activities" field of an analysis result.
ACTIVITY_TEMPLATES = {
    "living_room": [
        "Watching TV",
        "Relaxing on the sofa",
        "Reading",
        "Socializing"
    ],
    "bedroom": [
        "Sleeping",
        "Resting",
        "Getting dressed",
        "Reading in bed"
    ],
    "dining_area": [
        "Eating a meal",
        "Having a conversation",
        "Working at table"
    ],
    "kitchen": [
        "Cooking",
        "Food preparation",
        "Cleaning dishes"
    ],
    "office_workspace": [
        "Working on computer",
        "Office work",
        "Virtual meetings",
        "Reading documents"
    ],
    "meeting_room": [
        "Group meeting",
        "Presentation",
        "Team discussion",
        "Collaboration"
    ],
    "city_street": [
        "Walking",
        "Commuting",
        "Shopping",
        "Waiting for transportation"
    ],
    "parking_lot": [
        "Parking vehicles",
        "Loading/unloading items",
        "Entering/exiting vehicles"
    ],
    "park_area": [
        "Walking",
        "Relaxing outdoors",
        "Exercising",
        "Social gathering"
    ],
    "retail_store": [
        "Shopping",
        "Browsing products",
        "Purchasing items"
    ],
    "supermarket": [
        "Grocery shopping",
        "Selecting products",
        "Checking out"
    ],
    "upscale_dining": [
        "Fine dining",
        "Social gathering",
        "Special occasion meal",
        "Family dinner",
        "Business meeting",
        "Celebratory meal"
    ],
    "asian_commercial_street": [
        "Shopping",
        "Sightseeing",
        "Walking to destinations",
        "Visiting local shops",
        "Cultural exploration",
        "Urban commuting",
        "Meeting friends"
    ],
    "financial_district": [
        "Commuting",
        "Business travel",
        "Urban transit",
        "Sightseeing",
        "City navigation",
        "Professional activities",
        "Corporate meetings"
    ],
    "urban_intersection": [
        "Street crossing",
        "Waiting for signals",
        "Urban navigation",
        "Commuting",
        "Group movement",
        "Following traffic patterns",
        "Pedestrian coordination"
    ],
    "transit_hub": [
        "Commuting",
        "Waiting for transportation",
        "Transferring between vehicles",
        "Starting/ending journeys",
        "Meeting travelers",
        "Checking transit schedules",
        "Urban transportation"
    ],
    "shopping_district": [
        "Retail shopping",
        "Window browsing",
        "Social shopping",
        "Product comparison",
        "Making purchases",
        "Brand exploration",
        "Recreational shopping"
    ],
    "bus_stop": [
        "Waiting for the bus",
        "Checking schedules",
        "Boarding or alighting",
        "Standing under shelter"
    ],
    "bus_station": [
        "Navigating between platforms",
        "Handling luggage",
        "Boarding buses",
        "Gathering at waiting areas"
    ],
    "zoo": [
        "Watching animal exhibits",
        "Taking photos of wildlife",
        "Walking along enclosures",
        "Reading informational signs"
    ],
    "harbor": [
        "Observing docked boats",
        "Commuting by watercraft",
        "Loading or unloading cargo",
        "Strolling along the pier"
    ],
    "playground": [
        "Playing ball games",
        "Swinging or sliding",
        "Running around",
        "Socializing with friends"
    ],
    "sports_field": [
        "Practicing ball drills",
        "Competing in matches",
        "Warming up or stretching",
        "Team training sessions"
    ],
    "narrow_commercial_alley": [
        "Walking through alley",
        "Browsing storefronts",
        "Navigating light traffic",
        "Carrying shopping bags"
    ],
    "daytime_shopping_street": [
        "Shopping",
        "Window browsing",
        "Street photography",
        "Commuting by vehicle"
    ],
    "urban_pedestrian_crossing": [
        "Crossing the street",
        "Waiting for signal",
        "Following traffic rules",
        "Checking for vehicles"
    ],
    "aerial_view_intersection": [
        "Crossing multiple directions",
        "Following traffic signals",
        "Navigating pedestrian paths",
        "Traffic management",
        "Multi-directional movement",
        "Organized crossing patterns",
        "Waiting at signals"
    ],
    "aerial_view_commercial_area": [
        "Shopping district navigation",
        "Retail browsing",
        "Store-to-store movement",
        "Commercial zone foot traffic",
        "Shopping center traversal",
        "Retail area engagement",
        "Walking between stores"
    ],
    "aerial_view_plaza": [
        "Public gathering",
        "Open space traversal",
        "Community congregation",
        "Plaza navigation",
        "Public square activities",
        "Urban space utilization"
    ],
    "asian_night_market": [
        "Street food sampling",
        "Night market browsing",
        "Evening shopping",
        "Cultural food exploration",
        "Vendor interaction",
        "Social night dining",
        "Market stall hopping"
    ],
    "asian_temple_area": [
        "Temple visiting",
        "Cultural site exploration",
        "Spiritual observance",
        "Traditional rituals",
        "Historical site appreciation",
        "Religious tourism",
        "Cultural photography"
    ],
    "european_plaza": [
        "Urban sightseeing",
        "Historical appreciation",
        "Tourist photography",
        "Public space relaxation",
        "Casual strolling"
    ],
    "nighttime_street": [
        "Evening commuting",
        "Night walking",
        "After-hours travel",
        "Nighttime navigation",
        "Evening errands",
        "Late-night transportation",
        "Nocturnal urban movement"
    ],
    "nighttime_commercial_district": [
        "Evening shopping",
        "Nightlife participation",
        "Nighttime entertainment",
        "After-dark dining",
        "Evening social gathering",
        "Night market browsing",
        "Illumination appreciation"
    ],
    "indoor_outdoor_cafe": [
        "Al fresco dining",
        "Sidewalk coffee enjoyment",
        "Indoor-outdoor socializing",
        "Patio relaxation",
        "Open-air refreshment",
        "Transitional space usage",
        "Weather-dependent positioning"
    ],
    "transit_station_platform": [
        "Transit waiting",
        "Platform navigation",
        "Boarding preparation",
        "Arrival monitoring",
        "Schedule checking",
        "Departure positioning",
        "Platform traversal"
    ],
    "sports_stadium": [
        "Spectator viewing",
        "Sports fan cheering",
        "Game attendance",
        "Stadium navigation",
        "Athletic event watching",
        "Audience participation",
        "Sports appreciation"
    ],
    "construction_site": [
        "Construction work",
        "Building development",
        "Site management",
        "Material handling",
        "Construction supervision",
        "Safety monitoring",
        "Building process"
    ],
    "medical_facility": [
        "Healthcare consultation",
        "Medical treatment",
        "Patient waiting",
        "Healthcare delivery",
        "Medical examination",
        "Professional care",
        "Health monitoring"
    ],
    "educational_setting": [
        "Classroom learning",
        "Educational instruction",
        "Student participation",
        "Academic engagement",
        "Knowledge acquisition",
        "Educational discussion",
        "Scholastic activities"
    ],
    "beach_water_recreation": [
        "Surfing",
        "Sunbathing",
        "Beach volleyball",
        "Swimming",
        "Relaxing by the water",
        "Flying beach kites",
        "Beach picnicking",
        "Coastal walking"
    ],
    "sports_venue": [
        "Professional game playing",
        "Sports competition",
        "Athletic training",
        "Team practice",
        "Spectator viewing",
        "Sports coaching",
        "Tournament participation",
        "Athletic performance"
    ],
    "professional_kitchen": [
        "Professional cooking",
        "Food preparation",
        "Meal service coordination",
        "Kitchen operations",
        "Culinary production",
        "Chef activities",
        "Commercial food handling",
        "Restaurant meal preparation"
    ]
}
|
app.py
CHANGED
@@ -63,48 +63,102 @@ def process_and_plot(image, model_name, confidence_threshold, filter_classes=Non
|
|
63 |
filter_classes: Optional list of classes to filter results
|
64 |
|
65 |
Returns:
|
66 |
-
Tuple of
|
67 |
"""
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
92 |
fig, ax = plt.subplots(figsize=(8, 6))
|
93 |
-
ax.text(0.5, 0.5, "
|
94 |
-
ha='center', va='center', fontsize=14, fontfamily='Arial')
|
95 |
ax.set_xlim(0, 1)
|
96 |
ax.set_ylim(0, 1)
|
97 |
ax.axis('off')
|
98 |
-
plot_figure = fig
|
99 |
-
else:
|
100 |
-
# Prepare visualization data
|
101 |
-
available_classes = dict(get_all_classes())
|
102 |
-
viz_data = image_processor.prepare_visualization_data(stats, available_classes)
|
103 |
-
|
104 |
-
# Create plot
|
105 |
-
plot_figure = EvaluationMetrics.create_enhanced_stats_plot(viz_data)
|
106 |
|
107 |
-
|
|
|
108 |
|
109 |
def create_interface():
|
110 |
"""創建 Gradio 界面,包含美化的視覺效果"""
|
@@ -121,19 +175,43 @@ def create_interface():
|
|
121 |
|
122 |
# 創建 Gradio Blocks 界面
|
123 |
with gr.Blocks(css=css, theme=gr.themes.Soft(primary_hue="teal", secondary_hue="blue")) as demo:
|
124 |
-
#
|
125 |
with gr.Group(elem_classes="app-header"):
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
133 |
|
134 |
current_model = gr.State("yolov8m.pt") # use medium size model as defualt
|
135 |
|
136 |
-
# 主要內容區
|
137 |
with gr.Row(equal_height=True):
|
138 |
# 左側 - 輸入控制區(可上傳圖片)
|
139 |
with gr.Column(scale=4, elem_classes="input-panel"):
|
@@ -208,8 +286,8 @@ def create_interface():
|
|
208 |
# 文本框設置,讓顯示會更寬
|
209 |
result_text = gr.Textbox(
|
210 |
label=None,
|
211 |
-
lines=
|
212 |
-
max_lines=
|
213 |
elem_classes="wide-result-text",
|
214 |
elem_id="detection-details",
|
215 |
container=False,
|
@@ -217,6 +295,57 @@ def create_interface():
|
|
217 |
min_width=600
|
218 |
)
|
219 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
220 |
with gr.Tab("Statistics"):
|
221 |
with gr.Row():
|
222 |
with gr.Column(scale=3, elem_classes="plot-column"):
|
@@ -235,10 +364,14 @@ def create_interface():
|
|
235 |
)
|
236 |
|
237 |
detect_btn.click(
|
238 |
-
|
239 |
-
|
240 |
-
|
241 |
-
|
|
|
|
|
|
|
|
|
242 |
|
243 |
# model option
|
244 |
model_dropdown.change(
|
@@ -276,9 +409,9 @@ def create_interface():
|
|
276 |
|
277 |
example_images = [
|
278 |
"room_01.jpg",
|
279 |
-
"
|
280 |
"street_02.jpg",
|
281 |
-
"
|
282 |
]
|
283 |
|
284 |
# add example images
|
|
|
63 |
filter_classes: Optional list of classes to filter results
|
64 |
|
65 |
Returns:
|
66 |
+
Tuple of results including lighting conditions
|
67 |
"""
|
68 |
+
try:
|
69 |
+
class_ids = None
|
70 |
+
if filter_classes:
|
71 |
+
class_ids = []
|
72 |
+
for class_str in filter_classes:
|
73 |
+
try:
|
74 |
+
# Extract ID from format "id: name"
|
75 |
+
class_id = int(class_str.split(":")[0].strip())
|
76 |
+
class_ids.append(class_id)
|
77 |
+
except:
|
78 |
+
continue
|
79 |
+
|
80 |
+
# Execute detection
|
81 |
+
result_image, result_text, stats = image_processor.process_image(
|
82 |
+
image,
|
83 |
+
model_name,
|
84 |
+
confidence_threshold,
|
85 |
+
class_ids
|
86 |
+
)
|
87 |
+
|
88 |
+
# Format the statistics for better display
|
89 |
+
formatted_stats = image_processor.format_json_for_display(stats)
|
90 |
+
|
91 |
+
if not stats or "class_statistics" not in stats or not stats["class_statistics"]:
|
92 |
+
# Create the table
|
93 |
+
fig, ax = plt.subplots(figsize=(8, 6))
|
94 |
+
ax.text(0.5, 0.5, "No detection data available",
|
95 |
+
ha='center', va='center', fontsize=14, fontfamily='Arial')
|
96 |
+
ax.set_xlim(0, 1)
|
97 |
+
ax.set_ylim(0, 1)
|
98 |
+
ax.axis('off')
|
99 |
+
plot_figure = fig
|
100 |
+
else:
|
101 |
+
# Prepare visualization data
|
102 |
+
available_classes = dict(get_all_classes())
|
103 |
+
viz_data = image_processor.prepare_visualization_data(stats, available_classes)
|
104 |
+
|
105 |
+
# Create plot
|
106 |
+
plot_figure = EvaluationMetrics.create_enhanced_stats_plot(viz_data)
|
107 |
+
|
108 |
+
# Extract scene analysis info
|
109 |
+
scene_analysis = stats.get("scene_analysis", {})
|
110 |
+
|
111 |
+
scene_desc = scene_analysis.get("description", "No scene analysis available.")
|
112 |
+
scene_desc = scene_desc.strip()
|
113 |
+
|
114 |
+
# HTML format
|
115 |
+
scene_desc_html = f"""
|
116 |
+
<div id='scene-desc-container' style='width:100%; padding:20px; text-align:center; background-color:#f5f9fc; border-radius:8px; margin:10px auto; min-height:200px; max-height:none; overflow-y:auto;'>
|
117 |
+
<div style='width:100%; text-align:center; margin:0 auto; font-family:Arial, sans-serif; font-size:14px; line-height:1.8;'>
|
118 |
+
{scene_desc}
|
119 |
+
</div>
|
120 |
+
</div>
|
121 |
+
"""
|
122 |
+
|
123 |
+
# Extract lighting conditions
|
124 |
+
lighting_conditions = scene_analysis.get("lighting_conditions",
|
125 |
+
{"time_of_day": "unknown", "confidence": 0.0})
|
126 |
+
|
127 |
+
# 準備活動列表
|
128 |
+
activities = scene_analysis.get("possible_activities", [])
|
129 |
+
if not activities:
|
130 |
+
activities_data = [["No activities detected"]]
|
131 |
+
else:
|
132 |
+
activities_data = [[activity] for activity in activities]
|
133 |
+
|
134 |
+
# 準備安全注意事項列表
|
135 |
+
safety_concerns = scene_analysis.get("safety_concerns", [])
|
136 |
+
if not safety_concerns:
|
137 |
+
safety_data = [["No safety concerns detected"]]
|
138 |
+
else:
|
139 |
+
safety_data = [[concern] for concern in safety_concerns]
|
140 |
+
|
141 |
+
# 功能區域
|
142 |
+
zones = scene_analysis.get("functional_zones", {})
|
143 |
+
|
144 |
+
return result_image, result_text, formatted_stats, plot_figure, scene_desc, activities_data, safety_data, zones, lighting_conditions
|
145 |
+
|
146 |
+
except Exception as e:
|
147 |
+
# 添加錯誤處理,確保即使出錯也能返回有效的數據
|
148 |
+
import traceback
|
149 |
+
error_msg = f"Error processing image: {str(e)}\n{traceback.format_exc()}"
|
150 |
+
print(error_msg)
|
151 |
+
|
152 |
+
# 創建一個簡單的錯誤圖
|
153 |
fig, ax = plt.subplots(figsize=(8, 6))
|
154 |
+
ax.text(0.5, 0.5, f"Error: {str(e)}",
|
155 |
+
ha='center', va='center', fontsize=14, fontfamily='Arial', color='red')
|
156 |
ax.set_xlim(0, 1)
|
157 |
ax.set_ylim(0, 1)
|
158 |
ax.axis('off')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
159 |
|
160 |
+
# 返回有效的默認值
|
161 |
+
return None, error_msg, "{}", fig, "Error processing image", [["No activities"]], [["No safety concerns"]], {}, {"time_of_day": "unknown", "confidence": 0}
|
162 |
|
163 |
def create_interface():
|
164 |
"""創建 Gradio 界面,包含美化的視覺效果"""
|
|
|
175 |
|
176 |
# 創建 Gradio Blocks 界面
|
177 |
with gr.Blocks(css=css, theme=gr.themes.Soft(primary_hue="teal", secondary_hue="blue")) as demo:
|
178 |
+
# 主頁頂部的標題
|
179 |
with gr.Group(elem_classes="app-header"):
|
180 |
+
gr.HTML("""
|
181 |
+
<div style="text-align: center; width: 100%; padding: 2rem 0 3rem 0; background: linear-gradient(135deg, #f0f9ff, #e1f5fe);">
|
182 |
+
<h1 style="font-size: 3.5rem; margin-bottom: 0.5rem; background: linear-gradient(90deg, #38b2ac, #4299e1); -webkit-background-clip: text; -webkit-text-fill-color: transparent; font-weight: bold; font-family: 'Arial', sans-serif;">VisionScout</h1>
|
183 |
+
|
184 |
+
<h2 style="color: #4A5568; font-size: 1.2rem; font-weight: 400; margin-top: 0.5rem; margin-bottom: 1.5rem; font-family: 'Arial', sans-serif;">Detect and identify objects in your images</h2>
|
185 |
+
|
186 |
+
<div style="display: flex; justify-content: center; gap: 10px; margin: 0.5rem 0;">
|
187 |
+
<div style="height: 3px; width: 80px; background: linear-gradient(90deg, #38b2ac, #4299e1);"></div>
|
188 |
+
</div>
|
189 |
+
|
190 |
+
<div style="display: flex; justify-content: center; gap: 25px; margin-top: 1.5rem;">
|
191 |
+
<div style="padding: 8px 15px; border-radius: 20px; background: rgba(66, 153, 225, 0.15); color: #2b6cb0; font-weight: 500; font-size: 0.9rem;">
|
192 |
+
<span style="margin-right: 6px;">🔍</span> Object Detection
|
193 |
+
</div>
|
194 |
+
<div style="padding: 8px 15px; border-radius: 20px; background: rgba(56, 178, 172, 0.15); color: #2b6cb0; font-weight: 500; font-size: 0.9rem;">
|
195 |
+
<span style="margin-right: 6px;">🌐</span> Scene Understanding
|
196 |
+
</div>
|
197 |
+
<div style="padding: 8px 15px; border-radius: 20px; background: rgba(66, 153, 225, 0.15); color: #2b6cb0; font-weight: 500; font-size: 0.9rem;">
|
198 |
+
<span style="margin-right: 6px;">📊</span> Visual Analysis
|
199 |
+
</div>
|
200 |
+
</div>
|
201 |
+
|
202 |
+
<div style="margin-top: 20px; padding: 10px 15px; background-color: rgba(255, 248, 230, 0.9); border-left: 3px solid #f6ad55; border-radius: 6px; max-width: 600px; margin-left: auto; margin-right: auto; text-align: left;">
|
203 |
+
<p style="margin: 0; font-size: 0.9rem; color: #805ad5; font-weight: 500;">
|
204 |
+
<span style="margin-right: 5px;">📱</span> iPhone users: HEIC images are not supported.
|
205 |
+
<a href="https://cloudconvert.com/heic-to-jpg" target="_blank" style="color: #3182ce; text-decoration: underline;">Convert HEIC to JPG here</a> before uploading.
|
206 |
+
</p>
|
207 |
+
</div>
|
208 |
+
</div>
|
209 |
+
""")
|
210 |
+
|
211 |
|
212 |
current_model = gr.State("yolov8m.pt") # use medium size model as defualt
|
213 |
|
214 |
+
# 主要內容區
|
215 |
with gr.Row(equal_height=True):
|
216 |
# 左側 - 輸入控制區(可上傳圖片)
|
217 |
with gr.Column(scale=4, elem_classes="input-panel"):
|
|
|
286 |
# 文本框設置,讓顯示會更寬
|
287 |
result_text = gr.Textbox(
|
288 |
label=None,
|
289 |
+
lines=15,
|
290 |
+
max_lines=20,
|
291 |
elem_classes="wide-result-text",
|
292 |
elem_id="detection-details",
|
293 |
container=False,
|
|
|
295 |
min_width=600
|
296 |
)
|
297 |
|
298 |
+
# Scene Analysis
|
299 |
+
with gr.Tab("Scene Understanding", elem_classes="scene-understanding-tab"):
|
300 |
+
with gr.Group(elem_classes="result-details-box"):
|
301 |
+
gr.HTML("""
|
302 |
+
<div class="section-heading">Scene Analysis</div>
|
303 |
+
<details class="info-details" style="margin: 5px 0 15px 0;">
|
304 |
+
<summary style="padding: 8px; background-color: #f0f7ff; border-radius: 6px; border-left: 3px solid #4299e1; font-weight: bold; cursor: pointer; color: #2b6cb0;">
|
305 |
+
🔍 The AI Vision Scout Report: Click for important notes about this analysis
|
306 |
+
</summary>
|
307 |
+
<div style="margin-top: 8px; padding: 10px; background-color: #f8f9fa; border-radius: 6px; border: 1px solid #e2e8f0;">
|
308 |
+
<p style="font-size: 13px; color: #718096; margin: 0;">
|
309 |
+
<b>About this analysis:</b> This analysis is the model's best guess based on visible objects.
|
310 |
+
Like human scouts, it sometimes gets lost or sees things that aren't there (but don't we all?).
|
311 |
+
Consider this an educated opinion rather than absolute truth. For critical applications, always verify with human eyes! 🧐
|
312 |
+
</p>
|
313 |
+
</div>
|
314 |
+
</details>
|
315 |
+
""")
|
316 |
+
|
317 |
+
# 使用更適合長文本的容器
|
318 |
+
with gr.Group(elem_classes="scene-description-container"):
|
319 |
+
scene_description = gr.HTML(
|
320 |
+
value="<div id='scene-desc-container'></div>",
|
321 |
+
label="Scene Description"
|
322 |
+
)
|
323 |
+
|
324 |
+
with gr.Row():
|
325 |
+
with gr.Column(scale=2):
|
326 |
+
activities_list = gr.Dataframe(
|
327 |
+
headers=["Activities"],
|
328 |
+
datatype=["str"],
|
329 |
+
col_count=1,
|
330 |
+
row_count=5,
|
331 |
+
elem_classes="full-width-element"
|
332 |
+
)
|
333 |
+
|
334 |
+
with gr.Column(scale=2):
|
335 |
+
safety_list = gr.Dataframe(
|
336 |
+
headers=["Safety Concerns"],
|
337 |
+
datatype=["str"],
|
338 |
+
col_count=1,
|
339 |
+
row_count=5,
|
340 |
+
elem_classes="full-width-element"
|
341 |
+
)
|
342 |
+
|
343 |
+
gr.HTML('<div class="section-heading">Functional Zones</div>')
|
344 |
+
zones_json = gr.JSON(label=None, elem_classes="json-box")
|
345 |
+
|
346 |
+
gr.HTML('<div class="section-heading">Lighting Conditions</div>')
|
347 |
+
lighting_info = gr.JSON(label=None, elem_classes="json-box")
|
348 |
+
|
349 |
with gr.Tab("Statistics"):
|
350 |
with gr.Row():
|
351 |
with gr.Column(scale=3, elem_classes="plot-column"):
|
|
|
364 |
)
|
365 |
|
366 |
detect_btn.click(
|
367 |
+
fn=process_and_plot,
|
368 |
+
inputs=[image_input, current_model, confidence, class_filter],
|
369 |
+
outputs=[
|
370 |
+
result_image, result_text, stats_json, plot_output,
|
371 |
+
scene_description, activities_list, safety_list, zones_json,
|
372 |
+
lighting_info
|
373 |
+
]
|
374 |
+
)
|
375 |
|
376 |
# model option
|
377 |
model_dropdown.change(
|
|
|
409 |
|
410 |
example_images = [
|
411 |
"room_01.jpg",
|
412 |
+
"room_02.jpg",
|
413 |
"street_02.jpg",
|
414 |
+
"street_04.jpg"
|
415 |
]
|
416 |
|
417 |
# add example images
|
clip_analyzer.py
ADDED
@@ -0,0 +1,389 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
import clip
|
3 |
+
import numpy as np
|
4 |
+
from PIL import Image
|
5 |
+
from typing import Dict, List, Tuple, Any, Optional, Union
|
6 |
+
from clip_prompts import (
|
7 |
+
SCENE_TYPE_PROMPTS,
|
8 |
+
CULTURAL_SCENE_PROMPTS,
|
9 |
+
COMPARATIVE_PROMPTS,
|
10 |
+
LIGHTING_CONDITION_PROMPTS,
|
11 |
+
SPECIALIZED_SCENE_PROMPTS,
|
12 |
+
VIEWPOINT_PROMPTS,
|
13 |
+
OBJECT_COMBINATION_PROMPTS,
|
14 |
+
ACTIVITY_PROMPTS
|
15 |
+
)
|
16 |
+
|
17 |
+
class CLIPAnalyzer:
|
18 |
+
"""
|
19 |
+
Use Clip to intergrate scene understanding function
|
20 |
+
"""
|
21 |
+
|
22 |
+
def __init__(self, model_name: str = "ViT-B/32", device: str = None):
|
23 |
+
"""
|
24 |
+
初始化 CLIP 分析器。
|
25 |
+
|
26 |
+
Args:
|
27 |
+
model_name: CLIP Model name, "ViT-B/32"、"ViT-B/16"、"ViT-L/14"
|
28 |
+
device: Use GPU if it can use
|
29 |
+
"""
|
30 |
+
# 自動選擇設備
|
31 |
+
if device is None:
|
32 |
+
self.device = "cuda" if torch.cuda.is_available() else "cpu"
|
33 |
+
else:
|
34 |
+
self.device = device
|
35 |
+
|
36 |
+
print(f"Loading CLIP model {model_name} on {self.device}...")
|
37 |
+
try:
|
38 |
+
self.model, self.preprocess = clip.load(model_name, device=self.device)
|
39 |
+
print(f"CLIP model loaded successfully.")
|
40 |
+
except Exception as e:
|
41 |
+
print(f"Error loading CLIP model: {e}")
|
42 |
+
raise
|
43 |
+
|
44 |
+
self.scene_type_prompts = SCENE_TYPE_PROMPTS
|
45 |
+
self.cultural_scene_prompts = CULTURAL_SCENE_PROMPTS
|
46 |
+
self.comparative_prompts = COMPARATIVE_PROMPTS
|
47 |
+
self.lighting_condition_prompts = LIGHTING_CONDITION_PROMPTS
|
48 |
+
self.specialized_scene_prompts = SPECIALIZED_SCENE_PROMPTS
|
49 |
+
self.viewpoint_prompts = VIEWPOINT_PROMPTS
|
50 |
+
self.object_combination_prompts = OBJECT_COMBINATION_PROMPTS
|
51 |
+
self.activity_prompts = ACTIVITY_PROMPTS
|
52 |
+
|
53 |
+
# turn to CLIP format
|
54 |
+
self._prepare_text_prompts()
|
55 |
+
|
56 |
+
def _prepare_text_prompts(self):
|
57 |
+
"""準備所有文本提示的 CLIP 特徵"""
|
58 |
+
# base prompt
|
59 |
+
scene_texts = [self.scene_type_prompts[scene_type] for scene_type in self.scene_type_prompts]
|
60 |
+
self.scene_type_tokens = clip.tokenize(scene_texts).to(self.device)
|
61 |
+
|
62 |
+
# cultural
|
63 |
+
self.cultural_tokens_dict = {}
|
64 |
+
for scene_type, prompts in self.cultural_scene_prompts.items():
|
65 |
+
self.cultural_tokens_dict[scene_type] = clip.tokenize(prompts).to(self.device)
|
66 |
+
|
67 |
+
# Light
|
68 |
+
lighting_texts = [self.lighting_condition_prompts[cond] for cond in self.lighting_condition_prompts]
|
69 |
+
self.lighting_tokens = clip.tokenize(lighting_texts).to(self.device)
|
70 |
+
|
71 |
+
# specializes_status
|
72 |
+
self.specialized_tokens_dict = {}
|
73 |
+
for scene_type, prompts in self.specialized_scene_prompts.items():
|
74 |
+
self.specialized_tokens_dict[scene_type] = clip.tokenize(prompts).to(self.device)
|
75 |
+
|
76 |
+
# view point
|
77 |
+
viewpoint_texts = [self.viewpoint_prompts[viewpoint] for viewpoint in self.viewpoint_prompts]
|
78 |
+
self.viewpoint_tokens = clip.tokenize(viewpoint_texts).to(self.device)
|
79 |
+
|
80 |
+
# object combination
|
81 |
+
object_combination_texts = [self.object_combination_prompts[combo] for combo in self.object_combination_prompts]
|
82 |
+
self.object_combination_tokens = clip.tokenize(object_combination_texts).to(self.device)
|
83 |
+
|
84 |
+
# activicty prompt
|
85 |
+
activity_texts = [self.activity_prompts[activity] for activity in self.activity_prompts]
|
86 |
+
self.activity_tokens = clip.tokenize(activity_texts).to(self.device)
|
87 |
+
|
88 |
+
def analyze_image(self, image, include_cultural_analysis: bool = True) -> Dict[str, Any]:
    """
    Analyze an image: predict scene type, lighting condition and related attributes.

    Args:
        image: input image (PIL Image or numpy array)
        include_cultural_analysis: whether to run the detailed cultural-scene analysis

    Returns:
        Dict with scene-type scores, top scene, lighting condition, viewpoint,
        top object combinations, top activities and the raw CLIP embedding;
        optional "cultural_analysis" / "specialized_analysis" keys when any
        such scene scored above threshold. On failure returns {"error": msg}
        instead of raising.
    """
    try:
        # Normalize the input to a PIL image.
        if not isinstance(image, Image.Image):
            if isinstance(image, np.ndarray):
                image = Image.fromarray(image)
            else:
                raise ValueError("Unsupported image format. Expected PIL Image or numpy array.")

        # Preprocess and encode once; every sub-analysis reuses these
        # L2-normalized image features.
        image_input = self.preprocess(image).unsqueeze(0).to(self.device)
        with torch.no_grad():
            image_features = self.model.encode_image(image_input)
            image_features = image_features / image_features.norm(dim=-1, keepdim=True)

        scene_scores = self._analyze_scene_type(image_features)
        lighting_scores = self._analyze_lighting_condition(image_features)

        # Enhanced analysis only for cultural scenes that scored above 0.2.
        cultural_analysis = {}
        if include_cultural_analysis:
            for scene_type in self.cultural_scene_prompts:
                if scene_type in scene_scores and scene_scores[scene_type] > 0.2:
                    cultural_analysis[scene_type] = self._analyze_cultural_scene(
                        image_features, scene_type
                    )

        specialized_analysis = {}
        for scene_type in self.specialized_scene_prompts:
            if scene_type in scene_scores and scene_scores[scene_type] > 0.2:
                specialized_analysis[scene_type] = self._analyze_specialized_scene(
                    image_features, scene_type
                )

        viewpoint_scores = self._analyze_viewpoint(image_features)
        object_combination_scores = self._analyze_object_combinations(image_features)
        activity_scores = self._analyze_activities(image_features)

        # Assemble the result. Fix: .cpu() is a no-op for CPU tensors, so it is
        # safe on every device; the previous `if self.device == "cuda"` branch
        # crashed for device strings such as "cuda:0" or "mps", where
        # Tensor.numpy() cannot be called directly.
        result = {
            "scene_scores": scene_scores,
            "top_scene": max(scene_scores.items(), key=lambda x: x[1]),
            "lighting_condition": max(lighting_scores.items(), key=lambda x: x[1]),
            "embedding": image_features.cpu().numpy().tolist()[0],
            "viewpoint": max(viewpoint_scores.items(), key=lambda x: x[1]),
            "object_combinations": sorted(object_combination_scores.items(), key=lambda x: x[1], reverse=True)[:3],
            "activities": sorted(activity_scores.items(), key=lambda x: x[1], reverse=True)[:3]
        }

        if cultural_analysis:
            result["cultural_analysis"] = cultural_analysis

        if specialized_analysis:
            result["specialized_analysis"] = specialized_analysis

        return result

    except Exception as e:
        # Public contract: never raise from here, return an error payload.
        print(f"Error analyzing image with CLIP: {e}")
        import traceback
        traceback.print_exc()
        return {"error": str(e)}
|
167 |
+
|
168 |
+
def _analyze_scene_type(self, image_features: torch.Tensor) -> Dict[str, float]:
    """Score the image against every scene-type prompt.

    Args:
        image_features: L2-normalized CLIP image features, shape (1, dim).

    Returns:
        Dict mapping scene-type name to a softmax probability over all types.
    """
    with torch.no_grad():
        text_features = self.model.encode_text(self.scene_type_tokens)
        text_features = text_features / text_features.norm(dim=-1, keepdim=True)

        # Cosine similarity scaled by 100, softmaxed across all scene types.
        similarity = (100 * image_features @ text_features.T).softmax(dim=-1)
        # Fix: .cpu() works on any device (no-op on CPU); the old
        # `self.device == "cuda"` check broke for "cuda:0" / "mps".
        similarity = similarity.cpu().numpy()[0]

        # Token rows were built in dict iteration order, so zip back the same way.
        return {
            scene_type: float(similarity[i])
            for i, scene_type in enumerate(self.scene_type_prompts.keys())
        }
|
185 |
+
|
186 |
+
def _analyze_lighting_condition(self, image_features: torch.Tensor) -> Dict[str, float]:
    """Score the image against every lighting-condition prompt.

    Args:
        image_features: L2-normalized CLIP image features, shape (1, dim).

    Returns:
        Dict mapping lighting-condition name to a softmax probability.
    """
    with torch.no_grad():
        text_features = self.model.encode_text(self.lighting_tokens)
        text_features = text_features / text_features.norm(dim=-1, keepdim=True)

        # Scaled cosine similarity, softmaxed across all lighting conditions.
        similarity = (100 * image_features @ text_features.T).softmax(dim=-1)
        # Fix: unconditional .cpu() is safe on every device; the previous
        # device-string comparison crashed for "cuda:0" / "mps".
        similarity = similarity.cpu().numpy()[0]

        return {
            lighting_type: float(similarity[i])
            for i, lighting_type in enumerate(self.lighting_condition_prompts.keys())
        }
|
203 |
+
|
204 |
+
def _analyze_cultural_scene(self, image_features: torch.Tensor, scene_type: str) -> Dict[str, Any]:
|
205 |
+
"""針對特定文化場景進行深入分析"""
|
206 |
+
if scene_type not in self.cultural_tokens_dict:
|
207 |
+
return {"error": f"No cultural analysis available for {scene_type}"}
|
208 |
+
|
209 |
+
with torch.no_grad():
|
210 |
+
# 獲取特定文化場景的文本特徵
|
211 |
+
cultural_tokens = self.cultural_tokens_dict[scene_type]
|
212 |
+
text_features = self.model.encode_text(cultural_tokens)
|
213 |
+
text_features = text_features / text_features.norm(dim=-1, keepdim=True)
|
214 |
+
|
215 |
+
# 計算相似度分數
|
216 |
+
similarity = (100 * image_features @ text_features.T)
|
217 |
+
similarity = similarity.cpu().numpy()[0] if self.device == "cuda" else similarity.numpy()[0]
|
218 |
+
|
219 |
+
# 找到最匹配的文化描述
|
220 |
+
prompts = self.cultural_scene_prompts[scene_type]
|
221 |
+
scores = [(prompts[i], float(similarity[i])) for i in range(len(prompts))]
|
222 |
+
scores.sort(key=lambda x: x[1], reverse=True)
|
223 |
+
|
224 |
+
return {
|
225 |
+
"best_description": scores[0][0],
|
226 |
+
"confidence": scores[0][1],
|
227 |
+
"all_matches": scores
|
228 |
+
}
|
229 |
+
|
230 |
+
def _analyze_specialized_scene(self, image_features: torch.Tensor, scene_type: str) -> Dict[str, Any]:
|
231 |
+
"""針對特定專門場景進行深入分析"""
|
232 |
+
if scene_type not in self.specialized_tokens_dict:
|
233 |
+
return {"error": f"No specialized analysis available for {scene_type}"}
|
234 |
+
|
235 |
+
with torch.no_grad():
|
236 |
+
# 獲取特定專門場景的文本特徵
|
237 |
+
specialized_tokens = self.specialized_tokens_dict[scene_type]
|
238 |
+
text_features = self.model.encode_text(specialized_tokens)
|
239 |
+
text_features = text_features / text_features.norm(dim=-1, keepdim=True)
|
240 |
+
|
241 |
+
# 計算相似度分數
|
242 |
+
similarity = (100 * image_features @ text_features.T)
|
243 |
+
similarity = similarity.cpu().numpy()[0] if self.device == "cuda" else similarity.numpy()[0]
|
244 |
+
|
245 |
+
# 找到最匹配的專門描述
|
246 |
+
prompts = self.specialized_scene_prompts[scene_type]
|
247 |
+
scores = [(prompts[i], float(similarity[i])) for i in range(len(prompts))]
|
248 |
+
scores.sort(key=lambda x: x[1], reverse=True)
|
249 |
+
|
250 |
+
return {
|
251 |
+
"best_description": scores[0][0],
|
252 |
+
"confidence": scores[0][1],
|
253 |
+
"all_matches": scores
|
254 |
+
}
|
255 |
+
|
256 |
+
def _analyze_viewpoint(self, image_features: torch.Tensor) -> Dict[str, float]:
    """Score the image against every camera-viewpoint prompt.

    Args:
        image_features: L2-normalized CLIP image features, shape (1, dim).

    Returns:
        Dict mapping viewpoint name to a softmax probability.
    """
    with torch.no_grad():
        text_features = self.model.encode_text(self.viewpoint_tokens)
        text_features = text_features / text_features.norm(dim=-1, keepdim=True)

        # Scaled cosine similarity, softmaxed across all viewpoints.
        similarity = (100 * image_features @ text_features.T).softmax(dim=-1)
        # Fix: unconditional .cpu() works on every device; the previous
        # `self.device == "cuda"` branch crashed for "cuda:0" / "mps".
        similarity = similarity.cpu().numpy()[0]

        return {
            viewpoint: float(similarity[i])
            for i, viewpoint in enumerate(self.viewpoint_prompts.keys())
        }
|
273 |
+
|
274 |
+
def _analyze_object_combinations(self, image_features: torch.Tensor) -> Dict[str, float]:
    """Score the image against every object-combination prompt.

    Args:
        image_features: L2-normalized CLIP image features, shape (1, dim).

    Returns:
        Dict mapping combination name to a softmax probability.
    """
    with torch.no_grad():
        text_features = self.model.encode_text(self.object_combination_tokens)
        text_features = text_features / text_features.norm(dim=-1, keepdim=True)

        # Scaled cosine similarity, softmaxed across all combinations.
        similarity = (100 * image_features @ text_features.T).softmax(dim=-1)
        # Fix: unconditional .cpu() is device-agnostic; the previous
        # `== "cuda"` comparison broke for "cuda:0" / "mps".
        similarity = similarity.cpu().numpy()[0]

        return {
            combination: float(similarity[i])
            for i, combination in enumerate(self.object_combination_prompts.keys())
        }
|
291 |
+
|
292 |
+
def _analyze_activities(self, image_features: torch.Tensor) -> Dict[str, float]:
    """Score the image against every activity prompt.

    Args:
        image_features: L2-normalized CLIP image features, shape (1, dim).

    Returns:
        Dict mapping activity name to a softmax probability.
    """
    with torch.no_grad():
        text_features = self.model.encode_text(self.activity_tokens)
        text_features = text_features / text_features.norm(dim=-1, keepdim=True)

        # Scaled cosine similarity, softmaxed across all activities.
        similarity = (100 * image_features @ text_features.T).softmax(dim=-1)
        # Fix: unconditional .cpu() is device-agnostic; the previous
        # `== "cuda"` comparison broke for "cuda:0" / "mps".
        similarity = similarity.cpu().numpy()[0]

        return {
            activity: float(similarity[i])
            for i, activity in enumerate(self.activity_prompts.keys())
        }
|
309 |
+
|
310 |
+
def get_image_embedding(self, image) -> np.ndarray:
    """
    Compute the L2-normalized CLIP embedding of an image.

    Args:
        image: PIL Image or numpy array

    Returns:
        np.ndarray: 1-D CLIP feature vector for the image.

    Raises:
        ValueError: if the input is neither a PIL Image nor a numpy array.
    """
    # Normalize the input to a PIL image.
    if not isinstance(image, Image.Image):
        if isinstance(image, np.ndarray):
            image = Image.fromarray(image)
        else:
            raise ValueError("Unsupported image format. Expected PIL Image or numpy array.")

    # Preprocess and encode.
    image_input = self.preprocess(image).unsqueeze(0).to(self.device)

    with torch.no_grad():
        image_features = self.model.encode_image(image_input)
        image_features = image_features / image_features.norm(dim=-1, keepdim=True)

    # Fix: .cpu() is a no-op for CPU tensors, so this works on any device;
    # the old `self.device == "cuda"` test broke for "cuda:0" / "mps",
    # where Tensor.numpy() cannot be called directly.
    return image_features.cpu().numpy()[0]
|
336 |
+
|
337 |
+
def text_to_embedding(self, text: str) -> np.ndarray:
    """
    Compute the L2-normalized CLIP embedding of a text string.

    Args:
        text: input text

    Returns:
        np.ndarray: 1-D CLIP feature vector for the text.
    """
    text_token = clip.tokenize([text]).to(self.device)

    with torch.no_grad():
        text_features = self.model.encode_text(text_token)
        text_features = text_features / text_features.norm(dim=-1, keepdim=True)

    # Fix: unconditional .cpu() works on every device (no-op on CPU); the
    # previous `self.device == "cuda"` branch crashed for "cuda:0" / "mps".
    return text_features.cpu().numpy()[0]
|
354 |
+
|
355 |
+
def calculate_similarity(self, image, text_queries: List[str]) -> Dict[str, float]:
    """
    Compute softmax similarity between an image and several text queries.

    Args:
        image: PIL Image, numpy image array, or a precomputed 1-D CLIP
            embedding (numpy array) as returned by get_image_embedding.
        text_queries: list of text queries to score against the image.

    Returns:
        Dict mapping each query to its softmax probability (scores sum to 1
        over the supplied queries).
    """
    # A 1-D numpy array is treated as an already-computed embedding;
    # anything else goes through the normal image-encoding path.
    if isinstance(image, np.ndarray) and len(image.shape) == 1:
        image_features = torch.tensor(image).unsqueeze(0).to(self.device)
    else:
        image_features = torch.tensor(self.get_image_embedding(image)).unsqueeze(0).to(self.device)

    text_tokens = clip.tokenize(text_queries).to(self.device)

    with torch.no_grad():
        text_features = self.model.encode_text(text_tokens)
        text_features = text_features / text_features.norm(dim=-1, keepdim=True)

        similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1)
        # Fix: unconditional .cpu() is safe on any device; the previous
        # `self.device == "cuda"` comparison broke for "cuda:0" / "mps".
        similarity = similarity.cpu().numpy()[0]

    return {query: float(similarity[i]) for i, query in enumerate(text_queries)}
|
clip_prompts.py
ADDED
@@ -0,0 +1,265 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
# Scene-type prompts: one canonical CLIP text prompt per scene label.
# Keys are scene identifiers used throughout the analyzer; values are the
# English sentences tokenized for zero-shot scene classification.
SCENE_TYPE_PROMPTS = {
    # Basic indoor scenes
    "living_room": "A photo of a living room with furniture and entertainment systems.",
    "bedroom": "A photo of a bedroom with a bed and personal items.",
    "dining_area": "A photo of a dining area with a table and chairs for meals.",
    "kitchen": "A photo of a kitchen with cooking appliances and food preparation areas.",
    "office_workspace": "A photo of an office workspace with desk, computer and work equipment.",
    "meeting_room": "A photo of a meeting room with a conference table and multiple chairs.",

    # Basic outdoor / urban scenes
    "city_street": "A photo of a city street with traffic, pedestrians and urban buildings.",
    "parking_lot": "A photo of a parking lot with multiple parked vehicles.",
    "park_area": "A photo of a park or recreational area with greenery and outdoor facilities.",
    "retail_store": "A photo of a retail store with merchandise displays and shopping areas.",
    "supermarket": "A photo of a supermarket with food items, aisles and shopping carts.",

    # Special indoor scenes
    "upscale_dining": "A photo of an upscale dining area with elegant furniture and refined decor.",
    "conference_room": "A photo of a professional conference room with presentation equipment and seating.",
    "classroom": "A photo of a classroom with desks, chairs and educational equipment.",
    "library": "A photo of a library with bookshelves, reading areas and study spaces.",

    # Asian-specific scenes
    "asian_commercial_street": "A photo of an Asian commercial street with dense signage, shops and pedestrians.",
    "asian_night_market": "A photo of an Asian night market with food stalls, crowds and colorful lights.",
    "asian_temple_area": "A photo of an Asian temple with traditional architecture and cultural elements.",

    # Traffic-related scenes
    "financial_district": "A photo of a financial district with tall office buildings and business activity.",
    "urban_intersection": "A photo of an urban intersection with crosswalks, traffic lights and pedestrians crossing.",
    "transit_hub": "A photo of a transportation hub with multiple modes of public transit and passengers.",
    "bus_stop": "A photo of a bus stop with people waiting and buses arriving or departing.",
    "bus_station": "A photo of a bus terminal with multiple buses and traveler facilities.",
    "train_station": "A photo of a train station with platforms, trains and passenger activity.",
    "airport": "A photo of an airport with planes, terminals and traveler activity.",

    # Commercial scenes
    "shopping_district": "A photo of a shopping district with multiple retail stores and consumer activity.",
    "cafe": "A photo of a cafe with coffee service, seating and casual dining.",
    "restaurant": "A photo of a restaurant with dining tables, food service and eating areas.",

    # Aerial-view scenes
    "aerial_view_intersection": "An aerial view of an intersection showing crosswalks and traffic patterns from above.",
    "aerial_view_commercial_area": "An aerial view of a commercial area showing shopping districts from above.",
    "aerial_view_plaza": "An aerial view of a public plaza or square showing patterns of people movement from above.",

    # Entertainment scenes
    "zoo": "A photo of a zoo with animal enclosures, exhibits and visitors.",
    "playground": "A photo of a playground with recreational equipment and children playing.",
    "sports_field": "A photo of a sports field with playing surfaces and athletic equipment.",
    "sports_stadium": "A photo of a sports stadium with spectator seating and athletic facilities.",

    # Water-related scenes
    "harbor": "A photo of a harbor with boats, docks and waterfront activity.",
    "beach_water_recreation": "A photo of a beach area with water activities, sand and recreational equipment like surfboards.",

    # Culture- and time-specific scenes
    "nighttime_street": "A photo of a street at night with artificial lighting and evening activity.",
    "nighttime_commercial_district": "A photo of a commercial district at night with illuminated signs and evening shopping.",
    "european_plaza": "A photo of a European-style plaza with historic architecture and public gathering spaces.",

    # Mixed-environment scenes
    "indoor_outdoor_cafe": "A photo of a cafe with both indoor seating and outdoor patio areas.",
    "transit_station_platform": "A photo of a transit station platform with waiting areas and arriving vehicles.",

    # Work scenes
    "construction_site": "A photo of a construction site with building materials, equipment and workers.",
    "medical_facility": "A photo of a medical facility with healthcare equipment and professional staff.",
    "educational_setting": "A photo of an educational setting with learning spaces and academic resources.",
    "professional_kitchen": "A photo of a professional commercial kitchen with industrial cooking equipment and food preparation stations."
}
|
74 |
+
|
75 |
+
# Culture-specific scene prompts: several alternative descriptions per scene
# type, used for the finer-grained cultural-scene matching (best match wins).
CULTURAL_SCENE_PROMPTS = {
    "asian_commercial_street": [
        "A busy Asian shopping street with neon signs and dense storefronts.",
        "A commercial street in Asia with multi-level signage and narrow walkways.",
        "A street scene in Taiwan or Hong Kong with vertical signage and compact shops.",
        "A crowded commercial alley in an Asian city with signs in Chinese characters.",
        "A narrow shopping street in Asia with small shops on both sides.",
        "An outdoor shopping district in an East Asian city with electronic billboards.",
        "A bustling commercial street in Taiwan with food vendors and retail shops.",
        "A pedestrian shopping area with Korean or Chinese signs and storefronts.",
        "A daytime shopping street in an Asian urban center with vertical development."
    ],
    "asian_night_market": [
        "A vibrant night market in Asia with food stalls and large crowds.",
        "An evening street market in Taiwan with street food vendors and bright lights.",
        "A busy night bazaar in Asia with illuminated stalls and local food.",
        "A crowded night street food market in an Asian city with vendor carts.",
        "An Asian night market with steam from cooking food and hanging lanterns.",
        "A nocturnal food street in East Asia with vendor canopies and neon lights.",
        "A bustling evening market with rows of food stalls and plastic stools.",
        "A lively Asian street food scene at night with cooking stations and crowds."
    ],
    "asian_temple_area": [
        "A traditional Asian temple with ornate roof details and religious symbols.",
        "A Buddhist temple complex in East Asia with multiple pavilions and prayer areas.",
        "A sacred site in Asia with incense burners and ceremonial elements.",
        "A temple courtyard with stone statues and traditional Asian architecture.",
        "A spiritual center in East Asia with pagoda-style structures and visitors.",
        "An ancient temple site with Asian architectural elements and cultural symbols.",
        "A religious compound with characteristic Asian roof curves and decorative features."
    ],
    "european_plaza": [
        "A historic European city square with classical architecture and cafes.",
        "An old-world plaza in Europe with cobblestone paving and historic buildings.",
        "A public square in a European city with fountains and surrounding architecture.",
        "A central plaza in Europe with outdoor seating areas and historic monuments.",
        "A traditional European town square with surrounding shops and restaurants.",
        "A historic gathering space in Europe with distinctive architecture and pedestrians."
    ]
}
|
116 |
+
|
117 |
+
# Comparative category prompts: paired contrasting descriptions intended for
# disambiguating similar scene classes.
# NOTE(review): not referenced by the analyzer code visible in this file —
# presumably consumed elsewhere; verify before removing.
COMPARATIVE_PROMPTS = {
    "indoor_vs_outdoor": [
        "An indoor shopping mall corridor with controlled lighting and storefronts.",
        "An outdoor commercial street with natural lighting and urban storefronts.",
        "An enclosed shopping gallery with artificial lighting and climate control.",
        "An open-air market street with natural light and weather exposure."
    ],
    "professional_vs_home": [
        "A professional commercial kitchen with stainless steel equipment and workstations.",
        "A home kitchen with residential appliances and family cooking space.",
        "A restaurant kitchen with multiple cooking stations and chef activity.",
        "A family kitchen with standard household equipment and personal touches."
    ],
    "sports_venue_vs_park": [
        "A professional sports stadium with designated playing areas and audience seating.",
        "A public park with casual recreation space and community greenery.",
        "An athletic venue with specialized sports equipment and competitive playing surfaces.",
        "An outdoor community space with general purpose areas and natural elements."
    ],
    "asian_vs_western_commercial": [
        "An Asian shopping street with vertical signage and compact multi-level shops.",
        "A Western commercial street with horizontal storefronts and wider sidewalks.",
        "An East Asian retail area with dense signage in Asian scripts and narrow walkways.",
        "A Western shopping district with uniform building heights and Latin alphabetic signs."
    ],
    "daytime_vs_nighttime": [
        "A daytime urban scene with natural sunlight illuminating streets and buildings.",
        "A nighttime city scene with artificial lighting from stores, signs and streetlights.",
        "A commercial district during daylight hours with natural shadows and visibility.",
        "An evening urban setting with illuminated storefronts and light patterns on streets."
    ],
    "aerial_vs_street_level": [
        "An aerial view showing urban patterns and layouts from above.",
        "A street-level view showing pedestrian perspective and immediate surroundings.",
        "A bird's-eye view of city organization and movement patterns from high above.",
        "An eye-level perspective showing direct human interaction with urban elements."
    ]
}
|
156 |
+
|
157 |
+
# Lighting / environment condition prompts: one prompt per lighting label.
# NOTE: the "sunset/sunrise" key intentionally contains a slash — it is a
# data key looked up verbatim by the analyzer, so do not rename it casually.
LIGHTING_CONDITION_PROMPTS = {
    "day_clear": "A photo taken during daytime with clear skies and direct sunlight.",
    "day_cloudy": "A photo taken during daytime with overcast conditions and diffused light.",
    "sunset/sunrise": "A photo taken during sunset or sunrise with warm golden lighting and long shadows.",
    "night": "A photo taken at night with minimal natural light and artificial illumination.",
    "indoor_bright": "An indoor photo with bright, even artificial lighting throughout the space.",
    "indoor_moderate": "An indoor photo with moderate lighting creating a balanced indoor atmosphere.",
    "indoor_dim": "An indoor photo with low lighting levels creating a subdued environment.",
    "neon_night": "A night scene with colorful neon lighting creating vibrant illumination patterns.",
    "indoor_commercial": "An indoor retail environment with directed display lighting highlighting products.",
    "indoor_restaurant": "An indoor dining space with ambient mood lighting for atmosphere.",
    "stadium_lighting": "A sports venue with powerful floodlights creating intense, even illumination.",
    "mixed_lighting": "A scene with combined natural and artificial light sources creating transition zones.",
    "beach_daylight": "A photo taken at a beach with bright natural sunlight and reflections from water.",
    "sports_arena_lighting": "A photo of a sports venue illuminated by powerful overhead lighting systems.",
    "kitchen_task_lighting": "A photo of a kitchen with focused lighting concentrated on work surfaces."
}
|
175 |
+
|
176 |
+
# Specialized scene prompts: several variant descriptions per specialized
# scene type, used for the finer-grained specialized-scene matching.
SPECIALIZED_SCENE_PROMPTS = {
    "beach_water_recreation": [
        "A coastal beach scene with people surfing and sunbathing on sandy shores.",
        "Active water sports participants at a beach with surfboards and swimming areas.",
        "A sunny beach destination with recreational water equipment and beachgoers.",
        "A shoreline recreation area with surf gear and coastal activities.",
        "An oceanfront scene with people engaging in water sports and beach leisure.",
        "A popular beach spot with swimming areas and surfing zones.",
        "A coastal recreation setting with beach umbrellas and water activities."
    ],
    "sports_venue": [
        "An indoor sports arena with professional equipment and competition spaces.",
        "A sports stadium with marked playing areas and spectator seating arrangement.",
        "A specialized athletic venue with competition equipment and performance areas.",
        "A professional sports facility with game-related apparatus and audience zones.",
        "An organized sports center with competitive play areas and athletic equipment.",
        "A competition venue with sport-specific markings and professional setup.",
        "A formal athletic facility with standardized equipment and playing surfaces."
    ],
    "professional_kitchen": [
        "A commercial restaurant kitchen with multiple cooking stations and food prep areas.",
        "A professional culinary workspace with industrial appliances and chef activity.",
        "A busy restaurant back-of-house with stainless steel equipment and meal preparation.",
        "A commercial food service kitchen with chef workstations and specialized zones.",
        "An industrial kitchen facility with specialized cooking equipment and prep surfaces.",
        "A high-volume food production kitchen with professional-grade appliances.",
        "A restaurant kitchen with distinct cooking areas and culinary workflow design."
    ],
    "urban_intersection": [
        "A city intersection with crosswalks and traffic signals controlling movement.",
        "A busy urban crossroad with pedestrian crossings and vehicle traffic.",
        "A regulated street intersection with crosswalk markings and waiting pedestrians.",
        "A metropolitan junction with traffic lights and pedestrian crossing zones.",
        "A city street crossing with safety features for pedestrians and traffic flow.",
        "A controlled urban intersection with movement patterns for vehicles and people.",
        "A city center crossroad with traffic management features and pedestrian areas."
    ],
    "financial_district": [
        "A downtown business area with tall office buildings and commercial activity.",
        "An urban financial center with skyscrapers and professional environment.",
        "A city's business district with corporate headquarters and office towers.",
        "A metropolitan financial zone with high-rise buildings and business traffic.",
        "A corporate district in a city center with professional architecture.",
        "An urban area dominated by office buildings and business establishments.",
        "A city's economic center with banking institutions and corporate offices."
    ],
    "aerial_view_intersection": [
        "A bird's-eye view of a city intersection showing crossing patterns from above.",
        "An overhead perspective of an urban crossroad showing traffic organization.",
        "A top-down view of a street intersection revealing pedestrian crosswalks.",
        "An aerial shot of a city junction showing the layout of roads and crossings.",
        "A high-angle view of an intersection showing traffic and pedestrian flow patterns.",
        "A drone perspective of urban crossing design viewed from directly above.",
        "A vertical view of a street intersection showing crossing infrastructure."
    ]
}
|
233 |
+
|
234 |
+
# Camera-viewpoint prompts: one prompt per viewpoint label, used to classify
# the perspective from which the photo was taken.
VIEWPOINT_PROMPTS = {
    "eye_level": "A photo taken from normal human eye level showing a direct frontal perspective.",
    "aerial": "A photo taken from high above looking directly down at the scene below.",
    "elevated": "A photo taken from a higher than normal position looking down at an angle.",
    "low_angle": "A photo taken from a low position looking upward at the scene.",
    "bird_eye": "A photo taken from very high above showing a complete overhead perspective.",
    "street_level": "A photo taken from the perspective of someone standing on the street.",
    "interior": "A photo taken from inside a building showing the internal environment.",
    "vehicular": "A photo taken from inside or mounted on a moving vehicle."
}
|
244 |
+
|
245 |
+
# Object-combination prompts: one prompt per typical grouping of objects,
# used to detect characteristic arrangements (e.g. a dining setup).
OBJECT_COMBINATION_PROMPTS = {
    "dining_setting": "A scene with tables, chairs, plates, and eating utensils arranged for meals.",
    "office_setup": "A scene with desks, chairs, computers, and office supplies for work.",
    "living_space": "A scene with sofas, coffee tables, TVs, and comfortable seating arrangements.",
    "transportation_hub": "A scene with vehicles, waiting areas, passengers, and transit information.",
    "retail_environment": "A scene with merchandise displays, shoppers, and store fixtures.",
    "crosswalk_scene": "A scene with street markings, pedestrians crossing, and traffic signals.",
    "cooking_area": "A scene with stoves, prep surfaces, cooking utensils, and food items.",
    "recreational_space": "A scene with sports equipment, play areas, and activity participants."
}
|
255 |
+
|
256 |
+
# Activity prompts: one prompt per human-activity label, used to infer what
# people in the scene are doing.
ACTIVITY_PROMPTS = {
    "shopping": "People looking at merchandise, carrying shopping bags, and browsing stores.",
    "dining": "People eating food, sitting at tables, and using dining utensils.",
    "commuting": "People waiting for transportation, boarding vehicles, and traveling.",
    "working": "People using computers, attending meetings, and engaged in professional tasks.",
    "exercising": "People engaged in physical activities, using sports equipment, and training.",
    "cooking": "People preparing food, using kitchen equipment, and creating meals.",
    "crossing_street": "People walking across designated crosswalks and navigating intersections.",
    "recreational_activity": "People engaged in leisure activities, games, and social recreation."
}
|
color_mapper.py
CHANGED
@@ -6,7 +6,7 @@ class ColorMapper:
|
|
6 |
A class for consistent color mapping of object detection classes
|
7 |
Provides color schemes for visualization in both RGB and hex formats
|
8 |
"""
|
9 |
-
|
10 |
# Class categories for better organization
|
11 |
CATEGORIES = {
|
12 |
"person": [0],
|
@@ -21,8 +21,9 @@ class ColorMapper:
|
|
21 |
"electronics": [62, 63, 64, 65, 66, 67, 68, 69, 70],
|
22 |
"household": [71, 72, 73, 74, 75, 76, 77, 78, 79]
|
23 |
}
|
24 |
-
|
25 |
# Base colors for each category (in HSV for easier variation)
|
|
|
26 |
CATEGORY_COLORS = {
|
27 |
"person": (0, 0.8, 0.9), # Red
|
28 |
"vehicles": (210, 0.8, 0.9), # Blue
|
@@ -36,43 +37,43 @@ class ColorMapper:
|
|
36 |
"electronics": (240, 0.6, 0.9), # Light Blue
|
37 |
"household": (60, 0.6, 0.9) # Yellow
|
38 |
}
|
39 |
-
|
40 |
def __init__(self):
|
41 |
"""Initialize the ColorMapper with COCO class mappings"""
|
42 |
self.class_names = self._get_coco_classes()
|
43 |
self.color_map = self._generate_color_map()
|
44 |
-
|
45 |
def _get_coco_classes(self) -> Dict[int, str]:
|
46 |
"""Get the standard COCO class names with their IDs"""
|
47 |
return {
|
48 |
0: 'person', 1: 'bicycle', 2: 'car', 3: 'motorcycle', 4: 'airplane',
|
49 |
5: 'bus', 6: 'train', 7: 'truck', 8: 'boat', 9: 'traffic light',
|
50 |
-
10: 'fire hydrant', 11: 'stop sign', 12: 'parking meter', 13: 'bench',
|
51 |
14: 'bird', 15: 'cat', 16: 'dog', 17: 'horse', 18: 'sheep', 19: 'cow',
|
52 |
20: 'elephant', 21: 'bear', 22: 'zebra', 23: 'giraffe', 24: 'backpack',
|
53 |
25: 'umbrella', 26: 'handbag', 27: 'tie', 28: 'suitcase', 29: 'frisbee',
|
54 |
30: 'skis', 31: 'snowboard', 32: 'sports ball', 33: 'kite', 34: 'baseball bat',
|
55 |
-
35: 'baseball glove', 36: 'skateboard', 37: 'surfboard', 38: 'tennis racket',
|
56 |
39: 'bottle', 40: 'wine glass', 41: 'cup', 42: 'fork', 43: 'knife',
|
57 |
-
44: 'spoon', 45: 'bowl', 46: 'banana', 47: 'apple', 48: 'sandwich',
|
58 |
49: 'orange', 50: 'broccoli', 51: 'carrot', 52: 'hot dog', 53: 'pizza',
|
59 |
-
54: 'donut', 55: 'cake', 56: 'chair', 57: 'couch', 58: 'potted plant',
|
60 |
59: 'bed', 60: 'dining table', 61: 'toilet', 62: 'tv', 63: 'laptop',
|
61 |
-
64: 'mouse', 65: 'remote', 66: 'keyboard', 67: 'cell phone', 68: 'microwave',
|
62 |
69: 'oven', 70: 'toaster', 71: 'sink', 72: 'refrigerator', 73: 'book',
|
63 |
-
74: 'clock', 75: 'vase', 76: 'scissors', 77: 'teddy bear', 78: 'hair drier',
|
64 |
79: 'toothbrush'
|
65 |
}
|
66 |
-
|
67 |
def _hsv_to_rgb(self, h: float, s: float, v: float) -> Tuple[int, int, int]:
|
68 |
"""
|
69 |
Convert HSV color to RGB
|
70 |
-
|
71 |
Args:
|
72 |
h: Hue (0-360)
|
73 |
s: Saturation (0-1)
|
74 |
v: Value (0-1)
|
75 |
-
|
76 |
Returns:
|
77 |
Tuple of (R, G, B) values (0-255)
|
78 |
"""
|
@@ -82,7 +83,7 @@ class ColorMapper:
|
|
82 |
p = v * (1 - s)
|
83 |
q = v * (1 - s * f)
|
84 |
t = v * (1 - s * (1 - f))
|
85 |
-
|
86 |
if i == 0:
|
87 |
r, g, b = v, t, p
|
88 |
elif i == 1:
|
@@ -95,28 +96,28 @@ class ColorMapper:
|
|
95 |
r, g, b = t, p, v
|
96 |
else:
|
97 |
r, g, b = v, p, q
|
98 |
-
|
99 |
return (int(r * 255), int(g * 255), int(b * 255))
|
100 |
-
|
101 |
def _rgb_to_hex(self, rgb: Tuple[int, int, int]) -> str:
|
102 |
"""
|
103 |
Convert RGB color to hex color code
|
104 |
-
|
105 |
Args:
|
106 |
rgb: Tuple of (R, G, B) values (0-255)
|
107 |
-
|
108 |
Returns:
|
109 |
Hex color code (e.g. '#FF0000')
|
110 |
"""
|
111 |
return f'#{rgb[0]:02x}{rgb[1]:02x}{rgb[2]:02x}'
|
112 |
-
|
113 |
def _find_category(self, class_id: int) -> str:
|
114 |
"""
|
115 |
Find the category for a given class ID
|
116 |
-
|
117 |
Args:
|
118 |
class_id: Class ID (0-79)
|
119 |
-
|
120 |
Returns:
|
121 |
Category name
|
122 |
"""
|
@@ -124,11 +125,11 @@ class ColorMapper:
|
|
124 |
if class_id in ids:
|
125 |
return category
|
126 |
return "other" # Fallback
|
127 |
-
|
128 |
def _generate_color_map(self) -> Dict:
|
129 |
"""
|
130 |
Generate a color map for all 80 COCO classes
|
131 |
-
|
132 |
Returns:
|
133 |
Dictionary mapping class IDs and names to color values
|
134 |
"""
|
@@ -137,7 +138,7 @@ class ColorMapper:
|
|
137 |
'by_name': {}, # Map class name to RGB and hex
|
138 |
'categories': {} # Map category to base color
|
139 |
}
|
140 |
-
|
141 |
# Generate colors for categories
|
142 |
for category, hsv in self.CATEGORY_COLORS.items():
|
143 |
rgb = self._hsv_to_rgb(hsv[0], hsv[1], hsv[2])
|
@@ -146,54 +147,54 @@ class ColorMapper:
|
|
146 |
'rgb': rgb,
|
147 |
'hex': hex_color
|
148 |
}
|
149 |
-
|
150 |
# Generate variations for each class within a category
|
151 |
for class_id, class_name in self.class_names.items():
|
152 |
category = self._find_category(class_id)
|
153 |
base_hsv = self.CATEGORY_COLORS.get(category, (0, 0, 0.8)) # Default gray
|
154 |
-
|
155 |
# Slightly vary the hue and saturation within the category
|
156 |
ids_in_category = self.CATEGORIES.get(category, [])
|
157 |
if ids_in_category:
|
158 |
position = ids_in_category.index(class_id) if class_id in ids_in_category else 0
|
159 |
variation = position / max(1, len(ids_in_category) - 1) # 0 to 1
|
160 |
-
|
161 |
# Vary hue slightly (±15°) and saturation
|
162 |
h_offset = 30 * variation - 15 # -15 to +15
|
163 |
s_offset = 0.2 * variation # 0 to 0.2
|
164 |
-
|
165 |
h = (base_hsv[0] + h_offset) % 360
|
166 |
s = min(1.0, base_hsv[1] + s_offset)
|
167 |
v = base_hsv[2]
|
168 |
else:
|
169 |
h, s, v = base_hsv
|
170 |
-
|
171 |
rgb = self._hsv_to_rgb(h, s, v)
|
172 |
hex_color = self._rgb_to_hex(rgb)
|
173 |
-
|
174 |
# Store in both mappings
|
175 |
color_map['by_id'][class_id] = {
|
176 |
'rgb': rgb,
|
177 |
'hex': hex_color,
|
178 |
'category': category
|
179 |
}
|
180 |
-
|
181 |
color_map['by_name'][class_name] = {
|
182 |
'rgb': rgb,
|
183 |
'hex': hex_color,
|
184 |
'category': category
|
185 |
}
|
186 |
-
|
187 |
return color_map
|
188 |
-
|
189 |
def get_color(self, class_identifier: Union[int, str], format: str = 'hex') -> Any:
|
190 |
"""
|
191 |
Get color for a specific class
|
192 |
-
|
193 |
Args:
|
194 |
class_identifier: Class ID (int) or name (str)
|
195 |
format: Color format ('hex', 'rgb', or 'bgr')
|
196 |
-
|
197 |
Returns:
|
198 |
Color in requested format
|
199 |
"""
|
@@ -202,11 +203,11 @@ class ColorMapper:
|
|
202 |
color_info = self.color_map['by_id'].get(class_identifier)
|
203 |
else:
|
204 |
color_info = self.color_map['by_name'].get(class_identifier)
|
205 |
-
|
206 |
if not color_info:
|
207 |
# Fallback color if not found
|
208 |
return '#CCCCCC' if format == 'hex' else (204, 204, 204)
|
209 |
-
|
210 |
if format == 'hex':
|
211 |
return color_info['hex']
|
212 |
elif format == 'rgb':
|
@@ -217,14 +218,14 @@ class ColorMapper:
|
|
217 |
return (b, g, r)
|
218 |
else:
|
219 |
return color_info['rgb']
|
220 |
-
|
221 |
def get_all_colors(self, format: str = 'hex') -> Dict:
|
222 |
"""
|
223 |
Get all colors in the specified format
|
224 |
-
|
225 |
Args:
|
226 |
format: Color format ('hex', 'rgb', or 'bgr')
|
227 |
-
|
228 |
Returns:
|
229 |
Dictionary mapping class names to colors
|
230 |
"""
|
@@ -232,14 +233,14 @@ class ColorMapper:
|
|
232 |
for class_id, class_name in self.class_names.items():
|
233 |
result[class_name] = self.get_color(class_id, format)
|
234 |
return result
|
235 |
-
|
236 |
def get_category_colors(self, format: str = 'hex') -> Dict:
|
237 |
"""
|
238 |
Get base colors for each category
|
239 |
-
|
240 |
Args:
|
241 |
format: Color format ('hex', 'rgb', or 'bgr')
|
242 |
-
|
243 |
Returns:
|
244 |
Dictionary mapping categories to colors
|
245 |
"""
|
@@ -253,14 +254,14 @@ class ColorMapper:
|
|
253 |
else:
|
254 |
result[category] = color_info['rgb']
|
255 |
return result
|
256 |
-
|
257 |
def get_category_for_class(self, class_identifier: Union[int, str]) -> str:
|
258 |
"""
|
259 |
Get the category for a specific class
|
260 |
-
|
261 |
Args:
|
262 |
class_identifier: Class ID (int) or name (str)
|
263 |
-
|
264 |
Returns:
|
265 |
Category name
|
266 |
"""
|
|
|
6 |
A class for consistent color mapping of object detection classes
|
7 |
Provides color schemes for visualization in both RGB and hex formats
|
8 |
"""
|
9 |
+
|
10 |
# Class categories for better organization
|
11 |
CATEGORIES = {
|
12 |
"person": [0],
|
|
|
21 |
"electronics": [62, 63, 64, 65, 66, 67, 68, 69, 70],
|
22 |
"household": [71, 72, 73, 74, 75, 76, 77, 78, 79]
|
23 |
}
|
24 |
+
|
25 |
# Base colors for each category (in HSV for easier variation)
|
26 |
+
# HSV: Hue, Saturation, Value
|
27 |
CATEGORY_COLORS = {
|
28 |
"person": (0, 0.8, 0.9), # Red
|
29 |
"vehicles": (210, 0.8, 0.9), # Blue
|
|
|
37 |
"electronics": (240, 0.6, 0.9), # Light Blue
|
38 |
"household": (60, 0.6, 0.9) # Yellow
|
39 |
}
|
40 |
+
|
41 |
def __init__(self):
|
42 |
"""Initialize the ColorMapper with COCO class mappings"""
|
43 |
self.class_names = self._get_coco_classes()
|
44 |
self.color_map = self._generate_color_map()
|
45 |
+
|
46 |
def _get_coco_classes(self) -> Dict[int, str]:
|
47 |
"""Get the standard COCO class names with their IDs"""
|
48 |
return {
|
49 |
0: 'person', 1: 'bicycle', 2: 'car', 3: 'motorcycle', 4: 'airplane',
|
50 |
5: 'bus', 6: 'train', 7: 'truck', 8: 'boat', 9: 'traffic light',
|
51 |
+
10: 'fire hydrant', 11: 'stop sign', 12: 'parking meter', 13: 'bench',
|
52 |
14: 'bird', 15: 'cat', 16: 'dog', 17: 'horse', 18: 'sheep', 19: 'cow',
|
53 |
20: 'elephant', 21: 'bear', 22: 'zebra', 23: 'giraffe', 24: 'backpack',
|
54 |
25: 'umbrella', 26: 'handbag', 27: 'tie', 28: 'suitcase', 29: 'frisbee',
|
55 |
30: 'skis', 31: 'snowboard', 32: 'sports ball', 33: 'kite', 34: 'baseball bat',
|
56 |
+
35: 'baseball glove', 36: 'skateboard', 37: 'surfboard', 38: 'tennis racket',
|
57 |
39: 'bottle', 40: 'wine glass', 41: 'cup', 42: 'fork', 43: 'knife',
|
58 |
+
44: 'spoon', 45: 'bowl', 46: 'banana', 47: 'apple', 48: 'sandwich',
|
59 |
49: 'orange', 50: 'broccoli', 51: 'carrot', 52: 'hot dog', 53: 'pizza',
|
60 |
+
54: 'donut', 55: 'cake', 56: 'chair', 57: 'couch', 58: 'potted plant',
|
61 |
59: 'bed', 60: 'dining table', 61: 'toilet', 62: 'tv', 63: 'laptop',
|
62 |
+
64: 'mouse', 65: 'remote', 66: 'keyboard', 67: 'cell phone', 68: 'microwave',
|
63 |
69: 'oven', 70: 'toaster', 71: 'sink', 72: 'refrigerator', 73: 'book',
|
64 |
+
74: 'clock', 75: 'vase', 76: 'scissors', 77: 'teddy bear', 78: 'hair drier',
|
65 |
79: 'toothbrush'
|
66 |
}
|
67 |
+
|
68 |
def _hsv_to_rgb(self, h: float, s: float, v: float) -> Tuple[int, int, int]:
|
69 |
"""
|
70 |
Convert HSV color to RGB
|
71 |
+
|
72 |
Args:
|
73 |
h: Hue (0-360)
|
74 |
s: Saturation (0-1)
|
75 |
v: Value (0-1)
|
76 |
+
|
77 |
Returns:
|
78 |
Tuple of (R, G, B) values (0-255)
|
79 |
"""
|
|
|
83 |
p = v * (1 - s)
|
84 |
q = v * (1 - s * f)
|
85 |
t = v * (1 - s * (1 - f))
|
86 |
+
|
87 |
if i == 0:
|
88 |
r, g, b = v, t, p
|
89 |
elif i == 1:
|
|
|
96 |
r, g, b = t, p, v
|
97 |
else:
|
98 |
r, g, b = v, p, q
|
99 |
+
|
100 |
return (int(r * 255), int(g * 255), int(b * 255))
|
101 |
+
|
102 |
def _rgb_to_hex(self, rgb: Tuple[int, int, int]) -> str:
|
103 |
"""
|
104 |
Convert RGB color to hex color code
|
105 |
+
|
106 |
Args:
|
107 |
rgb: Tuple of (R, G, B) values (0-255)
|
108 |
+
|
109 |
Returns:
|
110 |
Hex color code (e.g. '#FF0000')
|
111 |
"""
|
112 |
return f'#{rgb[0]:02x}{rgb[1]:02x}{rgb[2]:02x}'
|
113 |
+
|
114 |
def _find_category(self, class_id: int) -> str:
|
115 |
"""
|
116 |
Find the category for a given class ID
|
117 |
+
|
118 |
Args:
|
119 |
class_id: Class ID (0-79)
|
120 |
+
|
121 |
Returns:
|
122 |
Category name
|
123 |
"""
|
|
|
125 |
if class_id in ids:
|
126 |
return category
|
127 |
return "other" # Fallback
|
128 |
+
|
129 |
def _generate_color_map(self) -> Dict:
|
130 |
"""
|
131 |
Generate a color map for all 80 COCO classes
|
132 |
+
|
133 |
Returns:
|
134 |
Dictionary mapping class IDs and names to color values
|
135 |
"""
|
|
|
138 |
'by_name': {}, # Map class name to RGB and hex
|
139 |
'categories': {} # Map category to base color
|
140 |
}
|
141 |
+
|
142 |
# Generate colors for categories
|
143 |
for category, hsv in self.CATEGORY_COLORS.items():
|
144 |
rgb = self._hsv_to_rgb(hsv[0], hsv[1], hsv[2])
|
|
|
147 |
'rgb': rgb,
|
148 |
'hex': hex_color
|
149 |
}
|
150 |
+
|
151 |
# Generate variations for each class within a category
|
152 |
for class_id, class_name in self.class_names.items():
|
153 |
category = self._find_category(class_id)
|
154 |
base_hsv = self.CATEGORY_COLORS.get(category, (0, 0, 0.8)) # Default gray
|
155 |
+
|
156 |
# Slightly vary the hue and saturation within the category
|
157 |
ids_in_category = self.CATEGORIES.get(category, [])
|
158 |
if ids_in_category:
|
159 |
position = ids_in_category.index(class_id) if class_id in ids_in_category else 0
|
160 |
variation = position / max(1, len(ids_in_category) - 1) # 0 to 1
|
161 |
+
|
162 |
# Vary hue slightly (±15°) and saturation
|
163 |
h_offset = 30 * variation - 15 # -15 to +15
|
164 |
s_offset = 0.2 * variation # 0 to 0.2
|
165 |
+
|
166 |
h = (base_hsv[0] + h_offset) % 360
|
167 |
s = min(1.0, base_hsv[1] + s_offset)
|
168 |
v = base_hsv[2]
|
169 |
else:
|
170 |
h, s, v = base_hsv
|
171 |
+
|
172 |
rgb = self._hsv_to_rgb(h, s, v)
|
173 |
hex_color = self._rgb_to_hex(rgb)
|
174 |
+
|
175 |
# Store in both mappings
|
176 |
color_map['by_id'][class_id] = {
|
177 |
'rgb': rgb,
|
178 |
'hex': hex_color,
|
179 |
'category': category
|
180 |
}
|
181 |
+
|
182 |
color_map['by_name'][class_name] = {
|
183 |
'rgb': rgb,
|
184 |
'hex': hex_color,
|
185 |
'category': category
|
186 |
}
|
187 |
+
|
188 |
return color_map
|
189 |
+
|
190 |
def get_color(self, class_identifier: Union[int, str], format: str = 'hex') -> Any:
|
191 |
"""
|
192 |
Get color for a specific class
|
193 |
+
|
194 |
Args:
|
195 |
class_identifier: Class ID (int) or name (str)
|
196 |
format: Color format ('hex', 'rgb', or 'bgr')
|
197 |
+
|
198 |
Returns:
|
199 |
Color in requested format
|
200 |
"""
|
|
|
203 |
color_info = self.color_map['by_id'].get(class_identifier)
|
204 |
else:
|
205 |
color_info = self.color_map['by_name'].get(class_identifier)
|
206 |
+
|
207 |
if not color_info:
|
208 |
# Fallback color if not found
|
209 |
return '#CCCCCC' if format == 'hex' else (204, 204, 204)
|
210 |
+
|
211 |
if format == 'hex':
|
212 |
return color_info['hex']
|
213 |
elif format == 'rgb':
|
|
|
218 |
return (b, g, r)
|
219 |
else:
|
220 |
return color_info['rgb']
|
221 |
+
|
222 |
def get_all_colors(self, format: str = 'hex') -> Dict:
|
223 |
"""
|
224 |
Get all colors in the specified format
|
225 |
+
|
226 |
Args:
|
227 |
format: Color format ('hex', 'rgb', or 'bgr')
|
228 |
+
|
229 |
Returns:
|
230 |
Dictionary mapping class names to colors
|
231 |
"""
|
|
|
233 |
for class_id, class_name in self.class_names.items():
|
234 |
result[class_name] = self.get_color(class_id, format)
|
235 |
return result
|
236 |
+
|
237 |
def get_category_colors(self, format: str = 'hex') -> Dict:
|
238 |
"""
|
239 |
Get base colors for each category
|
240 |
+
|
241 |
Args:
|
242 |
format: Color format ('hex', 'rgb', or 'bgr')
|
243 |
+
|
244 |
Returns:
|
245 |
Dictionary mapping categories to colors
|
246 |
"""
|
|
|
254 |
else:
|
255 |
result[category] = color_info['rgb']
|
256 |
return result
|
257 |
+
|
258 |
def get_category_for_class(self, class_identifier: Union[int, str]) -> str:
|
259 |
"""
|
260 |
Get the category for a specific class
|
261 |
+
|
262 |
Args:
|
263 |
class_identifier: Class ID (int) or name (str)
|
264 |
+
|
265 |
Returns:
|
266 |
Category name
|
267 |
"""
|
confifence_templates.py
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
CONFIDENCE_TEMPLATES = {
|
3 |
+
"high": "{description} {details}",
|
4 |
+
"medium": "This appears to be {description} {details}",
|
5 |
+
"low": "This might be {description}, but the confidence is low. {details}"
|
6 |
+
}
|
cultural_templates.py
ADDED
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
CULTURAL_TEMPLATES = {
|
3 |
+
"asian": {
|
4 |
+
"elements": ["character signage", "lanterns", "dense urban layout"],
|
5 |
+
"description": "The scene shows distinctive Asian cultural elements such as {elements}."
|
6 |
+
},
|
7 |
+
"european": {
|
8 |
+
"elements": ["classical architecture", "cobblestone streets", "café terraces"],
|
9 |
+
"description": "The environment has European characteristics including {elements}."
|
10 |
+
},
|
11 |
+
"middle_eastern": {
|
12 |
+
"elements": ["ornate archways", "geometric patterns", "domed structures"],
|
13 |
+
"description": "The scene contains Middle Eastern architectural features such as {elements}."
|
14 |
+
},
|
15 |
+
"north_american": {
|
16 |
+
"elements": ["grid street pattern", "modern skyscrapers", "wide boulevards"],
|
17 |
+
"description": "The layout shows typical North American urban design with {elements}."
|
18 |
+
}
|
19 |
+
}
|
detection_model.py
CHANGED
@@ -6,7 +6,7 @@ import os
|
|
6 |
|
7 |
class DetectionModel:
|
8 |
"""Core detection model class for object detection using YOLOv8"""
|
9 |
-
|
10 |
# Model information dictionary
|
11 |
MODEL_INFO = {
|
12 |
"yolov8n.pt": {
|
@@ -28,11 +28,11 @@ class DetectionModel:
|
|
28 |
"inference_speed": "Slower"
|
29 |
}
|
30 |
}
|
31 |
-
|
32 |
-
def __init__(self, model_name: str = 'yolov8m.pt', confidence: float = 0.25, iou: float = 0.
|
33 |
"""
|
34 |
Initialize the detection model
|
35 |
-
|
36 |
Args:
|
37 |
model_name: Model name or path, default is yolov8m.pt
|
38 |
confidence: Confidence threshold, default is 0.25
|
@@ -44,10 +44,10 @@ class DetectionModel:
|
|
44 |
self.model = None
|
45 |
self.class_names = {}
|
46 |
self.is_model_loaded = False
|
47 |
-
|
48 |
# Load model on initialization
|
49 |
self._load_model()
|
50 |
-
|
51 |
def _load_model(self):
|
52 |
"""Load the YOLO model"""
|
53 |
try:
|
@@ -60,57 +60,57 @@ class DetectionModel:
|
|
60 |
except Exception as e:
|
61 |
print(f"Error occurred when loading the model: {e}")
|
62 |
self.is_model_loaded = False
|
63 |
-
|
64 |
def change_model(self, new_model_name: str) -> bool:
|
65 |
"""
|
66 |
Change the currently loaded model
|
67 |
-
|
68 |
Args:
|
69 |
new_model_name: Name of the new model to load
|
70 |
-
|
71 |
Returns:
|
72 |
bool: True if model changed successfully, False otherwise
|
73 |
"""
|
74 |
if self.model_name == new_model_name and self.is_model_loaded:
|
75 |
print(f"Model {new_model_name} is already loaded")
|
76 |
return True
|
77 |
-
|
78 |
print(f"Changing model from {self.model_name} to {new_model_name}")
|
79 |
-
|
80 |
# Unload current model to free memory
|
81 |
if self.model is not None:
|
82 |
del self.model
|
83 |
self.model = None
|
84 |
-
|
85 |
# Clean GPU memory if available
|
86 |
if torch.cuda.is_available():
|
87 |
torch.cuda.empty_cache()
|
88 |
-
|
89 |
# Update model name and load new model
|
90 |
self.model_name = new_model_name
|
91 |
self._load_model()
|
92 |
-
|
93 |
return self.is_model_loaded
|
94 |
-
|
95 |
def reload_model(self):
|
96 |
"""Reload the model (useful for changing model or after error)"""
|
97 |
if self.model is not None:
|
98 |
del self.model
|
99 |
self.model = None
|
100 |
-
|
101 |
# Clean GPU memory if available
|
102 |
if torch.cuda.is_available():
|
103 |
torch.cuda.empty_cache()
|
104 |
-
|
105 |
self._load_model()
|
106 |
-
|
107 |
def detect(self, image_input: Any) -> Optional[Any]:
|
108 |
"""
|
109 |
Perform object detection on a single image
|
110 |
-
|
111 |
Args:
|
112 |
image_input: Image path (str), PIL Image, or numpy array
|
113 |
-
|
114 |
Returns:
|
115 |
Detection result object or None if error occurred
|
116 |
"""
|
@@ -120,27 +120,27 @@ class DetectionModel:
|
|
120 |
if self.model is None or not self.is_model_loaded:
|
121 |
print("Failed to load model. Cannot perform detection.")
|
122 |
return None
|
123 |
-
|
124 |
try:
|
125 |
results = self.model(image_input, conf=self.confidence, iou=self.iou)
|
126 |
return results[0]
|
127 |
except Exception as e:
|
128 |
print(f"Error occurred during detection: {e}")
|
129 |
return None
|
130 |
-
|
131 |
def get_class_names(self, class_id: int) -> str:
|
132 |
"""Get class name for a given class ID"""
|
133 |
return self.class_names.get(class_id, "Unknown Class")
|
134 |
-
|
135 |
def get_supported_classes(self) -> Dict[int, str]:
|
136 |
"""Get all supported classes as a dictionary of {id: class_name}"""
|
137 |
return self.class_names
|
138 |
-
|
139 |
@classmethod
|
140 |
def get_available_models(cls) -> List[Dict]:
|
141 |
"""
|
142 |
Get list of available models with their information
|
143 |
-
|
144 |
Returns:
|
145 |
List of dictionaries containing model information
|
146 |
"""
|
@@ -154,7 +154,7 @@ class DetectionModel:
|
|
154 |
"inference_speed": info["inference_speed"]
|
155 |
})
|
156 |
return models
|
157 |
-
|
158 |
@classmethod
|
159 |
def get_model_description(cls, model_name: str) -> str:
|
160 |
"""Get description for a specific model"""
|
|
|
6 |
|
7 |
class DetectionModel:
|
8 |
"""Core detection model class for object detection using YOLOv8"""
|
9 |
+
|
10 |
# Model information dictionary
|
11 |
MODEL_INFO = {
|
12 |
"yolov8n.pt": {
|
|
|
28 |
"inference_speed": "Slower"
|
29 |
}
|
30 |
}
|
31 |
+
|
32 |
+
def __init__(self, model_name: str = 'yolov8m.pt', confidence: float = 0.25, iou: float = 0.25):
|
33 |
"""
|
34 |
Initialize the detection model
|
35 |
+
|
36 |
Args:
|
37 |
model_name: Model name or path, default is yolov8m.pt
|
38 |
confidence: Confidence threshold, default is 0.25
|
|
|
44 |
self.model = None
|
45 |
self.class_names = {}
|
46 |
self.is_model_loaded = False
|
47 |
+
|
48 |
# Load model on initialization
|
49 |
self._load_model()
|
50 |
+
|
51 |
def _load_model(self):
|
52 |
"""Load the YOLO model"""
|
53 |
try:
|
|
|
60 |
except Exception as e:
|
61 |
print(f"Error occurred when loading the model: {e}")
|
62 |
self.is_model_loaded = False
|
63 |
+
|
64 |
def change_model(self, new_model_name: str) -> bool:
|
65 |
"""
|
66 |
Change the currently loaded model
|
67 |
+
|
68 |
Args:
|
69 |
new_model_name: Name of the new model to load
|
70 |
+
|
71 |
Returns:
|
72 |
bool: True if model changed successfully, False otherwise
|
73 |
"""
|
74 |
if self.model_name == new_model_name and self.is_model_loaded:
|
75 |
print(f"Model {new_model_name} is already loaded")
|
76 |
return True
|
77 |
+
|
78 |
print(f"Changing model from {self.model_name} to {new_model_name}")
|
79 |
+
|
80 |
# Unload current model to free memory
|
81 |
if self.model is not None:
|
82 |
del self.model
|
83 |
self.model = None
|
84 |
+
|
85 |
# Clean GPU memory if available
|
86 |
if torch.cuda.is_available():
|
87 |
torch.cuda.empty_cache()
|
88 |
+
|
89 |
# Update model name and load new model
|
90 |
self.model_name = new_model_name
|
91 |
self._load_model()
|
92 |
+
|
93 |
return self.is_model_loaded
|
94 |
+
|
95 |
def reload_model(self):
|
96 |
"""Reload the model (useful for changing model or after error)"""
|
97 |
if self.model is not None:
|
98 |
del self.model
|
99 |
self.model = None
|
100 |
+
|
101 |
# Clean GPU memory if available
|
102 |
if torch.cuda.is_available():
|
103 |
torch.cuda.empty_cache()
|
104 |
+
|
105 |
self._load_model()
|
106 |
+
|
107 |
def detect(self, image_input: Any) -> Optional[Any]:
|
108 |
"""
|
109 |
Perform object detection on a single image
|
110 |
+
|
111 |
Args:
|
112 |
image_input: Image path (str), PIL Image, or numpy array
|
113 |
+
|
114 |
Returns:
|
115 |
Detection result object or None if error occurred
|
116 |
"""
|
|
|
120 |
if self.model is None or not self.is_model_loaded:
|
121 |
print("Failed to load model. Cannot perform detection.")
|
122 |
return None
|
123 |
+
|
124 |
try:
|
125 |
results = self.model(image_input, conf=self.confidence, iou=self.iou)
|
126 |
return results[0]
|
127 |
except Exception as e:
|
128 |
print(f"Error occurred during detection: {e}")
|
129 |
return None
|
130 |
+
|
131 |
def get_class_names(self, class_id: int) -> str:
|
132 |
"""Get class name for a given class ID"""
|
133 |
return self.class_names.get(class_id, "Unknown Class")
|
134 |
+
|
135 |
def get_supported_classes(self) -> Dict[int, str]:
|
136 |
"""Get all supported classes as a dictionary of {id: class_name}"""
|
137 |
return self.class_names
|
138 |
+
|
139 |
@classmethod
|
140 |
def get_available_models(cls) -> List[Dict]:
|
141 |
"""
|
142 |
Get list of available models with their information
|
143 |
+
|
144 |
Returns:
|
145 |
List of dictionaries containing model information
|
146 |
"""
|
|
|
154 |
"inference_speed": info["inference_speed"]
|
155 |
})
|
156 |
return models
|
157 |
+
|
158 |
@classmethod
|
159 |
def get_model_description(cls, model_name: str) -> str:
|
160 |
"""Get description for a specific model"""
|
enhance_scene_describer.py
ADDED
@@ -0,0 +1,1314 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import re
|
3 |
+
import json
|
4 |
+
import random
|
5 |
+
import numpy as np
|
6 |
+
from typing import Dict, List, Tuple, Any, Optional
|
7 |
+
|
8 |
+
from scene_type import SCENE_TYPES
|
9 |
+
from scene_detail_templates import SCENE_DETAIL_TEMPLATES
|
10 |
+
from object_template_fillers import OBJECT_TEMPLATE_FILLERS
|
11 |
+
from lighting_conditions import LIGHTING_CONDITIONS
|
12 |
+
from viewpoint_templates import VIEWPOINT_TEMPLATES
|
13 |
+
from cultural_templates import CULTURAL_TEMPLATES
|
14 |
+
from confifence_templates import CONFIDENCE_TEMPLATES
|
15 |
+
|
16 |
+
class EnhancedSceneDescriber:
|
17 |
+
"""
|
18 |
+
Enhanced scene description generator with improved template handling,
|
19 |
+
viewpoint awareness, and cultural context recognition.
|
20 |
+
Provides detailed natural language descriptions of scenes based on
|
21 |
+
detection results and scene classification.
|
22 |
+
"""
|
23 |
+
|
24 |
+
def __init__(self, templates_db: Optional[Dict] = None, scene_types: Optional[Dict] = None):
|
25 |
+
"""
|
26 |
+
Initialize the enhanced scene describer.
|
27 |
+
|
28 |
+
Args:
|
29 |
+
templates_db: Optional custom templates database
|
30 |
+
scene_types: Dictionary of scene type definitions
|
31 |
+
"""
|
32 |
+
# Load or use provided scene types
|
33 |
+
self.scene_types = scene_types or self._load_default_scene_types()
|
34 |
+
|
35 |
+
# Load templates database
|
36 |
+
self.templates = templates_db or self._load_templates()
|
37 |
+
|
38 |
+
# Initialize viewpoint detection parameters
|
39 |
+
self._initialize_viewpoint_parameters()
|
40 |
+
|
41 |
+
def _load_default_scene_types(self) -> Dict:
|
42 |
+
"""
|
43 |
+
Load default scene types.
|
44 |
+
|
45 |
+
Returns:
|
46 |
+
Dict: Scene type definitions
|
47 |
+
"""
|
48 |
+
|
49 |
+
return SCENE_TYPES
|
50 |
+
|
51 |
+
def _load_templates(self) -> Dict:
|
52 |
+
"""
|
53 |
+
Load description templates from imported Python modules.
|
54 |
+
|
55 |
+
Returns:
|
56 |
+
Dict: Template collections for different description components
|
57 |
+
"""
|
58 |
+
templates = {}
|
59 |
+
|
60 |
+
# 直接從導入的 Python 模組中獲取模板
|
61 |
+
templates["scene_detail_templates"] = SCENE_DETAIL_TEMPLATES
|
62 |
+
templates["object_template_fillers"] = OBJECT_TEMPLATE_FILLERS
|
63 |
+
templates["viewpoint_templates"] = VIEWPOINT_TEMPLATES
|
64 |
+
templates["cultural_templates"] = CULTURAL_TEMPLATES
|
65 |
+
|
66 |
+
# 從 LIGHTING_CONDITIONS 獲取照明模板
|
67 |
+
templates["lighting_templates"] = {
|
68 |
+
key: data["general"] for key, data in LIGHTING_CONDITIONS.get("time_descriptions", {}).items()
|
69 |
+
}
|
70 |
+
|
71 |
+
# 設置默認的置信度模板
|
72 |
+
templates["confidence_templates"] = {
|
73 |
+
"high": "{description} {details}",
|
74 |
+
"medium": "This appears to be {description} {details}",
|
75 |
+
"low": "This might be {description}, but the confidence is low. {details}"
|
76 |
+
}
|
77 |
+
|
78 |
+
# 初始化其他必要的模板(現在這個函數簡化了很多)
|
79 |
+
self._initialize_default_templates(templates)
|
80 |
+
|
81 |
+
return templates
|
82 |
+
|
83 |
+
def _initialize_default_templates(self, templates: Dict):
|
84 |
+
"""
|
85 |
+
檢查模板字典並填充任何缺失的默認模板。
|
86 |
+
|
87 |
+
在將模板移至專門的模組後,此方法主要作為安全機制,
|
88 |
+
確保即使導入失敗或某些模板未在外部定義,系統仍能正常運行。
|
89 |
+
|
90 |
+
Args:
|
91 |
+
templates: 要檢查和更新的模板字典
|
92 |
+
"""
|
93 |
+
# 檢查關鍵模板類型是否存在,如果不存在則添加默認值
|
94 |
+
|
95 |
+
# 置信度模板 - 用於控制描述的語氣
|
96 |
+
if "confidence_templates" not in templates:
|
97 |
+
templates["confidence_templates"] = {
|
98 |
+
"high": "{description} {details}",
|
99 |
+
"medium": "This appears to be {description} {details}",
|
100 |
+
"low": "This might be {description}, but the confidence is low. {details}"
|
101 |
+
}
|
102 |
+
|
103 |
+
# 場景細節模板 - 如果未從外部導入
|
104 |
+
if "scene_detail_templates" not in templates:
|
105 |
+
templates["scene_detail_templates"] = {
|
106 |
+
"default": ["A space with various objects."]
|
107 |
+
}
|
108 |
+
|
109 |
+
# 物體填充模板 - 用於生成物體描述
|
110 |
+
if "object_template_fillers" not in templates:
|
111 |
+
templates["object_template_fillers"] = {
|
112 |
+
"default": ["various items"]
|
113 |
+
}
|
114 |
+
|
115 |
+
# 視角模板 - 雖然我們現在從專門模組導入,但作為備份
|
116 |
+
if "viewpoint_templates" not in templates:
|
117 |
+
# 使用簡化版的默認視角模板
|
118 |
+
templates["viewpoint_templates"] = {
|
119 |
+
"eye_level": {
|
120 |
+
"prefix": "From eye level, ",
|
121 |
+
"observation": "the scene is viewed straight on."
|
122 |
+
},
|
123 |
+
"aerial": {
|
124 |
+
"prefix": "From above, ",
|
125 |
+
"observation": "the scene is viewed from a bird's-eye perspective."
|
126 |
+
}
|
127 |
+
}
|
128 |
+
|
129 |
+
# 文化模板
|
130 |
+
if "cultural_templates" not in templates:
|
131 |
+
templates["cultural_templates"] = {
|
132 |
+
"asian": {
|
133 |
+
"elements": ["cultural elements"],
|
134 |
+
"description": "The scene has Asian characteristics."
|
135 |
+
},
|
136 |
+
"european": {
|
137 |
+
"elements": ["architectural features"],
|
138 |
+
"description": "The scene has European characteristics."
|
139 |
+
}
|
140 |
+
}
|
141 |
+
|
142 |
+
# 照明模板 - 用於描述光照條件
|
143 |
+
if "lighting_templates" not in templates:
|
144 |
+
templates["lighting_templates"] = {
|
145 |
+
"day_clear": "The scene is captured during daylight.",
|
146 |
+
"night": "The scene is captured at night.",
|
147 |
+
"unknown": "The lighting conditions are not easily determined."
|
148 |
+
}
|
149 |
+
|
150 |
+
def _initialize_viewpoint_parameters(self):
|
151 |
+
"""
|
152 |
+
Initialize parameters used for viewpoint detection.
|
153 |
+
"""
|
154 |
+
self.viewpoint_params = {
|
155 |
+
# Parameters for detecting aerial views
|
156 |
+
"aerial_threshold": 0.7, # High object density viewed from top
|
157 |
+
"aerial_size_variance_threshold": 0.15, # Low size variance in aerial views
|
158 |
+
|
159 |
+
# Parameters for detecting low angle views
|
160 |
+
"low_angle_threshold": 0.3, # Bottom-heavy object distribution
|
161 |
+
"vertical_size_ratio_threshold": 1.8, # Vertical objects appear taller
|
162 |
+
|
163 |
+
# Parameters for detecting elevated views
|
164 |
+
"elevated_threshold": 0.6, # Objects mostly in middle/bottom
|
165 |
+
"elevated_top_threshold": 0.3 # Few objects at top of frame
|
166 |
+
}
|
167 |
+
|
168 |
+
|
169 |
+
    def generate_description(self,
                            scene_type: str,
                            detected_objects: List[Dict],
                            confidence: float,
                            lighting_info: Optional[Dict] = None,
                            functional_zones: Optional[Dict] = None) -> str:
        """
        Generate enhanced scene description based on detection results, scene type,
        and additional contextual information.

        This is the main entry point that replaces the original _generate_scene_description.

        Args:
            scene_type: Identified scene type
            detected_objects: List of detected objects
            confidence: Scene classification confidence
            lighting_info: Optional lighting condition information
            functional_zones: Optional identified functional zones

        Returns:
            str: Natural language description of the scene

        NOTE(review): this method uses the module-level `re` module near the end;
        `import re` is not among the imports visible in this chunk — confirm it
        exists at the top of the file.
        """
        # Handle unknown scene type or very low confidence
        if scene_type == "unknown" or confidence < 0.4:
            return self._generate_generic_description(detected_objects, lighting_info)

        # Detect viewpoint
        viewpoint = self._detect_viewpoint(detected_objects)

        if viewpoint == "aerial":
            # For intersection-related scenes, force the aerial intersection scene type.
            if "intersection" in scene_type or self._is_intersection(detected_objects):
                scene_type = "aerial_view_intersection"
            # Commercial-area-related scenes.
            elif any(keyword in scene_type for keyword in ["commercial", "shopping", "retail"]):
                scene_type = "aerial_view_commercial_area"
            # Plaza-related scenes.
            elif any(keyword in scene_type for keyword in ["plaza", "square"]):
                scene_type = "aerial_view_plaza"
            # Any other aerial scene defaults to the intersection type.
            else:
                scene_type = "aerial_view_intersection"

        # Detect cultural context — only for non-aerial viewpoints.
        cultural_context = None
        if viewpoint != "aerial":
            cultural_context = self._detect_cultural_context(scene_type, detected_objects)

        # Select appropriate template based on confidence
        if confidence > 0.75:
            confidence_level = "high"
        elif confidence > 0.5:
            confidence_level = "medium"
        else:
            confidence_level = "low"

        # Get base description for the scene type
        if viewpoint == "aerial":
            # The locals() guard is always true here (base_description is never
            # assigned earlier in this function); kept as-is to preserve behavior.
            if 'base_description' not in locals():
                base_description = "An aerial view showing the layout and movement patterns from above"
        elif scene_type in self.scene_types:
            base_description = self.scene_types[scene_type].get("description", "A scene")
        else:
            base_description = "A scene"

        # Generate detailed scene information
        scene_details = self._generate_scene_details(
            scene_type,
            detected_objects,
            lighting_info,
            viewpoint
        )

        # Improve the description according to the number of people detected.
        people_objs = [obj for obj in detected_objects if obj["class_id"] == 0]  # class 0 = person
        if people_objs:
            people_count = len(people_objs)
            if people_count > 5:
                # Use more precise wording when many people are present.
                people_phrase = f"numerous people ({people_count})"
            else:
                people_phrase = f"{people_count} {'people' if people_count > 1 else 'person'}"

            # Append the people count only if the details don't already mention people.
            if "people" not in scene_details.lower() and "pedestrian" not in scene_details.lower():
                scene_details += f" The scene includes {people_phrase}."

        # Apply cultural context if detected (only for non-aerial viewpoints).
        if cultural_context and scene_details and viewpoint != "aerial":
            cultural_elements = self._generate_cultural_elements(cultural_context)
            if cultural_elements:
                scene_details += f" {cultural_elements}"

        # Include lighting information if available
        lighting_description = ""
        if lighting_info and "time_of_day" in lighting_info:
            lighting_type = lighting_info["time_of_day"]
            if lighting_type in self.templates.get("lighting_templates", {}):
                lighting_description = self.templates["lighting_templates"][lighting_type]

        # Apply confidence template
        description_template = self.templates["confidence_templates"].get(
            confidence_level, "{description} {details}"
        )

        # Fill the template
        description = description_template.format(
            description=base_description,
            details=scene_details
        )

        # Add viewpoint observation if viewpoint is not standard
        if viewpoint != "eye_level" and viewpoint in self.templates.get("viewpoint_templates", {}):
            viewpoint_template = self.templates["viewpoint_templates"][viewpoint]

            # For aerial views, make the observation reflect more detail.
            if viewpoint == "aerial":
                scene_elements = "the crossing patterns and pedestrian movement"
            else:
                scene_elements = "objects and layout"

            viewpoint_desc = viewpoint_template.get("observation", "").format(
                scene_elements=scene_elements
            )

            # Add viewpoint prefix if needed
            if not description.startswith(viewpoint_template.get("prefix", "")):
                description = f"{viewpoint_template.get('prefix', '')}{description}"

            # Add viewpoint observation if not already included
            if viewpoint_desc not in description:
                description += f" {viewpoint_desc}"

        # Add lighting description if available
        if lighting_description and lighting_description not in description:
            description += f" {lighting_description}"

        # Add information about functional zones if available
        if functional_zones and len(functional_zones) > 0:
            zones_desc = self._describe_functional_zones(functional_zones)
            if zones_desc:
                description += f" {zones_desc}"

        # Compute the true number of people detected.
        people_count = len([obj for obj in detected_objects if obj["class_id"] == 0])

        # Check the description for contradictory (smaller) people counts.
        if people_count > 5:
            # Patterns that may carry a smaller people count.
            small_people_patterns = [
                r"Area with \d+ people\.",
                r"Area with \d+ person\.",
                r"with \d+ people",
                r"with \d+ person"
            ]
            # Check each pattern and remove contradicting sentences.
            filtered_description = description
            for pattern in small_people_patterns:
                matches = re.findall(pattern, filtered_description)
                for match in matches:
                    # Extract the count from the match.
                    number_match = re.search(r'\d+', match)
                    if number_match:
                        try:
                            people_mentioned = int(number_match.group())
                            # If the mentioned count is smaller than the true total,
                            # drop the whole sentence containing it.
                            if people_mentioned < people_count:
                                # Split the description into sentences.
                                sentences = re.split(r'(?<=[.!?])\s+', filtered_description)
                                # Keep only sentences that don't contain the match.
                                filtered_sentences = []
                                for sentence in sentences:
                                    if match not in sentence:
                                        filtered_sentences.append(sentence)
                                # Reassemble the description.
                                filtered_description = " ".join(filtered_sentences)
                        except ValueError:
                            # Number conversion failed; keep going.
                            # (int() on a \d+ match cannot actually raise here —
                            # defensive guard kept to preserve behavior.)
                            continue

            # Use the filtered description.
            description = filtered_description

        return description
|
354 |
+
|
355 |
+
def _is_intersection(self, detected_objects: List[Dict]) -> bool:
|
356 |
+
"""
|
357 |
+
通過分析物體分佈來判斷場景是否為十字路口
|
358 |
+
"""
|
359 |
+
# 檢查行人分佈模式
|
360 |
+
pedestrians = [obj for obj in detected_objects if obj["class_id"] == 0]
|
361 |
+
|
362 |
+
if len(pedestrians) >= 8: # 需要足夠的行人來形成十字路口
|
363 |
+
# 抓取行人位置
|
364 |
+
positions = [obj.get("normalized_center", (0, 0)) for obj in pedestrians]
|
365 |
+
|
366 |
+
# 分析 x 和 y 坐標分佈
|
367 |
+
x_coords = [pos[0] for pos in positions]
|
368 |
+
y_coords = [pos[1] for pos in positions]
|
369 |
+
|
370 |
+
# 計算 x 和 y 坐標的變異數
|
371 |
+
x_variance = np.var(x_coords) if len(x_coords) > 1 else 0
|
372 |
+
y_variance = np.var(y_coords) if len(y_coords) > 1 else 0
|
373 |
+
|
374 |
+
# 計算範圍
|
375 |
+
x_range = max(x_coords) - min(x_coords)
|
376 |
+
y_range = max(y_coords) - min(y_coords)
|
377 |
+
|
378 |
+
# 如果 x 和 y 方向都有較大範圍且範圍相似,那就有可能是十字路口
|
379 |
+
if x_range > 0.5 and y_range > 0.5 and 0.7 < (x_range / y_range) < 1.3:
|
380 |
+
return True
|
381 |
+
|
382 |
+
return False
|
383 |
+
|
384 |
+
def _generate_generic_description(self, detected_objects: List[Dict], lighting_info: Optional[Dict] = None) -> str:
|
385 |
+
"""
|
386 |
+
Generate a generic description when scene type is unknown or confidence is very low.
|
387 |
+
|
388 |
+
Args:
|
389 |
+
detected_objects: List of detected objects
|
390 |
+
lighting_info: Optional lighting condition information
|
391 |
+
|
392 |
+
Returns:
|
393 |
+
str: Generic description based on detected objects
|
394 |
+
"""
|
395 |
+
# Count object occurrences
|
396 |
+
obj_counts = {}
|
397 |
+
for obj in detected_objects:
|
398 |
+
class_name = obj["class_name"]
|
399 |
+
if class_name not in obj_counts:
|
400 |
+
obj_counts[class_name] = 0
|
401 |
+
obj_counts[class_name] += 1
|
402 |
+
|
403 |
+
# Get top objects by count
|
404 |
+
top_objects = sorted(obj_counts.items(), key=lambda x: x[1], reverse=True)[:5]
|
405 |
+
|
406 |
+
if not top_objects:
|
407 |
+
base_desc = "No clearly identifiable objects are visible in this scene."
|
408 |
+
else:
|
409 |
+
# Format object list
|
410 |
+
objects_text = []
|
411 |
+
for name, count in top_objects:
|
412 |
+
if count > 1:
|
413 |
+
objects_text.append(f"{count} {name}s")
|
414 |
+
else:
|
415 |
+
objects_text.append(name)
|
416 |
+
|
417 |
+
if len(objects_text) == 1:
|
418 |
+
objects_list = objects_text[0]
|
419 |
+
elif len(objects_text) == 2:
|
420 |
+
objects_list = f"{objects_text[0]} and {objects_text[1]}"
|
421 |
+
else:
|
422 |
+
objects_list = ", ".join(objects_text[:-1]) + f", and {objects_text[-1]}"
|
423 |
+
|
424 |
+
base_desc = f"This scene contains {objects_list}."
|
425 |
+
|
426 |
+
# Add lighting information if available
|
427 |
+
if lighting_info and "time_of_day" in lighting_info:
|
428 |
+
lighting_type = lighting_info["time_of_day"]
|
429 |
+
if lighting_type in self.templates.get("lighting_templates", {}):
|
430 |
+
lighting_desc = self.templates["lighting_templates"][lighting_type]
|
431 |
+
base_desc += f" {lighting_desc}"
|
432 |
+
|
433 |
+
return base_desc
|
434 |
+
|
435 |
+
    def _generate_scene_details(self,
                            scene_type: str,
                            detected_objects: List[Dict],
                            lighting_info: Optional[Dict] = None,
                            viewpoint: str = "eye_level") -> str:
        """
        Generate detailed description based on scene type and detected objects.

        Args:
            scene_type: Identified scene type
            detected_objects: List of detected objects
            lighting_info: Optional lighting condition information
            viewpoint: Detected viewpoint (aerial, eye_level, etc.)

        Returns:
            str: Detailed scene description

        NOTE(review): lighting_info is accepted but not used in this method.
        NOTE(review): uses module-level `random` — `import random` is not among
        the imports visible in this chunk; confirm it exists at the top of the file.
        """
        # Get scene-specific templates
        scene_details = ""
        scene_templates = self.templates.get("scene_detail_templates", {})

        # Handle specific scene types
        if scene_type in scene_templates:
            # Select a template appropriate for the viewpoint if available
            viewpoint_key = f"{scene_type}_{viewpoint}"

            if viewpoint_key in scene_templates:
                # We have a viewpoint-specific template
                templates_list = scene_templates[viewpoint_key]
            else:
                # Fall back to general templates for this scene type
                templates_list = scene_templates[scene_type]

            # Select a random template from the list
            if templates_list:
                detail_template = random.choice(templates_list)

                # Fill the template with object information
                scene_details = self._fill_detail_template(
                    detail_template,
                    detected_objects,
                    scene_type
                )
        else:
            # Use default templates if specific ones aren't available
            if "default" in scene_templates:
                detail_template = random.choice(scene_templates["default"])
                scene_details = self._fill_detail_template(
                    detail_template,
                    detected_objects,
                    "default"
                )
            else:
                # Fall back to basic description if no templates are available
                scene_details = self._generate_basic_details(scene_type, detected_objects)

        return scene_details
|
492 |
+
|
493 |
+
def _fill_detail_template(self, template: str, detected_objects: List[Dict], scene_type: str) -> str:
|
494 |
+
"""
|
495 |
+
Fill a template with specific details based on detected objects.
|
496 |
+
|
497 |
+
Args:
|
498 |
+
template: Template string with placeholders
|
499 |
+
detected_objects: List of detected objects
|
500 |
+
scene_type: Identified scene type
|
501 |
+
|
502 |
+
Returns:
|
503 |
+
str: Filled template
|
504 |
+
"""
|
505 |
+
# Find placeholders in the template using simple {placeholder} syntax
|
506 |
+
import re
|
507 |
+
placeholders = re.findall(r'\{([^}]+)\}', template)
|
508 |
+
|
509 |
+
filled_template = template
|
510 |
+
|
511 |
+
# Get object template fillers
|
512 |
+
fillers = self.templates.get("object_template_fillers", {})
|
513 |
+
|
514 |
+
# 為所有可能的變數設置默認值
|
515 |
+
default_replacements = {
|
516 |
+
# 室內相關
|
517 |
+
"furniture": "various furniture pieces",
|
518 |
+
"seating": "comfortable seating",
|
519 |
+
"electronics": "entertainment devices",
|
520 |
+
"bed_type": "a bed",
|
521 |
+
"bed_location": "room",
|
522 |
+
"bed_description": "sleeping arrangements",
|
523 |
+
"extras": "personal items",
|
524 |
+
"table_setup": "a dining table and chairs",
|
525 |
+
"table_description": "a dining surface",
|
526 |
+
"dining_items": "dining furniture and tableware",
|
527 |
+
"appliances": "kitchen appliances",
|
528 |
+
"kitchen_items": "cooking utensils and dishware",
|
529 |
+
"cooking_equipment": "cooking equipment",
|
530 |
+
"office_equipment": "work-related furniture and devices",
|
531 |
+
"desk_setup": "a desk and chair",
|
532 |
+
"computer_equipment": "electronic devices",
|
533 |
+
|
534 |
+
# 室外/城市相關
|
535 |
+
"traffic_description": "vehicles and pedestrians",
|
536 |
+
"people_and_vehicles": "people and various vehicles",
|
537 |
+
"street_elements": "urban infrastructure",
|
538 |
+
"park_features": "benches and greenery",
|
539 |
+
"outdoor_elements": "natural features",
|
540 |
+
"park_description": "outdoor amenities",
|
541 |
+
"store_elements": "merchandise displays",
|
542 |
+
"shopping_activity": "customers browse and shop",
|
543 |
+
"store_items": "products for sale",
|
544 |
+
|
545 |
+
# 高級餐廳相關
|
546 |
+
"design_elements": "elegant decor",
|
547 |
+
"lighting": "stylish lighting fixtures",
|
548 |
+
|
549 |
+
# 亞洲商業街相關
|
550 |
+
"storefront_features": "compact shops",
|
551 |
+
"pedestrian_flow": "people walking",
|
552 |
+
"asian_elements": "distinctive cultural elements",
|
553 |
+
"cultural_elements": "traditional design features",
|
554 |
+
"signage": "colorful signs",
|
555 |
+
"street_activities": "busy urban activity",
|
556 |
+
|
557 |
+
# 金融區相關
|
558 |
+
"buildings": "tall buildings",
|
559 |
+
"traffic_elements": "vehicles",
|
560 |
+
"skyscrapers": "high-rise buildings",
|
561 |
+
"road_features": "wide streets",
|
562 |
+
"architectural_elements": "modern architecture",
|
563 |
+
"city_landmarks": "prominent structures",
|
564 |
+
|
565 |
+
# 十字路口相關
|
566 |
+
"crossing_pattern": "marked pedestrian crossings",
|
567 |
+
"pedestrian_behavior": "careful walking",
|
568 |
+
"pedestrian_density": "groups of pedestrians",
|
569 |
+
"traffic_pattern": "regulated traffic flow",
|
570 |
+
|
571 |
+
# 交通樞紐相關
|
572 |
+
"transit_vehicles": "public transportation vehicles",
|
573 |
+
"passenger_activity": "commuter movement",
|
574 |
+
"transportation_modes": "various transit options",
|
575 |
+
"passenger_needs": "waiting areas",
|
576 |
+
"transit_infrastructure": "transit facilities",
|
577 |
+
"passenger_movement": "commuter flow",
|
578 |
+
|
579 |
+
# 購物區相關
|
580 |
+
"retail_elements": "shops and displays",
|
581 |
+
"store_types": "various retail establishments",
|
582 |
+
"walkway_features": "pedestrian pathways",
|
583 |
+
"commercial_signage": "store signs",
|
584 |
+
"consumer_behavior": "shopping activities",
|
585 |
+
|
586 |
+
# 空中視角相關
|
587 |
+
"commercial_layout": "organized retail areas",
|
588 |
+
"pedestrian_pattern": "people movement patterns",
|
589 |
+
"gathering_features": "public gathering spaces",
|
590 |
+
"movement_pattern": "crowd flow patterns",
|
591 |
+
"urban_elements": "city infrastructure",
|
592 |
+
"public_activity": "social interaction",
|
593 |
+
|
594 |
+
# 文化特定元素
|
595 |
+
"stall_elements": "vendor booths",
|
596 |
+
"lighting_features": "decorative lights",
|
597 |
+
"food_elements": "food offerings",
|
598 |
+
"vendor_stalls": "market stalls",
|
599 |
+
"nighttime_activity": "evening commerce",
|
600 |
+
"cultural_lighting": "traditional lighting",
|
601 |
+
"night_market_sounds": "lively market sounds",
|
602 |
+
"evening_crowd_behavior": "nighttime social activity",
|
603 |
+
"architectural_elements": "cultural buildings",
|
604 |
+
"religious_structures": "sacred buildings",
|
605 |
+
"decorative_features": "ornamental designs",
|
606 |
+
"cultural_practices": "traditional activities",
|
607 |
+
"temple_architecture": "religious structures",
|
608 |
+
"sensory_elements": "atmospheric elements",
|
609 |
+
"visitor_activities": "cultural experiences",
|
610 |
+
"ritual_activities": "ceremonial practices",
|
611 |
+
"cultural_symbols": "meaningful symbols",
|
612 |
+
"architectural_style": "historical buildings",
|
613 |
+
"historic_elements": "traditional architecture",
|
614 |
+
"urban_design": "city planning elements",
|
615 |
+
"social_behaviors": "public interactions",
|
616 |
+
"european_features": "European architectural details",
|
617 |
+
"tourist_activities": "visitor activities",
|
618 |
+
"local_customs": "regional practices",
|
619 |
+
|
620 |
+
# 時間特定元素
|
621 |
+
"lighting_effects": "artificial lighting",
|
622 |
+
"shadow_patterns": "light and shadow",
|
623 |
+
"urban_features": "city elements",
|
624 |
+
"illuminated_elements": "lit structures",
|
625 |
+
"evening_activities": "nighttime activities",
|
626 |
+
"light_sources": "lighting points",
|
627 |
+
"lit_areas": "illuminated spaces",
|
628 |
+
"shadowed_zones": "darker areas",
|
629 |
+
"illuminated_signage": "bright signs",
|
630 |
+
"colorful_lighting": "multicolored lights",
|
631 |
+
"neon_elements": "neon signs",
|
632 |
+
"night_crowd_behavior": "evening social patterns",
|
633 |
+
"light_displays": "lighting installations",
|
634 |
+
"building_features": "architectural elements",
|
635 |
+
"nightlife_activities": "evening entertainment",
|
636 |
+
"lighting_modifier": "bright",
|
637 |
+
|
638 |
+
# 混合環境元素
|
639 |
+
"transitional_elements": "connecting features",
|
640 |
+
"indoor_features": "interior elements",
|
641 |
+
"outdoor_setting": "exterior spaces",
|
642 |
+
"interior_amenities": "inside comforts",
|
643 |
+
"exterior_features": "outside elements",
|
644 |
+
"inside_elements": "interior design",
|
645 |
+
"outside_spaces": "outdoor areas",
|
646 |
+
"dual_environment_benefits": "combined settings",
|
647 |
+
"passenger_activities": "waiting behaviors",
|
648 |
+
"transportation_types": "transit vehicles",
|
649 |
+
"sheltered_elements": "covered areas",
|
650 |
+
"exposed_areas": "open sections",
|
651 |
+
"waiting_behaviors": "passenger activities",
|
652 |
+
"indoor_facilities": "inside services",
|
653 |
+
"platform_features": "transit platform elements",
|
654 |
+
"transit_routines": "transportation procedures",
|
655 |
+
|
656 |
+
# 專門場所元素
|
657 |
+
"seating_arrangement": "spectator seating",
|
658 |
+
"playing_surface": "athletic field",
|
659 |
+
"sporting_activities": "sports events",
|
660 |
+
"spectator_facilities": "viewer accommodations",
|
661 |
+
"competition_space": "sports arena",
|
662 |
+
"sports_events": "athletic competitions",
|
663 |
+
"viewing_areas": "audience sections",
|
664 |
+
"field_elements": "field markings and equipment",
|
665 |
+
"game_activities": "competitive play",
|
666 |
+
"construction_equipment": "building machinery",
|
667 |
+
"building_materials": "construction supplies",
|
668 |
+
"construction_activities": "building work",
|
669 |
+
"work_elements": "construction tools",
|
670 |
+
"structural_components": "building structures",
|
671 |
+
"site_equipment": "construction gear",
|
672 |
+
"raw_materials": "building supplies",
|
673 |
+
"construction_process": "building phases",
|
674 |
+
"medical_elements": "healthcare equipment",
|
675 |
+
"clinical_activities": "medical procedures",
|
676 |
+
"facility_design": "healthcare layout",
|
677 |
+
"healthcare_features": "medical facilities",
|
678 |
+
"patient_interactions": "care activities",
|
679 |
+
"equipment_types": "medical devices",
|
680 |
+
"care_procedures": "health services",
|
681 |
+
"treatment_spaces": "clinical areas",
|
682 |
+
"educational_furniture": "learning furniture",
|
683 |
+
"learning_activities": "educational practices",
|
684 |
+
"instructional_design": "teaching layout",
|
685 |
+
"classroom_elements": "school equipment",
|
686 |
+
"teaching_methods": "educational approaches",
|
687 |
+
"student_engagement": "learning participation",
|
688 |
+
"learning_spaces": "educational areas",
|
689 |
+
"educational_tools": "teaching resources",
|
690 |
+
"knowledge_transfer": "learning exchanges"
|
691 |
+
}
|
692 |
+
|
693 |
+
# For each placeholder, try to fill with appropriate content
|
694 |
+
for placeholder in placeholders:
|
695 |
+
if placeholder in fillers:
|
696 |
+
# Get random filler for this placeholder
|
697 |
+
options = fillers[placeholder]
|
698 |
+
if options:
|
699 |
+
# Select 1-3 items from the options list
|
700 |
+
num_items = min(len(options), random.randint(1, 3))
|
701 |
+
selected_items = random.sample(options, num_items)
|
702 |
+
|
703 |
+
# Create a formatted list
|
704 |
+
if len(selected_items) == 1:
|
705 |
+
replacement = selected_items[0]
|
706 |
+
elif len(selected_items) == 2:
|
707 |
+
replacement = f"{selected_items[0]} and {selected_items[1]}"
|
708 |
+
else:
|
709 |
+
replacement = ", ".join(selected_items[:-1]) + f", and {selected_items[-1]}"
|
710 |
+
|
711 |
+
# Replace the placeholder
|
712 |
+
filled_template = filled_template.replace(f"{{{placeholder}}}", replacement)
|
713 |
+
else:
|
714 |
+
# Try to fill with scene-specific logic
|
715 |
+
replacement = self._generate_placeholder_content(placeholder, detected_objects, scene_type)
|
716 |
+
if replacement:
|
717 |
+
filled_template = filled_template.replace(f"{{{placeholder}}}", replacement)
|
718 |
+
elif placeholder in default_replacements:
|
719 |
+
# Use default replacement if available
|
720 |
+
filled_template = filled_template.replace(f"{{{placeholder}}}", default_replacements[placeholder])
|
721 |
+
else:
|
722 |
+
# Last resort default
|
723 |
+
filled_template = filled_template.replace(f"{{{placeholder}}}", "various items")
|
724 |
+
|
725 |
+
return filled_template
|
726 |
+
|
727 |
+
def _generate_placeholder_content(self, placeholder: str, detected_objects: List[Dict], scene_type: str) -> str:
|
728 |
+
"""
|
729 |
+
Generate content for a template placeholder based on scene-specific logic.
|
730 |
+
|
731 |
+
Args:
|
732 |
+
placeholder: Template placeholder
|
733 |
+
detected_objects: List of detected objects
|
734 |
+
scene_type: Identified scene type
|
735 |
+
|
736 |
+
Returns:
|
737 |
+
str: Content for the placeholder
|
738 |
+
"""
|
739 |
+
# Handle different types of placeholders with custom logic
|
740 |
+
if placeholder == "furniture":
|
741 |
+
# Extract furniture items
|
742 |
+
furniture_ids = [56, 57, 58, 59, 60, 61] # Example furniture IDs
|
743 |
+
furniture_objects = [obj for obj in detected_objects if obj["class_id"] in furniture_ids]
|
744 |
+
|
745 |
+
if furniture_objects:
|
746 |
+
furniture_names = [obj["class_name"] for obj in furniture_objects[:3]]
|
747 |
+
return ", ".join(set(furniture_names))
|
748 |
+
return "various furniture items"
|
749 |
+
|
750 |
+
elif placeholder == "electronics":
|
751 |
+
# Extract electronic items
|
752 |
+
electronics_ids = [62, 63, 64, 65, 66, 67, 68, 69, 70] # Example electronics IDs
|
753 |
+
electronics_objects = [obj for obj in detected_objects if obj["class_id"] in electronics_ids]
|
754 |
+
|
755 |
+
if electronics_objects:
|
756 |
+
electronics_names = [obj["class_name"] for obj in electronics_objects[:3]]
|
757 |
+
return ", ".join(set(electronics_names))
|
758 |
+
return "electronic devices"
|
759 |
+
|
760 |
+
elif placeholder == "people_count":
|
761 |
+
# Count people
|
762 |
+
people_count = len([obj for obj in detected_objects if obj["class_id"] == 0])
|
763 |
+
|
764 |
+
if people_count == 0:
|
765 |
+
return "no people"
|
766 |
+
elif people_count == 1:
|
767 |
+
return "one person"
|
768 |
+
elif people_count < 5:
|
769 |
+
return f"{people_count} people"
|
770 |
+
else:
|
771 |
+
return "several people"
|
772 |
+
|
773 |
+
elif placeholder == "seating":
|
774 |
+
# Extract seating items
|
775 |
+
seating_ids = [56, 57] # chair, sofa
|
776 |
+
seating_objects = [obj for obj in detected_objects if obj["class_id"] in seating_ids]
|
777 |
+
|
778 |
+
if seating_objects:
|
779 |
+
seating_names = [obj["class_name"] for obj in seating_objects[:2]]
|
780 |
+
return ", ".join(set(seating_names))
|
781 |
+
return "seating arrangements"
|
782 |
+
|
783 |
+
# Default case - empty string
|
784 |
+
return ""
|
785 |
+
|
786 |
+
def _generate_basic_details(self, scene_type: str, detected_objects: List[Dict]) -> str:
|
787 |
+
"""
|
788 |
+
Generate basic details when templates aren't available.
|
789 |
+
|
790 |
+
Args:
|
791 |
+
scene_type: Identified scene type
|
792 |
+
detected_objects: List of detected objects
|
793 |
+
|
794 |
+
Returns:
|
795 |
+
str: Basic scene details
|
796 |
+
"""
|
797 |
+
# Handle specific scene types with custom logic
|
798 |
+
if scene_type == "living_room":
|
799 |
+
tv_objs = [obj for obj in detected_objects if obj["class_id"] == 62] # TV
|
800 |
+
sofa_objs = [obj for obj in detected_objects if obj["class_id"] == 57] # Sofa
|
801 |
+
|
802 |
+
if tv_objs and sofa_objs:
|
803 |
+
tv_region = tv_objs[0]["region"]
|
804 |
+
sofa_region = sofa_objs[0]["region"]
|
805 |
+
|
806 |
+
arrangement = f"The TV is in the {tv_region.replace('_', ' ')} of the image, "
|
807 |
+
arrangement += f"while the sofa is in the {sofa_region.replace('_', ' ')}. "
|
808 |
+
|
809 |
+
return f"{arrangement}This appears to be a space designed for relaxation and entertainment."
|
810 |
+
|
811 |
+
elif scene_type == "bedroom":
|
812 |
+
bed_objs = [obj for obj in detected_objects if obj["class_id"] == 59] # Bed
|
813 |
+
|
814 |
+
if bed_objs:
|
815 |
+
bed_region = bed_objs[0]["region"]
|
816 |
+
extra_items = []
|
817 |
+
|
818 |
+
for obj in detected_objects:
|
819 |
+
if obj["class_id"] == 74: # Clock
|
820 |
+
extra_items.append("clock")
|
821 |
+
elif obj["class_id"] == 73: # Book
|
822 |
+
extra_items.append("book")
|
823 |
+
|
824 |
+
extras = ""
|
825 |
+
if extra_items:
|
826 |
+
extras = f" There is also a {' and a '.join(extra_items)} visible."
|
827 |
+
|
828 |
+
return f"The bed is located in the {bed_region.replace('_', ' ')} of the image.{extras}"
|
829 |
+
|
830 |
+
elif scene_type in ["dining_area", "kitchen"]:
|
831 |
+
# Count food and dining-related items
|
832 |
+
food_items = []
|
833 |
+
for obj in detected_objects:
|
834 |
+
if obj["class_id"] in [39, 41, 42, 43, 44, 45]: # Kitchen items
|
835 |
+
food_items.append(obj["class_name"])
|
836 |
+
|
837 |
+
food_str = ""
|
838 |
+
if food_items:
|
839 |
+
unique_items = list(set(food_items))
|
840 |
+
if len(unique_items) <= 3:
|
841 |
+
food_str = f" with {', '.join(unique_items)}"
|
842 |
+
else:
|
843 |
+
food_str = f" with {', '.join(unique_items[:3])} and other items"
|
844 |
+
|
845 |
+
return f"{food_str}."
|
846 |
+
|
847 |
+
elif scene_type == "city_street":
|
848 |
+
# Count people and vehicles
|
849 |
+
people_count = len([obj for obj in detected_objects if obj["class_id"] == 0])
|
850 |
+
vehicle_count = len([obj for obj in detected_objects
|
851 |
+
if obj["class_id"] in [1, 2, 3, 5, 7]]) # Bicycle, car, motorbike, bus, truck
|
852 |
+
|
853 |
+
traffic_desc = ""
|
854 |
+
if people_count > 0 and vehicle_count > 0:
|
855 |
+
traffic_desc = f" with {people_count} {'people' if people_count > 1 else 'person'} and "
|
856 |
+
traffic_desc += f"{vehicle_count} {'vehicles' if vehicle_count > 1 else 'vehicle'}"
|
857 |
+
elif people_count > 0:
|
858 |
+
traffic_desc = f" with {people_count} {'people' if people_count > 1 else 'person'}"
|
859 |
+
elif vehicle_count > 0:
|
860 |
+
traffic_desc = f" with {vehicle_count} {'vehicles' if vehicle_count > 1 else 'vehicle'}"
|
861 |
+
|
862 |
+
return f"{traffic_desc}."
|
863 |
+
|
864 |
+
# Handle more specialized scenes
|
865 |
+
elif scene_type == "asian_commercial_street":
|
866 |
+
# Look for key urban elements
|
867 |
+
people_count = len([obj for obj in detected_objects if obj["class_id"] == 0])
|
868 |
+
vehicle_count = len([obj for obj in detected_objects if obj["class_id"] in [1, 2, 3]])
|
869 |
+
|
870 |
+
# Analyze pedestrian distribution
|
871 |
+
people_positions = []
|
872 |
+
for obj in detected_objects:
|
873 |
+
if obj["class_id"] == 0: # Person
|
874 |
+
people_positions.append(obj["normalized_center"])
|
875 |
+
|
876 |
+
# Check if people are distributed along a line (indicating a walking path)
|
877 |
+
structured_path = False
|
878 |
+
if len(people_positions) >= 3:
|
879 |
+
# Simplified check - see if y-coordinates are similar for multiple people
|
880 |
+
y_coords = [pos[1] for pos in people_positions]
|
881 |
+
y_mean = sum(y_coords) / len(y_coords)
|
882 |
+
y_variance = sum((y - y_mean)**2 for y in y_coords) / len(y_coords)
|
883 |
+
if y_variance < 0.05: # Low variance indicates linear arrangement
|
884 |
+
structured_path = True
|
885 |
+
|
886 |
+
street_desc = "A commercial street with "
|
887 |
+
if people_count > 0:
|
888 |
+
street_desc += f"{people_count} {'pedestrians' if people_count > 1 else 'pedestrian'}"
|
889 |
+
if vehicle_count > 0:
|
890 |
+
street_desc += f" and {vehicle_count} {'vehicles' if vehicle_count > 1 else 'vehicle'}"
|
891 |
+
elif vehicle_count > 0:
|
892 |
+
street_desc += f"{vehicle_count} {'vehicles' if vehicle_count > 1 else 'vehicle'}"
|
893 |
+
else:
|
894 |
+
street_desc += "various commercial elements"
|
895 |
+
|
896 |
+
if structured_path:
|
897 |
+
street_desc += ". The pedestrians appear to be following a defined walking path"
|
898 |
+
|
899 |
+
# Add cultural elements
|
900 |
+
street_desc += ". The signage and architectural elements suggest an Asian urban setting."
|
901 |
+
|
902 |
+
return street_desc
|
903 |
+
|
904 |
+
# Default general description
|
905 |
+
return "The scene contains various elements characteristic of this environment."
|
906 |
+
|
907 |
+
def _detect_viewpoint(self, detected_objects: List[Dict]) -> str:
    """
    Detect the camera viewpoint, with extra logic for recognizing aerial views.

    The heuristic first looks for pedestrian patterns that indicate a top-down
    view (crosswalk intersections, evenly spread crowds), then falls back to
    region/size statistics compared against self.viewpoint_params thresholds.

    Args:
        detected_objects: list of detected object dicts; uses "region",
            "class_id" and, when present, "normalized_area",
            "normalized_size" and "normalized_center"

    Returns:
        str: one of "aerial", "low_angle", "elevated" or "eye_level"
    """
    if not detected_objects:
        return "eye_level"  # default

    # Extract object positions and sizes
    top_region_count = 0
    bottom_region_count = 0
    total_objects = len(detected_objects)

    # Track the size distribution to detect aerial views
    sizes = []

    # Height/width ratios used for low-angle detection
    height_width_ratios = []

    # Variables used for detecting regular (crosswalk-like) patterns
    people_positions = []
    crosswalk_pattern_detected = False

    for obj in detected_objects:
        # Count objects falling in the top/bottom regions
        region = obj["region"]
        if "top" in region:
            top_region_count += 1
        elif "bottom" in region:
            bottom_region_count += 1

        # Collect normalized size (area)
        if "normalized_area" in obj:
            sizes.append(obj["normalized_area"])

        # Compute height/width ratio
        if "normalized_size" in obj:
            width, height = obj["normalized_size"]
            if width > 0:
                height_width_ratios.append(height / width)

        # Collect people positions for pattern detection
        if obj["class_id"] == 0:  # person
            if "normalized_center" in obj:
                people_positions.append(obj["normalized_center"])

    # Dedicated detection logic for crosswalk intersections:
    # look for pronounced vertical and horizontal pedestrian distributions
    people_objs = [obj for obj in detected_objects if obj["class_id"] == 0]  # person

    if len(people_objs) >= 8:  # enough people are needed to form an intersection pattern
        # Check for a crosswalk pattern
        if len(people_positions) >= 4:
            # Cluster the positions, looking for linear distributions
            x_coords = [pos[0] for pos in people_positions]
            y_coords = [pos[1] for pos in people_positions]

            # Compute variance and range of the x and y coordinates
            x_variance = np.var(x_coords) if len(x_coords) > 1 else 0
            y_variance = np.var(y_coords) if len(y_coords) > 1 else 0

            x_range = max(x_coords) - min(x_coords)
            y_range = max(y_coords) - min(y_coords)

            # Try to detect a cross-shaped distribution:
            # large, similar spans in both x and y suggest an intersection
            if x_range > 0.5 and y_range > 0.5 and 0.7 < (x_range / y_range) < 1.3:

                # Distance to the centroid
                center_x = np.mean(x_coords)
                center_y = np.mean(y_coords)

                # Map points onto the axes of the cross (horizontal and vertical)
                x_axis_distance = [abs(x - center_x) for x in x_coords]
                y_axis_distance = [abs(y - center_y) for y in y_coords]

                # Points should lie close to one of the two axes; for each
                # point, check proximity to the horizontal or vertical axis
                close_to_axis_count = 0
                for i in range(len(x_coords)):
                    if x_axis_distance[i] < 0.1 or y_axis_distance[i] < 0.1:
                        close_to_axis_count += 1

                # If enough points lie near an axis, treat it as an intersection
                if close_to_axis_count >= len(x_coords) * 0.6:
                    crosswalk_pattern_detected = True

            # If no cross shape was found, try linear clusters instead
            if not crosswalk_pattern_detected:
                # Check clusters along the x and y directions
                x_clusters = self._detect_linear_clusters(x_coords)
                y_clusters = self._detect_linear_clusters(y_coords)

                # Multiple clusters in both directions may be crossing crosswalks
                if len(x_clusters) >= 2 and len(y_clusters) >= 2:
                    crosswalk_pattern_detected = True

    # Crosswalk pattern takes priority over the other heuristics
    if crosswalk_pattern_detected:
        return "aerial"

    # Examine the overall pedestrian distribution
    if len(people_objs) >= 10:
        people_region_counts = {}
        for obj in people_objs:
            region = obj["region"]
            if region not in people_region_counts:
                people_region_counts[region] = 0
            people_region_counts[region] += 1

        # Count regions holding multiple pedestrians
        region_count = len([r for r, c in people_region_counts.items() if c >= 2])

        # Pedestrians spread over many regions may indicate an aerial view
        if region_count >= 4:
            # Check the pattern of the pedestrian distribution, in particular
            # how evenly the counts are spread across regions
            region_counts = list(people_region_counts.values())
            region_counts_variance = np.var(region_counts) if len(region_counts) > 1 else 0
            region_counts_mean = np.mean(region_counts) if region_counts else 0

            # An even distribution (small coefficient of variation) suggests aerial
            if region_counts_mean > 0:
                variation_coefficient = region_counts_variance / region_counts_mean
                if variation_coefficient < 0.5:
                    return "aerial"

    # Compute the summary metrics
    top_ratio = top_region_count / total_objects if total_objects > 0 else 0
    bottom_ratio = bottom_region_count / total_objects if total_objects > 0 else 0

    # Size variance (normalized so the metric is scale-free)
    size_variance = 0
    if sizes:
        mean_size = sum(sizes) / len(sizes)
        size_variance = sum((s - mean_size) ** 2 for s in sizes) / len(sizes)
        size_variance = size_variance / (mean_size ** 2)  # normalize

    # Average height/width ratio
    avg_height_width_ratio = sum(height_width_ratios) / len(height_width_ratios) if height_width_ratios else 1.0

    # Aerial: low size variance, few/no objects at the bottom, many at the top
    if (size_variance < self.viewpoint_params["aerial_size_variance_threshold"] and
        bottom_ratio < 0.3 and top_ratio > self.viewpoint_params["aerial_threshold"]):
        return "aerial"

    # Low angle: objects tend to be taller than wide, more objects near the top
    elif (avg_height_width_ratio > self.viewpoint_params["vertical_size_ratio_threshold"] and
          top_ratio > self.viewpoint_params["low_angle_threshold"]):
        return "low_angle"

    # Elevated: more objects at the bottom, fewer at the top
    elif (bottom_ratio > self.viewpoint_params["elevated_threshold"] and
          top_ratio < self.viewpoint_params["elevated_top_threshold"]):
        return "elevated"

    # Default: eye level
    return "eye_level"
|
1070 |
+
|
1071 |
+
def _detect_linear_clusters(self, coords, threshold=0.05):
|
1072 |
+
"""
|
1073 |
+
檢測坐標中的線性聚類
|
1074 |
+
|
1075 |
+
Args:
|
1076 |
+
coords: 一維坐標列表
|
1077 |
+
threshold: 聚類閾值
|
1078 |
+
|
1079 |
+
Returns:
|
1080 |
+
list: 聚類列表
|
1081 |
+
"""
|
1082 |
+
if not coords:
|
1083 |
+
return []
|
1084 |
+
|
1085 |
+
# 排序坐標
|
1086 |
+
sorted_coords = sorted(coords)
|
1087 |
+
|
1088 |
+
clusters = []
|
1089 |
+
current_cluster = [sorted_coords[0]]
|
1090 |
+
|
1091 |
+
for i in range(1, len(sorted_coords)):
|
1092 |
+
# 如果當前坐標與前一個接近,添加到當前聚類
|
1093 |
+
if sorted_coords[i] - sorted_coords[i-1] < threshold:
|
1094 |
+
current_cluster.append(sorted_coords[i])
|
1095 |
+
else:
|
1096 |
+
# 否則開始新的聚類
|
1097 |
+
if len(current_cluster) >= 2: # 至少需要2個點形成聚類
|
1098 |
+
clusters.append(current_cluster)
|
1099 |
+
current_cluster = [sorted_coords[i]]
|
1100 |
+
|
1101 |
+
# 添加最後一個cluster
|
1102 |
+
if len(current_cluster) >= 2:
|
1103 |
+
clusters.append(current_cluster)
|
1104 |
+
|
1105 |
+
return clusters
|
1106 |
+
|
1107 |
+
def _detect_cultural_context(self, scene_type: str, detected_objects: List[Dict]) -> Optional[str]:
|
1108 |
+
"""
|
1109 |
+
Detect the likely cultural context of the scene.
|
1110 |
+
|
1111 |
+
Args:
|
1112 |
+
scene_type: Identified scene type
|
1113 |
+
detected_objects: List of detected objects
|
1114 |
+
|
1115 |
+
Returns:
|
1116 |
+
Optional[str]: Detected cultural context (asian, european, etc.) or None
|
1117 |
+
"""
|
1118 |
+
# Scene types with explicit cultural contexts
|
1119 |
+
cultural_scene_mapping = {
|
1120 |
+
"asian_commercial_street": "asian",
|
1121 |
+
"asian_night_market": "asian",
|
1122 |
+
"asian_temple_area": "asian",
|
1123 |
+
"european_plaza": "european"
|
1124 |
+
}
|
1125 |
+
|
1126 |
+
# Check if scene type directly indicates cultural context
|
1127 |
+
if scene_type in cultural_scene_mapping:
|
1128 |
+
return cultural_scene_mapping[scene_type]
|
1129 |
+
|
1130 |
+
# No specific cultural context detected
|
1131 |
+
return None
|
1132 |
+
|
1133 |
+
def _generate_cultural_elements(self, cultural_context: str) -> str:
|
1134 |
+
"""
|
1135 |
+
Generate description of cultural elements for the detected context.
|
1136 |
+
|
1137 |
+
Args:
|
1138 |
+
cultural_context: Detected cultural context
|
1139 |
+
|
1140 |
+
Returns:
|
1141 |
+
str: Description of cultural elements
|
1142 |
+
"""
|
1143 |
+
# Get template for this cultural context
|
1144 |
+
cultural_templates = self.templates.get("cultural_templates", {})
|
1145 |
+
|
1146 |
+
if cultural_context in cultural_templates:
|
1147 |
+
template = cultural_templates[cultural_context]
|
1148 |
+
elements = template.get("elements", [])
|
1149 |
+
|
1150 |
+
if elements:
|
1151 |
+
# Select 1-2 random elements
|
1152 |
+
num_elements = min(len(elements), random.randint(1, 2))
|
1153 |
+
selected_elements = random.sample(elements, num_elements)
|
1154 |
+
|
1155 |
+
# Format elements list
|
1156 |
+
elements_text = " and ".join(selected_elements) if num_elements == 2 else selected_elements[0]
|
1157 |
+
|
1158 |
+
# Fill template
|
1159 |
+
return template.get("description", "").format(elements=elements_text)
|
1160 |
+
|
1161 |
+
return ""
|
1162 |
+
|
1163 |
+
def _optimize_object_description(self, description: str) -> str:
    """
    Optimize an object description by de-duplicating repeated item mentions.

    Item lists introduced by "with ..." are scanned; repeated items are
    collapsed into a counted form (e.g. "chair, chair" -> "2 chairs") and the
    rebuilt list is substituted back into the text.

    Args:
        description: the raw description text

    Returns:
        str: the cleaned-up description
    """
    import re

    # Collapse the redundant "bed in the room" phrasing
    if "bed in the room" in description:
        description = description.replace("a bed in the room", "a bed")

    # Find item lists introduced by "with ...", terminated by "." or "and".
    # NOTE(review): the non-greedy match stops at the first "and", so only
    # the leading part of an "x, y and z" list is captured — confirm intended.
    object_lists = re.findall(r'with ([^\.]+?)(?:\.|\band\b)', description)

    for obj_list in object_lists:
        # Count how many times each item appears in the captured list
        items = re.findall(r'([a-zA-Z\s]+)(?:,|\band\b|$)', obj_list)
        item_counts = {}

        for item in items:
            item = item.strip()
            if item and item not in ["and", "with"]:
                if item not in item_counts:
                    item_counts[item] = 0
                item_counts[item] += 1

        # Rebuild the list, turning repeats into "<count> <item>s"
        # (naive pluralization: appends "s" regardless of the noun)
        if item_counts:
            new_items = []
            for item, count in item_counts.items():
                if count > 1:
                    new_items.append(f"{count} {item}s")
                else:
                    new_items.append(item)

            # Re-join with "and" / Oxford-comma formatting
            if len(new_items) == 1:
                new_list = new_items[0]
            elif len(new_items) == 2:
                new_list = f"{new_items[0]} and {new_items[1]}"
            else:
                new_list = ", ".join(new_items[:-1]) + f", and {new_items[-1]}"

            # Substitute the rebuilt list back into the description.
            # NOTE(review): str.replace swaps every occurrence of this exact
            # substring anywhere in the text — verify no unintended hits.
            description = description.replace(obj_list, new_list)

    return description
|
1210 |
+
|
1211 |
+
def _describe_functional_zones(self, functional_zones: Dict) -> str:
|
1212 |
+
"""
|
1213 |
+
生成場景功能區域的描述,優化處理行人區域、人數統計和物品重複問題。
|
1214 |
+
|
1215 |
+
Args:
|
1216 |
+
functional_zones: 識別出的功能區域字典
|
1217 |
+
|
1218 |
+
Returns:
|
1219 |
+
str: 功能區域描述
|
1220 |
+
"""
|
1221 |
+
if not functional_zones:
|
1222 |
+
return ""
|
1223 |
+
|
1224 |
+
# 計算場景中的總人數
|
1225 |
+
total_people_count = 0
|
1226 |
+
people_by_zone = {}
|
1227 |
+
|
1228 |
+
# 計算每個區域的人數並累計總人數
|
1229 |
+
for zone_name, zone_info in functional_zones.items():
|
1230 |
+
if "objects" in zone_info:
|
1231 |
+
zone_people_count = zone_info["objects"].count("person")
|
1232 |
+
people_by_zone[zone_name] = zone_people_count
|
1233 |
+
total_people_count += zone_people_count
|
1234 |
+
|
1235 |
+
# 分類區域為行人區域和其他區域
|
1236 |
+
pedestrian_zones = []
|
1237 |
+
other_zones = []
|
1238 |
+
|
1239 |
+
for zone_name, zone_info in functional_zones.items():
|
1240 |
+
# 檢查是否是行人相關區域
|
1241 |
+
if any(keyword in zone_name.lower() for keyword in ["pedestrian", "crossing", "people"]):
|
1242 |
+
pedestrian_zones.append((zone_name, zone_info))
|
1243 |
+
else:
|
1244 |
+
other_zones.append((zone_name, zone_info))
|
1245 |
+
|
1246 |
+
# 獲取最重要的行人區域和其他區域
|
1247 |
+
main_pedestrian_zones = sorted(pedestrian_zones,
|
1248 |
+
key=lambda z: people_by_zone.get(z[0], 0),
|
1249 |
+
reverse=True)[:1] # 最多1個主要行人區域
|
1250 |
+
|
1251 |
+
top_other_zones = sorted(other_zones,
|
1252 |
+
key=lambda z: len(z[1].get("objects", [])),
|
1253 |
+
reverse=True)[:2] # 最多2個其他區域
|
1254 |
+
|
1255 |
+
# 合併區域
|
1256 |
+
top_zones = main_pedestrian_zones + top_other_zones
|
1257 |
+
|
1258 |
+
if not top_zones:
|
1259 |
+
return ""
|
1260 |
+
|
1261 |
+
# 生成匯總描述
|
1262 |
+
summary = ""
|
1263 |
+
max_mentioned_people = 0 # 跟踪已經提到的最大人數
|
1264 |
+
|
1265 |
+
# 如果總人數顯著且還沒在主描述中提到,添加總人數描述
|
1266 |
+
if total_people_count > 5:
|
1267 |
+
summary = f"The scene contains a significant number of pedestrians ({total_people_count} people). "
|
1268 |
+
max_mentioned_people = total_people_count # 更新已提到的最大人數
|
1269 |
+
|
1270 |
+
# 處理每個區域的描述,確保人數信息的一致性
|
1271 |
+
processed_zones = []
|
1272 |
+
|
1273 |
+
for zone_name, zone_info in top_zones:
|
1274 |
+
zone_desc = zone_info.get("description", "a functional zone")
|
1275 |
+
zone_people_count = people_by_zone.get(zone_name, 0)
|
1276 |
+
|
1277 |
+
# 檢查描述中是否包含人數信息
|
1278 |
+
contains_people_info = "with" in zone_desc and ("person" in zone_desc.lower() or "people" in zone_desc.lower())
|
1279 |
+
|
1280 |
+
# 如果描述包含人數信息,且人數較小(小於已提到的最大人數),則修改描述
|
1281 |
+
if contains_people_info and zone_people_count < max_mentioned_people:
|
1282 |
+
parts = zone_desc.split("with")
|
1283 |
+
if len(parts) > 1:
|
1284 |
+
# 移除人數部分
|
1285 |
+
zone_desc = parts[0].strip() + " area"
|
1286 |
+
|
1287 |
+
processed_zones.append((zone_name, {"description": zone_desc}))
|
1288 |
+
|
1289 |
+
# 根據處理後的區域數量生成最終描述
|
1290 |
+
final_desc = ""
|
1291 |
+
|
1292 |
+
if len(processed_zones) == 1:
|
1293 |
+
_, zone_info = processed_zones[0]
|
1294 |
+
zone_desc = zone_info["description"]
|
1295 |
+
final_desc = summary + f"The scene includes {zone_desc}."
|
1296 |
+
elif len(processed_zones) == 2:
|
1297 |
+
_, zone1_info = processed_zones[0]
|
1298 |
+
_, zone2_info = processed_zones[1]
|
1299 |
+
zone1_desc = zone1_info["description"]
|
1300 |
+
zone2_desc = zone2_info["description"]
|
1301 |
+
final_desc = summary + f"The scene is divided into two main areas: {zone1_desc} and {zone2_desc}."
|
1302 |
+
else:
|
1303 |
+
zones_desc = ["The scene contains multiple functional areas including"]
|
1304 |
+
zone_descriptions = [z[1]["description"] for z in processed_zones]
|
1305 |
+
|
1306 |
+
# 格式化最終的多區域描述
|
1307 |
+
if len(zone_descriptions) == 3:
|
1308 |
+
formatted_desc = f"{zone_descriptions[0]}, {zone_descriptions[1]}, and {zone_descriptions[2]}"
|
1309 |
+
else:
|
1310 |
+
formatted_desc = ", ".join(zone_descriptions[:-1]) + f", and {zone_descriptions[-1]}"
|
1311 |
+
|
1312 |
+
final_desc = summary + f"{zones_desc[0]} {formatted_desc}."
|
1313 |
+
|
1314 |
+
return self._optimize_object_description(final_desc)
|
image_processor.py
CHANGED
@@ -11,64 +11,125 @@ from detection_model import DetectionModel
|
|
11 |
from color_mapper import ColorMapper
|
12 |
from visualization_helper import VisualizationHelper
|
13 |
from evaluation_metrics import EvaluationMetrics
|
|
|
|
|
14 |
|
15 |
class ImageProcessor:
|
16 |
"""
|
17 |
Class for handling image processing and object detection operations
|
18 |
Separates processing logic from UI components
|
19 |
"""
|
20 |
-
|
21 |
def __init__(self):
|
22 |
"""Initialize the image processor with required components"""
|
23 |
self.color_mapper = ColorMapper()
|
24 |
self.model_instances = {}
|
25 |
-
|
26 |
-
|
|
|
27 |
"""
|
28 |
Get or create a model instance based on model name
|
29 |
-
|
30 |
Args:
|
31 |
model_name: Name of the model to use
|
32 |
confidence: Confidence threshold for detection
|
33 |
iou: IoU threshold for non-maximum suppression
|
34 |
-
|
35 |
Returns:
|
36 |
DetectionModel instance
|
37 |
"""
|
38 |
if model_name not in self.model_instances:
|
39 |
print(f"Creating new model instance for {model_name}")
|
40 |
self.model_instances[model_name] = DetectionModel(
|
41 |
-
model_name=model_name,
|
42 |
-
confidence=confidence,
|
43 |
iou=iou
|
44 |
)
|
45 |
else:
|
46 |
print(f"Using existing model instance for {model_name}")
|
47 |
self.model_instances[model_name].confidence = confidence
|
48 |
-
|
49 |
return self.model_instances[model_name]
|
50 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
51 |
def process_image(self, image, model_name: str, confidence_threshold: float, filter_classes: Optional[List[int]] = None) -> Tuple[Any, str, Dict]:
|
52 |
"""
|
53 |
Process an image for object detection
|
54 |
-
|
55 |
Args:
|
56 |
image: Input image (numpy array or PIL Image)
|
57 |
model_name: Name of the model to use
|
58 |
confidence_threshold: Confidence threshold for detection
|
59 |
filter_classes: Optional list of classes to filter results
|
60 |
-
|
61 |
Returns:
|
62 |
Tuple of (result_image, result_text, stats_data)
|
63 |
"""
|
64 |
# Get model instance
|
65 |
model_instance = self.get_model_instance(model_name, confidence_threshold)
|
66 |
-
|
67 |
# Initialize key variables
|
68 |
result = None
|
69 |
stats = {}
|
70 |
temp_path = None
|
71 |
-
|
72 |
try:
|
73 |
# Processing input image
|
74 |
if isinstance(image, np.ndarray):
|
@@ -82,44 +143,51 @@ class ImageProcessor:
|
|
82 |
return None, "No image provided. Please upload an image.", {}
|
83 |
else:
|
84 |
pil_image = image
|
85 |
-
|
|
|
|
|
|
|
86 |
# Store temp files
|
87 |
temp_dir = tempfile.gettempdir() # Use system temp directory
|
88 |
temp_filename = f"temp_{uuid.uuid4().hex}.jpg"
|
89 |
temp_path = os.path.join(temp_dir, temp_filename)
|
90 |
pil_image.save(temp_path)
|
91 |
-
|
92 |
# Object detection
|
93 |
result = model_instance.detect(temp_path)
|
94 |
-
|
95 |
if result is None:
|
96 |
return None, "Detection failed. Please try again with a different image.", {}
|
97 |
-
|
98 |
# Calculate stats
|
99 |
stats = EvaluationMetrics.calculate_basic_stats(result)
|
100 |
-
|
101 |
# Add space calculation
|
102 |
spatial_metrics = EvaluationMetrics.calculate_distance_metrics(result)
|
103 |
stats["spatial_metrics"] = spatial_metrics
|
104 |
-
|
|
|
|
|
|
|
105 |
# Apply filter if specified
|
106 |
if filter_classes and len(filter_classes) > 0:
|
107 |
# Get classes, boxes, confidence
|
108 |
classes = result.boxes.cls.cpu().numpy().astype(int)
|
109 |
confs = result.boxes.conf.cpu().numpy()
|
110 |
boxes = result.boxes.xyxy.cpu().numpy()
|
111 |
-
|
112 |
mask = np.zeros_like(classes, dtype=bool)
|
113 |
for cls_id in filter_classes:
|
114 |
mask = np.logical_or(mask, classes == cls_id)
|
115 |
-
|
116 |
filtered_stats = {
|
117 |
"total_objects": int(np.sum(mask)),
|
118 |
"class_statistics": {},
|
119 |
"average_confidence": float(np.mean(confs[mask])) if np.any(mask) else 0,
|
120 |
-
"spatial_metrics": stats["spatial_metrics"]
|
|
|
121 |
}
|
122 |
-
|
123 |
# Update stats
|
124 |
names = result.names
|
125 |
for cls, conf in zip(classes[mask], confs[mask]):
|
@@ -129,59 +197,67 @@ class ImageProcessor:
|
|
129 |
"count": 0,
|
130 |
"average_confidence": 0
|
131 |
}
|
132 |
-
|
133 |
filtered_stats["class_statistics"][cls_name]["count"] += 1
|
134 |
filtered_stats["class_statistics"][cls_name]["average_confidence"] = conf
|
135 |
-
|
136 |
stats = filtered_stats
|
137 |
-
|
138 |
viz_data = EvaluationMetrics.generate_visualization_data(
|
139 |
result,
|
140 |
self.color_mapper.get_all_colors()
|
141 |
)
|
142 |
-
|
143 |
result_image = VisualizationHelper.visualize_detection(
|
144 |
temp_path, result, color_mapper=self.color_mapper, figsize=(12, 12), return_pil=True, filter_classes=filter_classes
|
145 |
)
|
146 |
-
|
147 |
result_text = EvaluationMetrics.format_detection_summary(viz_data)
|
148 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
149 |
return result_image, result_text, stats
|
150 |
-
|
151 |
except Exception as e:
|
152 |
error_message = f"Error Occurs: {str(e)}"
|
153 |
import traceback
|
154 |
traceback.print_exc()
|
155 |
print(error_message)
|
156 |
return None, error_message, {}
|
157 |
-
|
158 |
finally:
|
159 |
if temp_path and os.path.exists(temp_path):
|
160 |
try:
|
161 |
os.remove(temp_path)
|
162 |
except Exception as e:
|
163 |
print(f"Cannot delete temp files {temp_path}: {str(e)}")
|
164 |
-
|
|
|
165 |
def format_result_text(self, stats: Dict) -> str:
|
166 |
"""
|
167 |
Format detection statistics into readable text with improved spacing
|
168 |
-
|
169 |
Args:
|
170 |
stats: Dictionary containing detection statistics
|
171 |
-
|
172 |
Returns:
|
173 |
Formatted text summary
|
174 |
"""
|
175 |
if not stats or "total_objects" not in stats:
|
176 |
return "No objects detected."
|
177 |
-
|
178 |
# 減少不必要的空行
|
179 |
lines = [
|
180 |
f"Detected {stats['total_objects']} objects.",
|
181 |
f"Average confidence: {stats.get('average_confidence', 0):.2f}",
|
182 |
"Objects by class:"
|
183 |
]
|
184 |
-
|
185 |
if "class_statistics" in stats and stats["class_statistics"]:
|
186 |
# 按計數排序類別
|
187 |
sorted_classes = sorted(
|
@@ -189,24 +265,24 @@ class ImageProcessor:
|
|
189 |
key=lambda x: x[1]["count"],
|
190 |
reverse=True
|
191 |
)
|
192 |
-
|
193 |
for cls_name, cls_stats in sorted_classes:
|
194 |
count = cls_stats["count"]
|
195 |
conf = cls_stats.get("average_confidence", 0)
|
196 |
-
|
197 |
item_text = "item" if count == 1 else "items"
|
198 |
lines.append(f"• {cls_name}: {count} {item_text} (avg conf: {conf:.2f})")
|
199 |
else:
|
200 |
lines.append("No class information available.")
|
201 |
-
|
202 |
# 添加空間信息
|
203 |
if "spatial_metrics" in stats and "spatial_distribution" in stats["spatial_metrics"]:
|
204 |
lines.append("Object Distribution:")
|
205 |
-
|
206 |
dist = stats["spatial_metrics"]["spatial_distribution"]
|
207 |
x_mean = dist.get("x_mean", 0)
|
208 |
y_mean = dist.get("y_mean", 0)
|
209 |
-
|
210 |
# 描述物體的大致位置
|
211 |
if x_mean < 0.33:
|
212 |
h_pos = "on the left side"
|
@@ -214,37 +290,37 @@ class ImageProcessor:
|
|
214 |
h_pos = "in the center"
|
215 |
else:
|
216 |
h_pos = "on the right side"
|
217 |
-
|
218 |
if y_mean < 0.33:
|
219 |
v_pos = "in the upper part"
|
220 |
elif y_mean < 0.67:
|
221 |
v_pos = "in the middle"
|
222 |
else:
|
223 |
v_pos = "in the lower part"
|
224 |
-
|
225 |
lines.append(f"• Most objects appear {h_pos} {v_pos} of the image")
|
226 |
-
|
227 |
return "\n".join(lines)
|
228 |
-
|
229 |
def format_json_for_display(self, stats: Dict) -> Dict:
|
230 |
"""
|
231 |
Format statistics JSON for better display
|
232 |
-
|
233 |
Args:
|
234 |
stats: Raw statistics dictionary
|
235 |
-
|
236 |
Returns:
|
237 |
Formatted statistics structure for display
|
238 |
"""
|
239 |
# Create a cleaner copy of the stats for display
|
240 |
display_stats = {}
|
241 |
-
|
242 |
# Add summary section
|
243 |
display_stats["summary"] = {
|
244 |
"total_objects": stats.get("total_objects", 0),
|
245 |
"average_confidence": round(stats.get("average_confidence", 0), 3)
|
246 |
}
|
247 |
-
|
248 |
# Add class statistics in a more organized way
|
249 |
if "class_statistics" in stats and stats["class_statistics"]:
|
250 |
# Sort classes by count (descending)
|
@@ -253,20 +329,20 @@ class ImageProcessor:
|
|
253 |
key=lambda x: x[1].get("count", 0),
|
254 |
reverse=True
|
255 |
)
|
256 |
-
|
257 |
class_stats = {}
|
258 |
for cls_name, cls_data in sorted_classes:
|
259 |
class_stats[cls_name] = {
|
260 |
"count": cls_data.get("count", 0),
|
261 |
"average_confidence": round(cls_data.get("average_confidence", 0), 3)
|
262 |
}
|
263 |
-
|
264 |
display_stats["detected_objects"] = class_stats
|
265 |
-
|
266 |
# Simplify spatial metrics
|
267 |
if "spatial_metrics" in stats:
|
268 |
spatial = stats["spatial_metrics"]
|
269 |
-
|
270 |
# Simplify spatial distribution
|
271 |
if "spatial_distribution" in spatial:
|
272 |
dist = spatial["spatial_distribution"]
|
@@ -278,7 +354,7 @@ class ImageProcessor:
|
|
278 |
"y_std": round(dist.get("y_std", 0), 3)
|
279 |
}
|
280 |
}
|
281 |
-
|
282 |
# Add simplified size information
|
283 |
if "size_distribution" in spatial:
|
284 |
size = spatial["size_distribution"]
|
@@ -287,30 +363,30 @@ class ImageProcessor:
|
|
287 |
"min_area": round(size.get("min_area", 0), 3),
|
288 |
"max_area": round(size.get("max_area", 0), 3)
|
289 |
}
|
290 |
-
|
291 |
return display_stats
|
292 |
-
|
293 |
def prepare_visualization_data(self, stats: Dict, available_classes: Dict[int, str]) -> Dict:
|
294 |
"""
|
295 |
Prepare data for visualization based on detection statistics
|
296 |
-
|
297 |
Args:
|
298 |
stats: Detection statistics
|
299 |
available_classes: Dictionary of available class IDs and names
|
300 |
-
|
301 |
Returns:
|
302 |
Visualization data dictionary
|
303 |
"""
|
304 |
if not stats or "class_statistics" not in stats or not stats["class_statistics"]:
|
305 |
return {"error": "No detection data available"}
|
306 |
-
|
307 |
# Prepare visualization data
|
308 |
viz_data = {
|
309 |
"total_objects": stats.get("total_objects", 0),
|
310 |
"average_confidence": stats.get("average_confidence", 0),
|
311 |
"class_data": []
|
312 |
}
|
313 |
-
|
314 |
# Class data
|
315 |
for cls_name, cls_stats in stats.get("class_statistics", {}).items():
|
316 |
# Search class ID
|
@@ -319,7 +395,7 @@ class ImageProcessor:
|
|
319 |
if name == cls_name:
|
320 |
class_id = id
|
321 |
break
|
322 |
-
|
323 |
cls_data = {
|
324 |
"name": cls_name,
|
325 |
"class_id": class_id,
|
@@ -327,10 +403,10 @@ class ImageProcessor:
|
|
327 |
"average_confidence": cls_stats.get("average_confidence", 0),
|
328 |
"color": self.color_mapper.get_color(class_id if class_id >= 0 else cls_name)
|
329 |
}
|
330 |
-
|
331 |
viz_data["class_data"].append(cls_data)
|
332 |
-
|
333 |
# Descending order
|
334 |
viz_data["class_data"].sort(key=lambda x: x["count"], reverse=True)
|
335 |
-
|
336 |
return viz_data
|
|
|
11 |
from color_mapper import ColorMapper
|
12 |
from visualization_helper import VisualizationHelper
|
13 |
from evaluation_metrics import EvaluationMetrics
|
14 |
+
from lighting_analyzer import LightingAnalyzer
|
15 |
+
from scene_analyzer import SceneAnalyzer
|
16 |
|
17 |
class ImageProcessor:
|
18 |
"""
|
19 |
Class for handling image processing and object detection operations
|
20 |
Separates processing logic from UI components
|
21 |
"""
|
22 |
+
|
23 |
def __init__(self):
|
24 |
"""Initialize the image processor with required components"""
|
25 |
self.color_mapper = ColorMapper()
|
26 |
self.model_instances = {}
|
27 |
+
self.lighting_analyzer = LightingAnalyzer()
|
28 |
+
|
29 |
+
def get_model_instance(self, model_name: str, confidence: float = 0.25, iou: float = 0.25) -> DetectionModel:
|
30 |
"""
|
31 |
Get or create a model instance based on model name
|
32 |
+
|
33 |
Args:
|
34 |
model_name: Name of the model to use
|
35 |
confidence: Confidence threshold for detection
|
36 |
iou: IoU threshold for non-maximum suppression
|
37 |
+
|
38 |
Returns:
|
39 |
DetectionModel instance
|
40 |
"""
|
41 |
if model_name not in self.model_instances:
|
42 |
print(f"Creating new model instance for {model_name}")
|
43 |
self.model_instances[model_name] = DetectionModel(
|
44 |
+
model_name=model_name,
|
45 |
+
confidence=confidence,
|
46 |
iou=iou
|
47 |
)
|
48 |
else:
|
49 |
print(f"Using existing model instance for {model_name}")
|
50 |
self.model_instances[model_name].confidence = confidence
|
51 |
+
|
52 |
return self.model_instances[model_name]
|
53 |
+
|
54 |
+
def analyze_scene(self, detection_result: Any, lighting_info: Optional[Dict] = None) -> Dict:
|
55 |
+
"""
|
56 |
+
Perform scene analysis on detection results
|
57 |
+
|
58 |
+
Args:
|
59 |
+
detection_result: Object detection result from YOLOv8
|
60 |
+
lighting_info: Lighting condition analysis results (optional)
|
61 |
+
|
62 |
+
Returns:
|
63 |
+
Dictionary containing scene analysis results
|
64 |
+
"""
|
65 |
+
try:
|
66 |
+
# Initialize scene analyzer if not already done
|
67 |
+
if not hasattr(self, 'scene_analyzer'):
|
68 |
+
self.scene_analyzer = SceneAnalyzer(class_names=detection_result.names)
|
69 |
+
|
70 |
+
# 確保類名正確更新
|
71 |
+
if self.scene_analyzer.class_names is None:
|
72 |
+
self.scene_analyzer.class_names = detection_result.names
|
73 |
+
self.scene_analyzer.spatial_analyzer.class_names = detection_result.names
|
74 |
+
|
75 |
+
# Perform scene analysis with lighting info
|
76 |
+
scene_analysis = self.scene_analyzer.analyze(
|
77 |
+
detection_result=detection_result,
|
78 |
+
lighting_info=lighting_info,
|
79 |
+
class_confidence_threshold=0.35,
|
80 |
+
scene_confidence_threshold=0.6
|
81 |
+
)
|
82 |
+
|
83 |
+
return scene_analysis
|
84 |
+
except Exception as e:
|
85 |
+
print(f"Error in scene analysis: {str(e)}")
|
86 |
+
import traceback
|
87 |
+
traceback.print_exc()
|
88 |
+
return {
|
89 |
+
"scene_type": "unknown",
|
90 |
+
"confidence": 0.0,
|
91 |
+
"description": f"Error during scene analysis: {str(e)}",
|
92 |
+
"objects_present": [],
|
93 |
+
"object_count": 0,
|
94 |
+
"regions": {},
|
95 |
+
"possible_activities": [],
|
96 |
+
"safety_concerns": [],
|
97 |
+
"lighting_conditions": lighting_info or {"time_of_day": "unknown", "confidence": 0.0}
|
98 |
+
}
|
99 |
+
|
100 |
+
def analyze_lighting_conditions(self, image):
|
101 |
+
"""
|
102 |
+
分析光照條件。
|
103 |
+
|
104 |
+
Args:
|
105 |
+
image: 輸入圖像
|
106 |
+
|
107 |
+
Returns:
|
108 |
+
Dict: 光照分析結果
|
109 |
+
"""
|
110 |
+
return self.lighting_analyzer.analyze(image)
|
111 |
+
|
112 |
def process_image(self, image, model_name: str, confidence_threshold: float, filter_classes: Optional[List[int]] = None) -> Tuple[Any, str, Dict]:
|
113 |
"""
|
114 |
Process an image for object detection
|
115 |
+
|
116 |
Args:
|
117 |
image: Input image (numpy array or PIL Image)
|
118 |
model_name: Name of the model to use
|
119 |
confidence_threshold: Confidence threshold for detection
|
120 |
filter_classes: Optional list of classes to filter results
|
121 |
+
|
122 |
Returns:
|
123 |
Tuple of (result_image, result_text, stats_data)
|
124 |
"""
|
125 |
# Get model instance
|
126 |
model_instance = self.get_model_instance(model_name, confidence_threshold)
|
127 |
+
|
128 |
# Initialize key variables
|
129 |
result = None
|
130 |
stats = {}
|
131 |
temp_path = None
|
132 |
+
|
133 |
try:
|
134 |
# Processing input image
|
135 |
if isinstance(image, np.ndarray):
|
|
|
143 |
return None, "No image provided. Please upload an image.", {}
|
144 |
else:
|
145 |
pil_image = image
|
146 |
+
|
147 |
+
# Analyze lighting conditions
|
148 |
+
lighting_info = self.analyze_lighting_conditions(pil_image)
|
149 |
+
|
150 |
# Store temp files
|
151 |
temp_dir = tempfile.gettempdir() # Use system temp directory
|
152 |
temp_filename = f"temp_{uuid.uuid4().hex}.jpg"
|
153 |
temp_path = os.path.join(temp_dir, temp_filename)
|
154 |
pil_image.save(temp_path)
|
155 |
+
|
156 |
# Object detection
|
157 |
result = model_instance.detect(temp_path)
|
158 |
+
|
159 |
if result is None:
|
160 |
return None, "Detection failed. Please try again with a different image.", {}
|
161 |
+
|
162 |
# Calculate stats
|
163 |
stats = EvaluationMetrics.calculate_basic_stats(result)
|
164 |
+
|
165 |
# Add space calculation
|
166 |
spatial_metrics = EvaluationMetrics.calculate_distance_metrics(result)
|
167 |
stats["spatial_metrics"] = spatial_metrics
|
168 |
+
|
169 |
+
# Add lighting information
|
170 |
+
stats["lighting_conditions"] = lighting_info
|
171 |
+
|
172 |
# Apply filter if specified
|
173 |
if filter_classes and len(filter_classes) > 0:
|
174 |
# Get classes, boxes, confidence
|
175 |
classes = result.boxes.cls.cpu().numpy().astype(int)
|
176 |
confs = result.boxes.conf.cpu().numpy()
|
177 |
boxes = result.boxes.xyxy.cpu().numpy()
|
178 |
+
|
179 |
mask = np.zeros_like(classes, dtype=bool)
|
180 |
for cls_id in filter_classes:
|
181 |
mask = np.logical_or(mask, classes == cls_id)
|
182 |
+
|
183 |
filtered_stats = {
|
184 |
"total_objects": int(np.sum(mask)),
|
185 |
"class_statistics": {},
|
186 |
"average_confidence": float(np.mean(confs[mask])) if np.any(mask) else 0,
|
187 |
+
"spatial_metrics": stats["spatial_metrics"],
|
188 |
+
"lighting_conditions": lighting_info
|
189 |
}
|
190 |
+
|
191 |
# Update stats
|
192 |
names = result.names
|
193 |
for cls, conf in zip(classes[mask], confs[mask]):
|
|
|
197 |
"count": 0,
|
198 |
"average_confidence": 0
|
199 |
}
|
200 |
+
|
201 |
filtered_stats["class_statistics"][cls_name]["count"] += 1
|
202 |
filtered_stats["class_statistics"][cls_name]["average_confidence"] = conf
|
203 |
+
|
204 |
stats = filtered_stats
|
205 |
+
|
206 |
viz_data = EvaluationMetrics.generate_visualization_data(
|
207 |
result,
|
208 |
self.color_mapper.get_all_colors()
|
209 |
)
|
210 |
+
|
211 |
result_image = VisualizationHelper.visualize_detection(
|
212 |
temp_path, result, color_mapper=self.color_mapper, figsize=(12, 12), return_pil=True, filter_classes=filter_classes
|
213 |
)
|
214 |
+
|
215 |
result_text = EvaluationMetrics.format_detection_summary(viz_data)
|
216 |
+
|
217 |
+
if result is not None:
|
218 |
+
# Perform scene analysis with lighting info
|
219 |
+
scene_analysis = self.analyze_scene(result, lighting_info)
|
220 |
+
|
221 |
+
# Add scene analysis to stats
|
222 |
+
stats["scene_analysis"] = scene_analysis
|
223 |
+
|
224 |
return result_image, result_text, stats
|
225 |
+
|
226 |
except Exception as e:
|
227 |
error_message = f"Error Occurs: {str(e)}"
|
228 |
import traceback
|
229 |
traceback.print_exc()
|
230 |
print(error_message)
|
231 |
return None, error_message, {}
|
232 |
+
|
233 |
finally:
|
234 |
if temp_path and os.path.exists(temp_path):
|
235 |
try:
|
236 |
os.remove(temp_path)
|
237 |
except Exception as e:
|
238 |
print(f"Cannot delete temp files {temp_path}: {str(e)}")
|
239 |
+
|
240 |
+
|
241 |
def format_result_text(self, stats: Dict) -> str:
|
242 |
"""
|
243 |
Format detection statistics into readable text with improved spacing
|
244 |
+
|
245 |
Args:
|
246 |
stats: Dictionary containing detection statistics
|
247 |
+
|
248 |
Returns:
|
249 |
Formatted text summary
|
250 |
"""
|
251 |
if not stats or "total_objects" not in stats:
|
252 |
return "No objects detected."
|
253 |
+
|
254 |
# 減少不必要的空行
|
255 |
lines = [
|
256 |
f"Detected {stats['total_objects']} objects.",
|
257 |
f"Average confidence: {stats.get('average_confidence', 0):.2f}",
|
258 |
"Objects by class:"
|
259 |
]
|
260 |
+
|
261 |
if "class_statistics" in stats and stats["class_statistics"]:
|
262 |
# 按計數排序類別
|
263 |
sorted_classes = sorted(
|
|
|
265 |
key=lambda x: x[1]["count"],
|
266 |
reverse=True
|
267 |
)
|
268 |
+
|
269 |
for cls_name, cls_stats in sorted_classes:
|
270 |
count = cls_stats["count"]
|
271 |
conf = cls_stats.get("average_confidence", 0)
|
272 |
+
|
273 |
item_text = "item" if count == 1 else "items"
|
274 |
lines.append(f"• {cls_name}: {count} {item_text} (avg conf: {conf:.2f})")
|
275 |
else:
|
276 |
lines.append("No class information available.")
|
277 |
+
|
278 |
# 添加空間信息
|
279 |
if "spatial_metrics" in stats and "spatial_distribution" in stats["spatial_metrics"]:
|
280 |
lines.append("Object Distribution:")
|
281 |
+
|
282 |
dist = stats["spatial_metrics"]["spatial_distribution"]
|
283 |
x_mean = dist.get("x_mean", 0)
|
284 |
y_mean = dist.get("y_mean", 0)
|
285 |
+
|
286 |
# 描述物體的大致位置
|
287 |
if x_mean < 0.33:
|
288 |
h_pos = "on the left side"
|
|
|
290 |
h_pos = "in the center"
|
291 |
else:
|
292 |
h_pos = "on the right side"
|
293 |
+
|
294 |
if y_mean < 0.33:
|
295 |
v_pos = "in the upper part"
|
296 |
elif y_mean < 0.67:
|
297 |
v_pos = "in the middle"
|
298 |
else:
|
299 |
v_pos = "in the lower part"
|
300 |
+
|
301 |
lines.append(f"• Most objects appear {h_pos} {v_pos} of the image")
|
302 |
+
|
303 |
return "\n".join(lines)
|
304 |
+
|
305 |
def format_json_for_display(self, stats: Dict) -> Dict:
|
306 |
"""
|
307 |
Format statistics JSON for better display
|
308 |
+
|
309 |
Args:
|
310 |
stats: Raw statistics dictionary
|
311 |
+
|
312 |
Returns:
|
313 |
Formatted statistics structure for display
|
314 |
"""
|
315 |
# Create a cleaner copy of the stats for display
|
316 |
display_stats = {}
|
317 |
+
|
318 |
# Add summary section
|
319 |
display_stats["summary"] = {
|
320 |
"total_objects": stats.get("total_objects", 0),
|
321 |
"average_confidence": round(stats.get("average_confidence", 0), 3)
|
322 |
}
|
323 |
+
|
324 |
# Add class statistics in a more organized way
|
325 |
if "class_statistics" in stats and stats["class_statistics"]:
|
326 |
# Sort classes by count (descending)
|
|
|
329 |
key=lambda x: x[1].get("count", 0),
|
330 |
reverse=True
|
331 |
)
|
332 |
+
|
333 |
class_stats = {}
|
334 |
for cls_name, cls_data in sorted_classes:
|
335 |
class_stats[cls_name] = {
|
336 |
"count": cls_data.get("count", 0),
|
337 |
"average_confidence": round(cls_data.get("average_confidence", 0), 3)
|
338 |
}
|
339 |
+
|
340 |
display_stats["detected_objects"] = class_stats
|
341 |
+
|
342 |
# Simplify spatial metrics
|
343 |
if "spatial_metrics" in stats:
|
344 |
spatial = stats["spatial_metrics"]
|
345 |
+
|
346 |
# Simplify spatial distribution
|
347 |
if "spatial_distribution" in spatial:
|
348 |
dist = spatial["spatial_distribution"]
|
|
|
354 |
"y_std": round(dist.get("y_std", 0), 3)
|
355 |
}
|
356 |
}
|
357 |
+
|
358 |
# Add simplified size information
|
359 |
if "size_distribution" in spatial:
|
360 |
size = spatial["size_distribution"]
|
|
|
363 |
"min_area": round(size.get("min_area", 0), 3),
|
364 |
"max_area": round(size.get("max_area", 0), 3)
|
365 |
}
|
366 |
+
|
367 |
return display_stats
|
368 |
+
|
369 |
def prepare_visualization_data(self, stats: Dict, available_classes: Dict[int, str]) -> Dict:
|
370 |
"""
|
371 |
Prepare data for visualization based on detection statistics
|
372 |
+
|
373 |
Args:
|
374 |
stats: Detection statistics
|
375 |
available_classes: Dictionary of available class IDs and names
|
376 |
+
|
377 |
Returns:
|
378 |
Visualization data dictionary
|
379 |
"""
|
380 |
if not stats or "class_statistics" not in stats or not stats["class_statistics"]:
|
381 |
return {"error": "No detection data available"}
|
382 |
+
|
383 |
# Prepare visualization data
|
384 |
viz_data = {
|
385 |
"total_objects": stats.get("total_objects", 0),
|
386 |
"average_confidence": stats.get("average_confidence", 0),
|
387 |
"class_data": []
|
388 |
}
|
389 |
+
|
390 |
# Class data
|
391 |
for cls_name, cls_stats in stats.get("class_statistics", {}).items():
|
392 |
# Search class ID
|
|
|
395 |
if name == cls_name:
|
396 |
class_id = id
|
397 |
break
|
398 |
+
|
399 |
cls_data = {
|
400 |
"name": cls_name,
|
401 |
"class_id": class_id,
|
|
|
403 |
"average_confidence": cls_stats.get("average_confidence", 0),
|
404 |
"color": self.color_mapper.get_color(class_id if class_id >= 0 else cls_name)
|
405 |
}
|
406 |
+
|
407 |
viz_data["class_data"].append(cls_data)
|
408 |
+
|
409 |
# Descending order
|
410 |
viz_data["class_data"].sort(key=lambda x: x["count"], reverse=True)
|
411 |
+
|
412 |
return viz_data
|
lighting_analyzer.py
ADDED
@@ -0,0 +1,811 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import numpy as np
|
2 |
+
import cv2
|
3 |
+
from typing import Dict, Any, Optional
|
4 |
+
|
5 |
+
class LightingAnalyzer:
|
6 |
+
"""
|
7 |
+
分析圖像的光照條件,提供增強的室內or室外判斷和光照類型分類,並專注於光照分析。
|
8 |
+
"""
|
9 |
+
|
10 |
+
def __init__(self, config: Optional[Dict[str, Any]] = None):
|
11 |
+
"""
|
12 |
+
初始化光照分析器。
|
13 |
+
|
14 |
+
Args:
|
15 |
+
config: 可選的配置字典,用於自定義分析參數
|
16 |
+
"""
|
17 |
+
self.config = config or self._get_default_config()
|
18 |
+
|
19 |
+
def analyze(self, image):
|
20 |
+
"""
|
21 |
+
分析圖像的光照條件。
|
22 |
+
|
23 |
+
主要分析入口點,計算基本特徵,判斷室內/室外,確定光照條件。
|
24 |
+
|
25 |
+
Args:
|
26 |
+
image: 輸入圖像 (numpy array 或 PIL Image)
|
27 |
+
|
28 |
+
Returns:
|
29 |
+
Dict: 包含光照分析結果的字典
|
30 |
+
"""
|
31 |
+
try:
|
32 |
+
# 轉換圖像格式
|
33 |
+
if not isinstance(image, np.ndarray):
|
34 |
+
image_np = np.array(image)
|
35 |
+
else:
|
36 |
+
image_np = image.copy()
|
37 |
+
|
38 |
+
# 確保 RGB 格式
|
39 |
+
if image_np.shape[2] == 3 and isinstance(image_np, np.ndarray):
|
40 |
+
image_rgb = cv2.cvtColor(image_np, cv2.COLOR_BGR2RGB)
|
41 |
+
else:
|
42 |
+
image_rgb = image_np
|
43 |
+
|
44 |
+
# 計算基本特徵
|
45 |
+
features = self._compute_basic_features(image_rgb)
|
46 |
+
|
47 |
+
# 分析室內or室外
|
48 |
+
indoor_result = self._analyze_indoor_outdoor(features)
|
49 |
+
is_indoor = indoor_result["is_indoor"]
|
50 |
+
indoor_probability = indoor_result["indoor_probability"]
|
51 |
+
|
52 |
+
# 確定光照條件
|
53 |
+
lighting_conditions = self._determine_lighting_conditions(features, is_indoor)
|
54 |
+
|
55 |
+
# 整合結果
|
56 |
+
result = {
|
57 |
+
"time_of_day": lighting_conditions["time_of_day"],
|
58 |
+
"confidence": float(lighting_conditions["confidence"]),
|
59 |
+
"is_indoor": is_indoor,
|
60 |
+
"indoor_probability": float(indoor_probability),
|
61 |
+
"brightness": {
|
62 |
+
"average": float(features["avg_brightness"]),
|
63 |
+
"std_dev": float(features["brightness_std"]),
|
64 |
+
"dark_ratio": float(features["dark_pixel_ratio"])
|
65 |
+
},
|
66 |
+
"color_info": {
|
67 |
+
"blue_ratio": float(features["blue_ratio"]),
|
68 |
+
"yellow_orange_ratio": float(features["yellow_orange_ratio"]),
|
69 |
+
"gray_ratio": float(features["gray_ratio"]),
|
70 |
+
"avg_saturation": float(features["avg_saturation"]),
|
71 |
+
"sky_brightness": float(features["sky_brightness"]),
|
72 |
+
"color_atmosphere": features["color_atmosphere"],
|
73 |
+
"warm_ratio": float(features["warm_ratio"]),
|
74 |
+
"cool_ratio": float(features["cool_ratio"])
|
75 |
+
}
|
76 |
+
}
|
77 |
+
|
78 |
+
# 添加診斷信息
|
79 |
+
if self.config["include_diagnostics"]:
|
80 |
+
result["diagnostics"] = {
|
81 |
+
"feature_contributions": indoor_result.get("feature_contributions", {}),
|
82 |
+
"lighting_diagnostics": lighting_conditions.get("diagnostics", {})
|
83 |
+
}
|
84 |
+
|
85 |
+
return result
|
86 |
+
|
87 |
+
except Exception as e:
|
88 |
+
print(f"Error in lighting analysis: {str(e)}")
|
89 |
+
import traceback
|
90 |
+
traceback.print_exc()
|
91 |
+
return {
|
92 |
+
"time_of_day": "unknown",
|
93 |
+
"confidence": 0,
|
94 |
+
"error": str(e)
|
95 |
+
}
|
96 |
+
|
97 |
+
def _compute_basic_features(self, image_rgb):
|
98 |
+
"""
|
99 |
+
計算圖像的基本光照特徵(徹底優化版本)。
|
100 |
+
|
101 |
+
Args:
|
102 |
+
image_rgb: RGB 格式的圖像 (numpy array)
|
103 |
+
|
104 |
+
Returns:
|
105 |
+
Dict: 包含計算出的特徵值
|
106 |
+
"""
|
107 |
+
# 獲取圖像尺寸
|
108 |
+
height, width = image_rgb.shape[:2]
|
109 |
+
|
110 |
+
# 根據圖像大小自適應縮放因子
|
111 |
+
base_scale = 4
|
112 |
+
scale_factor = base_scale + min(8, max(0, int((height * width) / (1000 * 1000))))
|
113 |
+
|
114 |
+
# 創建縮小的圖像以加速處理
|
115 |
+
small_rgb = cv2.resize(image_rgb, (width//scale_factor, height//scale_factor))
|
116 |
+
|
117 |
+
# 一次性轉換所有顏色空間,避免重複計算
|
118 |
+
hsv_img = cv2.cvtColor(image_rgb, cv2.COLOR_RGB2HSV)
|
119 |
+
gray_img = cv2.cvtColor(image_rgb, cv2.COLOR_RGB2GRAY)
|
120 |
+
small_gray = cv2.resize(gray_img, (width//scale_factor, height//scale_factor))
|
121 |
+
|
122 |
+
# 分離HSV通道
|
123 |
+
h_channel = hsv_img[:,:,0]
|
124 |
+
s_channel = hsv_img[:,:,1]
|
125 |
+
v_channel = hsv_img[:,:,2]
|
126 |
+
|
127 |
+
# 基本亮度特徵
|
128 |
+
avg_brightness = np.mean(v_channel)
|
129 |
+
brightness_std = np.std(v_channel)
|
130 |
+
dark_pixel_ratio = np.sum(v_channel < 50) / (height * width)
|
131 |
+
|
132 |
+
# 顏色特徵
|
133 |
+
yellow_orange_mask = ((h_channel >= 15) & (h_channel <= 40))
|
134 |
+
yellow_orange_ratio = np.sum(yellow_orange_mask) / (height * width)
|
135 |
+
|
136 |
+
blue_mask = ((h_channel >= 90) & (h_channel <= 130))
|
137 |
+
blue_ratio = np.sum(blue_mask) / (height * width)
|
138 |
+
|
139 |
+
# 特別檢查圖像上部區域,尋找藍天特徵
|
140 |
+
upper_region_h = h_channel[:height//4, :]
|
141 |
+
upper_region_s = s_channel[:height//4, :]
|
142 |
+
upper_region_v = v_channel[:height//4, :]
|
143 |
+
|
144 |
+
# 藍天通常具有高飽和度的藍色
|
145 |
+
sky_blue_mask = ((upper_region_h >= 90) & (upper_region_h <= 130) &
|
146 |
+
(upper_region_s > 70) & (upper_region_v > 150))
|
147 |
+
sky_blue_ratio = np.sum(sky_blue_mask) / max(1, upper_region_h.size)
|
148 |
+
|
149 |
+
gray_mask = (s_channel < 50) & (v_channel > 100)
|
150 |
+
gray_ratio = np.sum(gray_mask) / (height * width)
|
151 |
+
|
152 |
+
avg_saturation = np.mean(s_channel)
|
153 |
+
|
154 |
+
# 天空亮度
|
155 |
+
upper_half = v_channel[:height//2, :]
|
156 |
+
sky_brightness = np.mean(upper_half)
|
157 |
+
|
158 |
+
# 色調分析
|
159 |
+
warm_colors = ((h_channel >= 0) & (h_channel <= 60)) | (h_channel >= 300)
|
160 |
+
warm_ratio = np.sum(warm_colors) / (height * width)
|
161 |
+
|
162 |
+
cool_colors = (h_channel >= 180) & (h_channel <= 270)
|
163 |
+
cool_ratio = np.sum(cool_colors) / (height * width)
|
164 |
+
|
165 |
+
# 確定色彩氛圍
|
166 |
+
if warm_ratio > 0.4:
|
167 |
+
color_atmosphere = "warm"
|
168 |
+
elif cool_ratio > 0.4:
|
169 |
+
color_atmosphere = "cool"
|
170 |
+
else:
|
171 |
+
color_atmosphere = "neutral"
|
172 |
+
|
173 |
+
# 只在縮小的圖像上計算梯度,大幅提高效能
|
174 |
+
gx = cv2.Sobel(small_gray, cv2.CV_32F, 1, 0, ksize=3)
|
175 |
+
gy = cv2.Sobel(small_gray, cv2.CV_32F, 0, 1, ksize=3)
|
176 |
+
|
177 |
+
vertical_strength = np.mean(np.abs(gy))
|
178 |
+
horizontal_strength = np.mean(np.abs(gx))
|
179 |
+
gradient_ratio = vertical_strength / max(horizontal_strength, 1e-5)
|
180 |
+
|
181 |
+
# -- 亮度均勻性 --
|
182 |
+
brightness_uniformity = 1 - min(1, brightness_std / max(avg_brightness, 1e-5))
|
183 |
+
|
184 |
+
# -- 高效的天花板分析 --
|
185 |
+
# 使用更大的下採樣率分析頂部區域
|
186 |
+
top_scale = scale_factor * 2 # 更積極的下採樣
|
187 |
+
top_region = v_channel[:height//4:top_scale, ::top_scale]
|
188 |
+
top_region_std = np.std(top_region)
|
189 |
+
ceiling_uniformity = 1.0 - min(1.0, top_region_std / max(np.mean(top_region), 1e-5))
|
190 |
+
|
191 |
+
# 使用更簡單的方法檢測上部水平線
|
192 |
+
top_gradients = np.abs(gy[:small_gray.shape[0]//4, :])
|
193 |
+
horizontal_lines_strength = np.mean(top_gradients)
|
194 |
+
# 標準化
|
195 |
+
horizontal_line_ratio = min(1.0, horizontal_lines_strength / 40)
|
196 |
+
|
197 |
+
# 極簡的亮點檢測
|
198 |
+
sampled_v = v_channel[::scale_factor*2, ::scale_factor*2]
|
199 |
+
light_threshold = min(220, avg_brightness + 2*brightness_std)
|
200 |
+
is_bright = sampled_v > light_threshold
|
201 |
+
bright_spot_count = np.sum(is_bright)
|
202 |
+
|
203 |
+
# 圓形光源分析的簡化替代方法
|
204 |
+
circular_light_score = 0
|
205 |
+
indoor_light_score = 0
|
206 |
+
light_distribution_uniformity = 0.5
|
207 |
+
|
208 |
+
# 只有當檢測到亮點,且不是大量亮點時(可能是室外光反射)才進行光源分析
|
209 |
+
if 1 < bright_spot_count < 20:
|
210 |
+
# 簡單統計亮點分布
|
211 |
+
bright_y, bright_x = np.where(is_bright)
|
212 |
+
if len(bright_y) > 1:
|
213 |
+
# 檢查亮點是否成組出現 - 室內照明常見模式
|
214 |
+
mean_x = np.mean(bright_x)
|
215 |
+
mean_y = np.mean(bright_y)
|
216 |
+
dist_from_center = np.sqrt((bright_x - mean_x)**2 + (bright_y - mean_y)**2)
|
217 |
+
|
218 |
+
# 如果亮點分布較集中,可能是燈具
|
219 |
+
if np.std(dist_from_center) < np.mean(dist_from_center):
|
220 |
+
circular_light_score = min(3, len(bright_y) // 2)
|
221 |
+
light_distribution_uniformity = 0.7
|
222 |
+
|
223 |
+
# 評估亮點是否位於上部區域,常見於室內頂燈
|
224 |
+
if np.mean(bright_y) < sampled_v.shape[0] / 2:
|
225 |
+
indoor_light_score = 0.6
|
226 |
+
else:
|
227 |
+
indoor_light_score = 0.3
|
228 |
+
|
229 |
+
# 使用邊緣區域梯度來快速估計邊界
|
230 |
+
edge_scale = scale_factor * 2
|
231 |
+
|
232 |
+
# 只採樣圖像邊緣部分進行分析
|
233 |
+
left_edge = small_gray[:, :small_gray.shape[1]//6]
|
234 |
+
right_edge = small_gray[:, 5*small_gray.shape[1]//6:]
|
235 |
+
top_edge = small_gray[:small_gray.shape[0]//6, :]
|
236 |
+
|
237 |
+
# 計算每個邊緣區域的梯度強度
|
238 |
+
left_gradient = np.mean(np.abs(cv2.Sobel(left_edge, cv2.CV_32F, 1, 0, ksize=3)))
|
239 |
+
right_gradient = np.mean(np.abs(cv2.Sobel(right_edge, cv2.CV_32F, 1, 0, ksize=3)))
|
240 |
+
top_gradient = np.mean(np.abs(cv2.Sobel(top_edge, cv2.CV_32F, 0, 1, ksize=3)))
|
241 |
+
|
242 |
+
# 標準化
|
243 |
+
left_edge_density = min(1.0, left_gradient / 50.0)
|
244 |
+
right_edge_density = min(1.0, right_gradient / 50.0)
|
245 |
+
top_edge_density = min(1.0, top_gradient / 50.0)
|
246 |
+
|
247 |
+
# 封閉環境通常在圖像邊緣有較強的梯度
|
248 |
+
boundary_edge_score = (left_edge_density + right_edge_density + top_edge_density) / 3
|
249 |
+
|
250 |
+
# 簡單估計���體邊緣密度
|
251 |
+
edges_density = min(1.0, (np.mean(np.abs(gx)) + np.mean(np.abs(gy))) / 100.0)
|
252 |
+
|
253 |
+
street_line_score = 0
|
254 |
+
|
255 |
+
# 檢查下半部分是否有強烈的垂直線條
|
256 |
+
bottom_half = small_gray[small_gray.shape[0]//2:, :]
|
257 |
+
bottom_vert_gradient = cv2.Sobel(bottom_half, cv2.CV_32F, 0, 1, ksize=3)
|
258 |
+
strong_vert_lines = np.abs(bottom_vert_gradient) > 50
|
259 |
+
if np.sum(strong_vert_lines) > (bottom_half.size * 0.05): # 如果超過5%的像素是強垂直線
|
260 |
+
street_line_score = 0.7
|
261 |
+
|
262 |
+
# 整合所有特徵
|
263 |
+
features = {
|
264 |
+
# 基本亮度和顏色特徵
|
265 |
+
"avg_brightness": avg_brightness,
|
266 |
+
"brightness_std": brightness_std,
|
267 |
+
"dark_pixel_ratio": dark_pixel_ratio,
|
268 |
+
"yellow_orange_ratio": yellow_orange_ratio,
|
269 |
+
"blue_ratio": blue_ratio,
|
270 |
+
"sky_blue_ratio": sky_blue_ratio,
|
271 |
+
"gray_ratio": gray_ratio,
|
272 |
+
"avg_saturation": avg_saturation,
|
273 |
+
"sky_brightness": sky_brightness,
|
274 |
+
"color_atmosphere": color_atmosphere,
|
275 |
+
"warm_ratio": warm_ratio,
|
276 |
+
"cool_ratio": cool_ratio,
|
277 |
+
|
278 |
+
# 結構特徵
|
279 |
+
"gradient_ratio": gradient_ratio,
|
280 |
+
"brightness_uniformity": brightness_uniformity,
|
281 |
+
"bright_spot_count": bright_spot_count,
|
282 |
+
"vertical_strength": vertical_strength,
|
283 |
+
"horizontal_strength": horizontal_strength,
|
284 |
+
|
285 |
+
# 室內/室外判斷特徵
|
286 |
+
"ceiling_uniformity": ceiling_uniformity,
|
287 |
+
"horizontal_line_ratio": horizontal_line_ratio,
|
288 |
+
"indoor_light_score": indoor_light_score,
|
289 |
+
"circular_light_count": circular_light_score,
|
290 |
+
"light_distribution_uniformity": light_distribution_uniformity,
|
291 |
+
"boundary_edge_score": boundary_edge_score,
|
292 |
+
"top_region_std": top_region_std,
|
293 |
+
"edges_density": edges_density,
|
294 |
+
|
295 |
+
# 新增:室外特定特徵
|
296 |
+
"street_line_score": street_line_score
|
297 |
+
}
|
298 |
+
|
299 |
+
return features
|
300 |
+
|
301 |
+
def _analyze_indoor_outdoor(self, features):
|
302 |
+
"""
|
303 |
+
使用多特徵融合進行室內/室外判斷
|
304 |
+
|
305 |
+
Args:
|
306 |
+
features: 特徵字典
|
307 |
+
|
308 |
+
Returns:
|
309 |
+
Dict: 室內/室外判斷結果
|
310 |
+
"""
|
311 |
+
# 獲取配置中的特徵權重
|
312 |
+
weights = self.config["indoor_outdoor_weights"]
|
313 |
+
|
314 |
+
# 初始概率值 - 開始時中性評估
|
315 |
+
indoor_score = 0
|
316 |
+
feature_contributions = {}
|
317 |
+
diagnostics = {}
|
318 |
+
|
319 |
+
# 1. 藍色區域(天空)特徵 - 藍色區域多通常表示室外
|
320 |
+
if features.get("blue_ratio", 0) > 0.2:
|
321 |
+
# 檢查是否有室內指標,如果有明顯的室內特徵,則減少藍色的負面影響
|
322 |
+
if (features.get("ceiling_uniformity", 0) > 0.5 or
|
323 |
+
features.get("boundary_edge_score", 0) > 0.3 or
|
324 |
+
features.get("indoor_light_score", 0) > 0.2 or
|
325 |
+
features.get("bright_spot_count", 0) > 0):
|
326 |
+
blue_score = -weights["blue_ratio"] * features["blue_ratio"] * 8
|
327 |
+
else:
|
328 |
+
blue_score = -weights["blue_ratio"] * features["blue_ratio"] * 15
|
329 |
+
else:
|
330 |
+
blue_score = -weights["blue_ratio"] * features["blue_ratio"] * 15
|
331 |
+
|
332 |
+
indoor_score += blue_score
|
333 |
+
feature_contributions["blue_ratio"] = blue_score
|
334 |
+
|
335 |
+
# 判斷視角 - 如果上部有藍天而上下亮度差異大,可能是仰視室外建築
|
336 |
+
if (features.get("sky_blue_ratio", 0) > 0.01 and
|
337 |
+
features["sky_brightness"] > features["avg_brightness"] * 1.1):
|
338 |
+
viewpoint_outdoor_score = -1.8 # 強烈的室外指標
|
339 |
+
indoor_score += viewpoint_outdoor_score
|
340 |
+
feature_contributions["outdoor_viewpoint"] = viewpoint_outdoor_score
|
341 |
+
|
342 |
+
# 2. 亮度均勻性特徵 - 室內通常光照更均勻
|
343 |
+
uniformity_score = weights["brightness_uniformity"] * features["brightness_uniformity"]
|
344 |
+
indoor_score += uniformity_score
|
345 |
+
feature_contributions["brightness_uniformity"] = uniformity_score
|
346 |
+
|
347 |
+
# 3. 天花板特徵 - 強化天花板檢測的權重
|
348 |
+
ceiling_contribution = 0
|
349 |
+
if "ceiling_uniformity" in features:
|
350 |
+
ceiling_uniformity = features["ceiling_uniformity"]
|
351 |
+
horizontal_line_ratio = features.get("horizontal_line_ratio", 0)
|
352 |
+
|
353 |
+
# 增強天花板檢測的影響
|
354 |
+
if ceiling_uniformity > 0.5:
|
355 |
+
ceiling_weight = 3
|
356 |
+
ceiling_contribution = weights.get("ceiling_features", 1.5) * ceiling_weight
|
357 |
+
if horizontal_line_ratio > 0.2: # 如果有水平線條,進一步增強
|
358 |
+
ceiling_contribution *= 1.5
|
359 |
+
elif ceiling_uniformity > 0.4:
|
360 |
+
ceiling_contribution = weights.get("ceiling_features", 1.5) * 1.2
|
361 |
+
|
362 |
+
indoor_score += ceiling_contribution
|
363 |
+
feature_contributions["ceiling_features"] = ceiling_contribution
|
364 |
+
|
365 |
+
# 4. 強化吊燈的檢測
|
366 |
+
light_contribution = 0
|
367 |
+
if "indoor_light_score" in features:
|
368 |
+
indoor_light_score = features["indoor_light_score"]
|
369 |
+
circular_light_count = features.get("circular_light_count", 0)
|
370 |
+
|
371 |
+
# 加強對特定類型光源的檢測
|
372 |
+
if circular_light_count >= 1: # 即便只有一個圓形光源也很可能是室內
|
373 |
+
light_contribution = weights.get("light_features", 1.2) * 2.0
|
374 |
+
elif indoor_light_score > 0.3:
|
375 |
+
light_contribution = weights.get("light_features", 1.2) * 1.0
|
376 |
+
|
377 |
+
indoor_score += light_contribution
|
378 |
+
feature_contributions["light_features"] = light_contribution
|
379 |
+
|
380 |
+
# 5. 環境封閉度特徵
|
381 |
+
boundary_contribution = 0
|
382 |
+
if "boundary_edge_score" in features:
|
383 |
+
boundary_edge_score = features["boundary_edge_score"]
|
384 |
+
edges_density = features.get("edges_density", 0)
|
385 |
+
|
386 |
+
# 高邊界評分暗示封閉環境(室內)
|
387 |
+
if boundary_edge_score > 0.3:
|
388 |
+
boundary_contribution = weights.get("boundary_features", 1.2) * 2
|
389 |
+
elif boundary_edge_score > 0.2:
|
390 |
+
boundary_contribution = weights.get("boundary_features", 1.2) * 1.2
|
391 |
+
|
392 |
+
indoor_score += boundary_contribution
|
393 |
+
feature_contributions["boundary_features"] = boundary_contribution
|
394 |
+
|
395 |
+
if (features.get("edges_density", 0) > 0.2 and
|
396 |
+
features.get("bright_spot_count", 0) > 5 and
|
397 |
+
features.get("vertical_strength", 0) > features.get("horizontal_strength", 0) * 1.5):
|
398 |
+
# 商業街道特徵:高邊緣密度 + 多亮點 + 強垂直特徵
|
399 |
+
street_feature_score = -weights.get("street_features", 1.2) * 1.5
|
400 |
+
indoor_score += street_feature_score
|
401 |
+
feature_contributions["street_features"] = street_feature_score
|
402 |
+
|
403 |
+
# 添加對亞洲商業街道的專門檢測
|
404 |
+
if (features.get("edges_density", 0) > 0.25 and # 高邊緣密度
|
405 |
+
features.get("vertical_strength", 0) > features.get("horizontal_strength", 0) * 1.8 and # 更強的垂直結構
|
406 |
+
features.get("brightness_uniformity", 0) < 0.6): # 較低的亮度均勻性(招牌、燈光等造成)
|
407 |
+
asian_street_score = -2.2 # 非常強的室外代表性特徵
|
408 |
+
indoor_score += asian_street_score
|
409 |
+
feature_contributions["asian_commercial_street"] = asian_street_score
|
410 |
+
|
411 |
+
|
412 |
+
# 6. 垂直/水平梯度比率
|
413 |
+
gradient_contribution = 0
|
414 |
+
if features["gradient_ratio"] > 2.0:
|
415 |
+
combined_uniformity = (features["brightness_uniformity"] +
|
416 |
+
features.get("ceiling_uniformity", 0)) / 2
|
417 |
+
|
418 |
+
if combined_uniformity > 0.5:
|
419 |
+
gradient_contribution = weights["gradient_ratio"] * 0.7
|
420 |
+
else:
|
421 |
+
gradient_contribution = -weights["gradient_ratio"] * 0.3
|
422 |
+
|
423 |
+
indoor_score += gradient_contribution
|
424 |
+
feature_contributions["gradient_ratio"] = gradient_contribution
|
425 |
+
|
426 |
+
# 7. 亮點檢測(光源)
|
427 |
+
bright_spot_contribution = 0
|
428 |
+
bright_spot_count = features["bright_spot_count"]
|
429 |
+
circular_light_count = features.get("circular_light_count", 0)
|
430 |
+
|
431 |
+
# 調整亮點分析邏輯
|
432 |
+
if circular_light_count >= 1: # 即使只有一個圓形光源
|
433 |
+
bright_spot_contribution = weights["bright_spots"] * 1.5
|
434 |
+
elif bright_spot_count < 5: # 適當放寬閾值
|
435 |
+
bright_spot_contribution = weights["bright_spots"] * 0.5
|
436 |
+
elif bright_spot_count > 15: # 大量亮點比較有可能為室外
|
437 |
+
bright_spot_contribution = -weights["bright_spots"] * 0.4
|
438 |
+
|
439 |
+
indoor_score += bright_spot_contribution
|
440 |
+
feature_contributions["bright_spots"] = bright_spot_contribution
|
441 |
+
|
442 |
+
# 8. 色調分析
|
443 |
+
yellow_contribution = 0
|
444 |
+
if features["avg_brightness"] < 150 and features["yellow_orange_ratio"] > 0.15:
|
445 |
+
if features.get("indoor_light_score", 0) > 0.2:
|
446 |
+
yellow_contribution = weights["color_tone"] * 0.8
|
447 |
+
else:
|
448 |
+
yellow_contribution = weights["color_tone"] * 0.5
|
449 |
+
|
450 |
+
indoor_score += yellow_contribution
|
451 |
+
feature_contributions["yellow_tone"] = yellow_contribution
|
452 |
+
|
453 |
+
if features.get("blue_ratio", 0) > 0.7:
|
454 |
+
# 檢查是否有室內指標,如果有明顯的室內特徵,則減少藍色的負面影響
|
455 |
+
if (features.get("ceiling_uniformity", 0) > 0.6 or
|
456 |
+
features.get("boundary_edge_score", 0) > 0.3 or
|
457 |
+
features.get("indoor_light_score", 0) > 0):
|
458 |
+
blue_score = -weights["blue_ratio"] * features["blue_ratio"] * 10
|
459 |
+
else:
|
460 |
+
blue_score = -weights["blue_ratio"] * features["blue_ratio"] * 18
|
461 |
+
else:
|
462 |
+
blue_score = -weights["blue_ratio"] * features["blue_ratio"] * 18
|
463 |
+
# 9. 上半部與下半部亮度對比
|
464 |
+
sky_contribution = 0
|
465 |
+
if features["sky_brightness"] > features["avg_brightness"] * 1.3:
|
466 |
+
if features["blue_ratio"] > 0.15:
|
467 |
+
sky_contribution = -weights["sky_brightness"] * 0.9
|
468 |
+
else:
|
469 |
+
sky_contribution = -weights["sky_brightness"] * 0.6
|
470 |
+
|
471 |
+
indoor_score += sky_contribution
|
472 |
+
feature_contributions["sky_brightness"] = sky_contribution
|
473 |
+
|
474 |
+
# 加入額外的餐廳特徵檢測邏輯
|
475 |
+
dining_feature_contribution = 0
|
476 |
+
|
477 |
+
# 檢測中央懸掛式燈具,有懸掛燈代表有天花板,就代表是室內
|
478 |
+
if circular_light_count >= 1 and features.get("light_distribution_uniformity", 0) > 0.4:
|
479 |
+
dining_feature_contribution = 1.5
|
480 |
+
indoor_score += dining_feature_contribution
|
481 |
+
feature_contributions["dining_features"] = dining_feature_contribution
|
482 |
+
|
483 |
+
# 10. 增強的藍天的檢測,即便是小面積的藍天也是很強的室外指標
|
484 |
+
sky_contribution = 0
|
485 |
+
if "sky_blue_ratio" in features:
|
486 |
+
# 只有當藍色區域集中在上部且亮度高時,才認為是藍天
|
487 |
+
if features["sky_blue_ratio"] > 0.01 and features["sky_brightness"] > features.get("avg_brightness", 0) * 1.2:
|
488 |
+
sky_outdoor_score = -2.5 * features["sky_blue_ratio"] * weights.get("blue_ratio", 1.2)
|
489 |
+
indoor_score += sky_outdoor_score
|
490 |
+
feature_contributions["sky_blue_detection"] = sky_outdoor_score
|
491 |
+
|
492 |
+
asian_street_indicators = 0
|
493 |
+
|
494 |
+
# 1: 高垂直結構強度
|
495 |
+
vertical_ratio = features.get("vertical_strength", 0) / max(features.get("horizontal_strength", 1e-5), 1e-5)
|
496 |
+
if vertical_ratio > 1.8:
|
497 |
+
asian_street_indicators += 1
|
498 |
+
|
499 |
+
# 2: 高邊緣密度 + 路面標記特徵
|
500 |
+
if features.get("edges_density", 0) > 0.25 and features.get("street_line_score", 0) > 0.2:
|
501 |
+
asian_street_indicators += 2
|
502 |
+
|
503 |
+
# 3: 多個亮點 + 亮度不均勻
|
504 |
+
if features.get("bright_spot_count", 0) > 5 and features.get("brightness_uniformity", 0) < 0.6:
|
505 |
+
asian_street_indicators += 1
|
506 |
+
|
507 |
+
# 4: 藍色區域小(天空被高樓遮擋)但亮度高
|
508 |
+
if features.get("blue_ratio", 0) < 0.1 and features.get("sky_brightness", 0) > features.get("avg_brightness", 0) * 1.1:
|
509 |
+
asian_street_indicators += 1
|
510 |
+
|
511 |
+
# 如果滿足至少 3 個指標,調整權重變成偏向室外的判斷
|
512 |
+
if asian_street_indicators >= 3:
|
513 |
+
# 記錄檢測到的模式
|
514 |
+
feature_contributions["asian_street_pattern"] = -2.5
|
515 |
+
indoor_score += -2.5 # 明顯向室外傾斜
|
516 |
+
|
517 |
+
# 降低室內指標的權重
|
518 |
+
if "boundary_features" in feature_contributions:
|
519 |
+
adjusted_contribution = feature_contributions["boundary_features"] * 0.4
|
520 |
+
indoor_score -= (feature_contributions["boundary_features"] - adjusted_contribution)
|
521 |
+
feature_contributions["boundary_features"] = adjusted_contribution
|
522 |
+
|
523 |
+
if "ceiling_features" in feature_contributions:
|
524 |
+
adjusted_contribution = feature_contributions["ceiling_features"] * 0.3
|
525 |
+
indoor_score -= (feature_contributions["ceiling_features"] - adjusted_contribution)
|
526 |
+
feature_contributions["ceiling_features"] = adjusted_contribution
|
527 |
+
|
528 |
+
# 添加信息到診斷數據
|
529 |
+
diagnostics["asian_street_detected"] = True
|
530 |
+
diagnostics["asian_street_indicators"] = asian_street_indicators
|
531 |
+
|
532 |
+
bedroom_indicators = 0
|
533 |
+
|
534 |
+
# 1: 窗戶和牆壁形成的直角
|
535 |
+
if features.get("brightness_uniformity", 0) > 0.6 and features.get("boundary_edge_score", 0) > 0.3:
|
536 |
+
bedroom_indicators += 1.5 # 增加權重
|
537 |
+
|
538 |
+
# 2: 天花板和光源
|
539 |
+
if features.get("ceiling_uniformity", 0) > 0.5 and features.get("bright_spot_count", 0) > 0:
|
540 |
+
bedroom_indicators += 2.5
|
541 |
+
|
542 |
+
# 3: 良好對比度的牆壁顏色,適合臥房還有客廳
|
543 |
+
if features.get("brightness_uniformity", 0) > 0.6 and features.get("avg_saturation", 0) < 100:
|
544 |
+
bedroom_indicators += 1.5
|
545 |
+
|
546 |
+
# 特殊的檢測 4: 檢測窗戶
|
547 |
+
if features.get("boundary_edge_score", 0) > 0.25 and features.get("brightness_std", 0) > 40:
|
548 |
+
bedroom_indicators += 1.5
|
549 |
+
|
550 |
+
# 如果滿足足夠的家居指標,提高多點室內判斷分數
|
551 |
+
if bedroom_indicators >= 3:
|
552 |
+
# 增加家居環境評分
|
553 |
+
home_env_score = 3
|
554 |
+
indoor_score += home_env_score
|
555 |
+
feature_contributions["home_environment_pattern"] = home_env_score
|
556 |
+
elif bedroom_indicators >= 2:
|
557 |
+
# 適度增加家居環境評分
|
558 |
+
home_env_score = 2
|
559 |
+
indoor_score += home_env_score
|
560 |
+
feature_contributions["home_environment_pattern"] = home_env_score
|
561 |
+
|
562 |
+
# 根據總分轉換為���率(使用sigmoid函數)
|
563 |
+
indoor_probability = 1 / (1 + np.exp(-indoor_score * 0.22))
|
564 |
+
|
565 |
+
# 判斷結果
|
566 |
+
is_indoor = indoor_probability > 0.5
|
567 |
+
|
568 |
+
return {
|
569 |
+
"is_indoor": is_indoor,
|
570 |
+
"indoor_probability": indoor_probability,
|
571 |
+
"indoor_score": indoor_score,
|
572 |
+
"feature_contributions": feature_contributions,
|
573 |
+
"diagnostics": diagnostics
|
574 |
+
}
|
575 |
+
|
576 |
+
def _determine_lighting_conditions(self, features, is_indoor):
|
577 |
+
"""
|
578 |
+
基於特徵和室內/室外判斷確定光照條件。
|
579 |
+
|
580 |
+
Args:
|
581 |
+
features: 特徵字典
|
582 |
+
is_indoor: 是否是室內環境
|
583 |
+
|
584 |
+
Returns:
|
585 |
+
Dict: 光照條件分析結果
|
586 |
+
"""
|
587 |
+
# 初始化
|
588 |
+
time_of_day = "unknown"
|
589 |
+
confidence = 0.5
|
590 |
+
diagnostics = {}
|
591 |
+
|
592 |
+
avg_brightness = features["avg_brightness"]
|
593 |
+
dark_pixel_ratio = features["dark_pixel_ratio"]
|
594 |
+
yellow_orange_ratio = features["yellow_orange_ratio"]
|
595 |
+
blue_ratio = features["blue_ratio"]
|
596 |
+
gray_ratio = features["gray_ratio"]
|
597 |
+
|
598 |
+
# 基於室內/室外分別判斷
|
599 |
+
if is_indoor:
|
600 |
+
# 計算室內住宅自然光指標
|
601 |
+
natural_window_light = 0
|
602 |
+
|
603 |
+
# 檢查窗戶特徵和光線特性
|
604 |
+
if (features.get("blue_ratio", 0) > 0.1 and
|
605 |
+
features.get("sky_brightness", 0) > avg_brightness * 1.1):
|
606 |
+
natural_window_light += 1
|
607 |
+
|
608 |
+
# 檢查均勻柔和的光線分布
|
609 |
+
if (features.get("brightness_uniformity", 0) > 0.65 and
|
610 |
+
features.get("brightness_std", 0) < 70):
|
611 |
+
natural_window_light += 1
|
612 |
+
|
613 |
+
# 檢查暖色調比例
|
614 |
+
if features.get("warm_ratio", 0) > 0.2:
|
615 |
+
natural_window_light += 1
|
616 |
+
|
617 |
+
# 家居環境指標
|
618 |
+
home_env_score = features.get("home_environment_pattern", 0)
|
619 |
+
if home_env_score > 1.5:
|
620 |
+
natural_window_light += 1
|
621 |
+
|
622 |
+
# 1. 室內明亮環境,可能有窗戶自然光
|
623 |
+
if avg_brightness > 130:
|
624 |
+
# 檢測自然光住宅空間 - 新增類型!
|
625 |
+
if natural_window_light >= 2 and home_env_score > 1.5:
|
626 |
+
time_of_day = "indoor_residential_natural" # 家裡的自然光類型
|
627 |
+
confidence = 0.8
|
628 |
+
diagnostics["reason"] = "Bright residential space with natural window lighting"
|
629 |
+
# 檢查窗戶特徵 - 如果有明亮的窗戶且色調為藍
|
630 |
+
elif features.get("blue_ratio", 0) > 0.1 and features.get("sky_brightness", 0) > 150:
|
631 |
+
time_of_day = "indoor_bright"
|
632 |
+
confidence = 0.8
|
633 |
+
diagnostics["reason"] = "Bright indoor scene with window light"
|
634 |
+
else:
|
635 |
+
time_of_day = "indoor_bright"
|
636 |
+
confidence = 0.75
|
637 |
+
diagnostics["reason"] = "High brightness in indoor environment"
|
638 |
+
# 2. 室內中等亮度環境
|
639 |
+
elif avg_brightness > 100:
|
640 |
+
time_of_day = "indoor_moderate"
|
641 |
+
confidence = 0.7
|
642 |
+
diagnostics["reason"] = "Moderate brightness in indoor environment"
|
643 |
+
# 3. 室內低光照環境
|
644 |
+
else:
|
645 |
+
time_of_day = "indoor_dim"
|
646 |
+
confidence = 0.65 + dark_pixel_ratio / 3
|
647 |
+
diagnostics["reason"] = "Low brightness in indoor environment"
|
648 |
+
|
649 |
+
# 1. 檢測設計師風格住宅,可以偵測到比較多種類的狀況
|
650 |
+
designer_residential_score = 0
|
651 |
+
# 檢測特色燈具
|
652 |
+
if (features.get("circular_light_count", 0) > 0 or features.get("bright_spot_count", 0) > 2):
|
653 |
+
designer_residential_score += 1
|
654 |
+
# 檢測高品質均勻照明
|
655 |
+
if features.get("brightness_uniformity", 0) > 0.7:
|
656 |
+
designer_residential_score += 1
|
657 |
+
# 檢測溫暖色調
|
658 |
+
if features.get("warm_ratio", 0) > 0.3:
|
659 |
+
designer_residential_score += 1
|
660 |
+
# 檢測家居環境特徵
|
661 |
+
if home_env_score > 1.5:
|
662 |
+
designer_residential_score += 1
|
663 |
+
|
664 |
+
if designer_residential_score >= 3 and home_env_score > 1.5:
|
665 |
+
time_of_day = "indoor_designer_residential"
|
666 |
+
confidence = 0.85
|
667 |
+
diagnostics["special_case"] = "Designer residential lighting with decorative elements"
|
668 |
+
|
669 |
+
# 2. 檢測餐廳/酒吧場景
|
670 |
+
elif avg_brightness < 150 and yellow_orange_ratio > 0.2:
|
671 |
+
if features["warm_ratio"] > 0.4:
|
672 |
+
time_of_day = "indoor_restaurant"
|
673 |
+
confidence = 0.65 + yellow_orange_ratio / 4
|
674 |
+
diagnostics["special_case"] = "Warm, yellow-orange lighting suggests restaurant/bar setting"
|
675 |
+
|
676 |
+
# 3. 檢測商業照明空間
|
677 |
+
elif avg_brightness > 120 and features["bright_spot_count"] > 4:
|
678 |
+
# 增加商業照明判別的精確度
|
679 |
+
commercial_score = 0
|
680 |
+
# 多個亮點
|
681 |
+
commercial_score += min(1.0, features["bright_spot_count"] * 0.05)
|
682 |
+
# 不太可能是住宅的指標
|
683 |
+
if features.get("home_environment_pattern", 0) < 1.5:
|
684 |
+
commercial_score += 0.5
|
685 |
+
# 整體照明結構化布局
|
686 |
+
if features.get("light_distribution_uniformity", 0) > 0.6:
|
687 |
+
commercial_score += 0.5
|
688 |
+
|
689 |
+
if commercial_score > 0.6 and designer_residential_score < 3:
|
690 |
+
time_of_day = "indoor_commercial"
|
691 |
+
confidence = 0.7 + commercial_score / 5
|
692 |
+
diagnostics["special_case"] = "Multiple structured light sources suggest commercial lighting"
|
693 |
+
else:
|
694 |
+
# 室外場景判斷保持不變
|
695 |
+
if avg_brightness < 90: # 降低夜間判斷的亮度閾值
|
696 |
+
# 檢測是否有車燈/街燈
|
697 |
+
has_lights = features["bright_spot_count"] > 3
|
698 |
+
|
699 |
+
if has_lights:
|
700 |
+
time_of_day = "night"
|
701 |
+
confidence = 0.8 + dark_pixel_ratio / 5
|
702 |
+
diagnostics["reason"] = "Low brightness with light sources detected"
|
703 |
+
|
704 |
+
# 檢查是否是霓虹燈場景
|
705 |
+
if yellow_orange_ratio > 0.15 and features["bright_spot_count"] > 5:
|
706 |
+
time_of_day = "neon_night"
|
707 |
+
confidence = 0.75 + yellow_orange_ratio / 3
|
708 |
+
diagnostics["special_case"] = "Multiple colorful light sources suggest neon lighting"
|
709 |
+
else:
|
710 |
+
time_of_day = "night"
|
711 |
+
confidence = 0.7 + dark_pixel_ratio / 3
|
712 |
+
diagnostics["reason"] = "Low brightness outdoor scene"
|
713 |
+
elif avg_brightness < 130 and yellow_orange_ratio > 0.2:
|
714 |
+
time_of_day = "sunset/sunrise"
|
715 |
+
confidence = 0.7 + yellow_orange_ratio / 3
|
716 |
+
diagnostics["reason"] = "Moderate brightness with yellow-orange tones"
|
717 |
+
elif avg_brightness > 150 and blue_ratio > 0.15:
|
718 |
+
time_of_day = "day_clear"
|
719 |
+
confidence = 0.7 + blue_ratio / 3
|
720 |
+
diagnostics["reason"] = "High brightness with blue tones (likely sky)"
|
721 |
+
elif avg_brightness > 130:
|
722 |
+
time_of_day = "day_cloudy"
|
723 |
+
confidence = 0.7 + gray_ratio / 3
|
724 |
+
diagnostics["reason"] = "Good brightness with higher gray tones"
|
725 |
+
else:
|
726 |
+
# 默認判斷
|
727 |
+
if yellow_orange_ratio > gray_ratio:
|
728 |
+
time_of_day = "sunset/sunrise"
|
729 |
+
confidence = 0.6 + yellow_orange_ratio / 3
|
730 |
+
diagnostics["reason"] = "Yellow-orange tones dominant"
|
731 |
+
else:
|
732 |
+
time_of_day = "day_cloudy"
|
733 |
+
confidence = 0.6 + gray_ratio / 3
|
734 |
+
diagnostics["reason"] = "Gray tones dominant"
|
735 |
+
|
736 |
+
# 檢查是否是特殊室外場景(如體育場)
|
737 |
+
if avg_brightness > 120 and features["brightness_uniformity"] > 0.8:
|
738 |
+
# 高亮度且非常均勻的光照可能是體育場燈光
|
739 |
+
time_of_day = "stadium_lighting"
|
740 |
+
confidence = 0.7
|
741 |
+
diagnostics["special_case"] = "Uniform bright lighting suggests stadium/sports lighting"
|
742 |
+
|
743 |
+
# 檢查是否是混合光照(如室內/室外過渡區)
|
744 |
+
if 100 < avg_brightness < 150 and 0.1 < blue_ratio < 0.2:
|
745 |
+
if features["gradient_ratio"] > 1.5:
|
746 |
+
time_of_day = "mixed_lighting"
|
747 |
+
confidence = 0.65
|
748 |
+
diagnostics["special_case"] = "Features suggest indoor-outdoor transition area"
|
749 |
+
|
750 |
+
# 確保信心值在 0-1 範圍內
|
751 |
+
confidence = min(0.95, max(0.5, confidence))
|
752 |
+
|
753 |
+
if time_of_day in ["indoor_residential_natural", "indoor_designer_residential"] and hasattr(self, "config"):
|
754 |
+
# 確保 LIGHTING_CONDITIONS 中有這些新類型的描述
|
755 |
+
if time_of_day == "indoor_residential_natural":
|
756 |
+
lightingType = {
|
757 |
+
"template_modifiers": {
|
758 |
+
"indoor_residential_natural": "naturally-lit residential"
|
759 |
+
},
|
760 |
+
"time_descriptions": {
|
761 |
+
"indoor_residential_natural": {
|
762 |
+
"general": "The scene is captured in a residential space with ample natural light from windows.",
|
763 |
+
"bright": "The residential space is brightly lit with natural daylight streaming through windows.",
|
764 |
+
"medium": "The home environment has good natural lighting providing a warm, inviting atmosphere.",
|
765 |
+
"dim": "The living space has soft natural light filtering through windows or openings."
|
766 |
+
}
|
767 |
+
}
|
768 |
+
}
|
769 |
+
elif time_of_day == "indoor_designer_residential":
|
770 |
+
lightingType = {
|
771 |
+
"template_modifiers": {
|
772 |
+
"indoor_designer_residential": "designer-lit residential"
|
773 |
+
},
|
774 |
+
"time_descriptions": {
|
775 |
+
"indoor_designer_residential": {
|
776 |
+
"general": "The scene is captured in a residential space with carefully designed lighting elements.",
|
777 |
+
"bright": "The home features professionally designed lighting with decorative fixtures creating a bright atmosphere.",
|
778 |
+
"medium": "The residential interior showcases curated lighting design balancing form and function.",
|
779 |
+
"dim": "The living space has thoughtfully placed designer lighting creating an intimate ambiance."
|
780 |
+
}
|
781 |
+
}
|
782 |
+
}
|
783 |
+
|
784 |
+
return {
|
785 |
+
"time_of_day": time_of_day,
|
786 |
+
"confidence": confidence,
|
787 |
+
"diagnostics": diagnostics
|
788 |
+
}
|
789 |
+
|
790 |
+
|
791 |
+
def _get_default_config(self):
|
792 |
+
"""
|
793 |
+
返回優化版本的默認配置參數。
|
794 |
+
"""
|
795 |
+
return {
|
796 |
+
"indoor_outdoor_weights": {
|
797 |
+
"blue_ratio": 0.6,
|
798 |
+
"brightness_uniformity": 1.2,
|
799 |
+
"gradient_ratio": 0.7,
|
800 |
+
"bright_spots": 0.8,
|
801 |
+
"color_tone": 0.5,
|
802 |
+
"sky_brightness": 0.9,
|
803 |
+
"brightness_variation": 0.7,
|
804 |
+
"ceiling_features": 1.5,
|
805 |
+
"light_features": 1.1,
|
806 |
+
"boundary_features": 2.8,
|
807 |
+
"street_features": 2.0,
|
808 |
+
"building_features": 1.6
|
809 |
+
},
|
810 |
+
"include_diagnostics": True
|
811 |
+
}
|
lighting_conditions.py
ADDED
@@ -0,0 +1,131 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
# Descriptive text for each lighting-condition label produced by the lighting
# analyzer. "time_descriptions" maps a time_of_day label to phrasings keyed by
# overall brightness; "template_modifiers" gives a short adjective per label;
# "activity_modifiers" lists mood/activity adjectives per label.
#
# Fix vs. previous revision: the entries for "indoor_commercial",
# "indoor_restaurant", "neon_night", "stadium_lighting" and "mixed_lighting"
# were defined at the TOP level of LIGHTING_CONDITIONS instead of inside
# "time_descriptions", so lookups by those time_of_day labels would fail.
# They are now nested under "time_descriptions" like every other label.
#
# NOTE(review): "template_modifiers" contains keys ("beach_lighting",
# "sports_venue_lighting", "professional_kitchen_lighting") that do not match
# the corresponding description keys ("beach_daylight", "sports_arena",
# "kitchen_working") — verify which label spelling the callers emit.
LIGHTING_CONDITIONS = {
    "time_descriptions": {
        "day_clear": {
            "general": "The scene is captured during clear daylight hours with bright natural lighting.",
            "bright": "The scene is brightly lit with strong, clear daylight.",
            "medium": "The scene is illuminated with moderate daylight under clear conditions.",
            "dim": "The scene is captured in soft daylight on a clear day."
        },
        "day_cloudy": {
            "general": "The scene is captured during daytime under overcast conditions.",
            "bright": "The scene has the diffused bright lighting of an overcast day.",
            "medium": "The scene has even, soft lighting typical of a cloudy day.",
            "dim": "The scene has the muted lighting of a heavily overcast day."
        },
        "sunset/sunrise": {
            "general": "The scene is captured during golden hour with warm lighting.",
            "bright": "The scene is illuminated with bright golden hour light with long shadows.",
            "medium": "The scene has the warm orange-yellow glow typical of sunset or sunrise.",
            "dim": "The scene has soft, warm lighting characteristic of early sunrise or late sunset."
        },
        "night": {
            "general": "The scene is captured at night with limited natural lighting.",
            "bright": "The scene is captured at night but well-lit with artificial lighting.",
            "medium": "The scene is captured at night with moderate artificial lighting.",
            "dim": "The scene is captured in low-light night conditions with minimal illumination."
        },
        "indoor_bright": {
            "general": "The scene is captured indoors with ample lighting.",
            "bright": "The indoor space is brightly lit, possibly with natural light from windows.",
            "medium": "The indoor space has good lighting conditions.",
            "dim": "The indoor space has adequate lighting."
        },
        "indoor_moderate": {
            "general": "The scene is captured indoors with moderate lighting.",
            "bright": "The indoor space has comfortable, moderate lighting.",
            "medium": "The indoor space has standard interior lighting.",
            "dim": "The indoor space has somewhat subdued lighting."
        },
        "indoor_dim": {
            "general": "The scene is captured indoors with dim or mood lighting.",
            "bright": "The indoor space has dim but sufficient lighting.",
            "medium": "The indoor space has low, atmospheric lighting.",
            "dim": "The indoor space has very dim, possibly mood-oriented lighting."
        },
        "beach_daylight": {
            "general": "The scene is captured during daytime at a beach with bright natural sunlight.",
            "bright": "The beach scene is intensely illuminated by direct sunlight.",
            "medium": "The coastal area has even natural daylight.",
            "dim": "The beach has softer lighting, possibly from a partially cloudy sky."
        },
        "sports_arena": {
            "general": "The scene is captured in a sports venue with specialized arena lighting.",
            "bright": "The sports facility is brightly illuminated with powerful overhead lights.",
            "medium": "The venue has standard sports event lighting providing clear visibility.",
            "dim": "The sports area has reduced illumination, possibly before or after an event."
        },
        "kitchen_working": {
            "general": "The scene is captured in a professional kitchen with task-oriented lighting.",
            "bright": "The kitchen is intensely illuminated with clear, functional lighting.",
            "medium": "The culinary space has standard working lights focused on preparation areas.",
            "dim": "The kitchen has reduced lighting, possibly during off-peak hours."
        },
        "indoor_commercial": {
            "general": "The scene is captured inside a commercial setting with retail-optimized lighting.",
            "bright": "The space is brightly illuminated with commercial display lighting to highlight merchandise.",
            "medium": "The commercial interior has standard retail lighting that balances visibility and ambiance.",
            "dim": "The commercial space has subdued lighting creating an upscale or intimate shopping atmosphere."
        },
        "indoor_restaurant": {
            "general": "The scene is captured inside a restaurant with characteristic dining lighting.",
            "bright": "The restaurant is well-lit with clear illumination emphasizing food presentation.",
            "medium": "The dining space has moderate lighting striking a balance between functionality and ambiance.",
            "dim": "The restaurant features soft, low lighting creating an intimate dining atmosphere."
        },
        "neon_night": {
            "general": "The scene is captured at night with colorful neon lighting typical of entertainment districts.",
            "bright": "The night scene is illuminated by vibrant neon signs creating a lively, colorful atmosphere.",
            "medium": "The evening setting features moderate neon lighting creating a characteristic urban nightlife scene.",
            "dim": "The night area has subtle neon accents against the darkness, creating a moody urban atmosphere."
        },
        "stadium_lighting": {
            "general": "The scene is captured under powerful stadium lights designed for spectator events.",
            "bright": "The venue is intensely illuminated by stadium floodlights creating daylight-like conditions.",
            "medium": "The sports facility has standard event lighting providing clear visibility across the venue.",
            "dim": "The stadium has reduced illumination typical of pre-event or post-event conditions."
        },
        "mixed_lighting": {
            "general": "The scene features a mix of indoor and outdoor lighting creating transitional illumination.",
            "bright": "The space blends bright natural and artificial light sources across indoor-outdoor boundaries.",
            "medium": "The area combines moderate indoor lighting with outdoor illumination in a balanced way.",
            "dim": "The transition space features subtle lighting gradients between indoor and outdoor zones."
        },
        "unknown": {
            "general": "The lighting conditions in this scene are not easily determined."
        }
    },
    "template_modifiers": {
        "day_clear": "brightly-lit",
        "day_cloudy": "softly-lit",
        "sunset/sunrise": "warmly-lit",
        "night": "night-time",
        "indoor_bright": "well-lit indoor",
        "indoor_moderate": "indoor",
        "indoor_dim": "dimly-lit indoor",
        "indoor_commercial": "retail-lit",
        "indoor_restaurant": "atmospherically-lit",
        "neon_night": "neon-illuminated",
        "stadium_lighting": "flood-lit",
        "mixed_lighting": "transitionally-lit",
        "beach_lighting": "sun-drenched",
        "sports_venue_lighting": "arena-lit",
        "professional_kitchen_lighting": "kitchen-task lit",
        "unknown": ""
    },
    "activity_modifiers": {
        "day_clear": ["active", "lively", "busy"],
        "day_cloudy": ["calm", "relaxed", "casual"],
        "sunset/sunrise": ["peaceful", "transitional", "atmospheric"],
        "night": ["quiet", "subdued", "nocturnal"],
        "indoor_bright": ["focused", "productive", "engaged"],
        "indoor_moderate": ["comfortable", "social", "casual"],
        "indoor_dim": ["intimate", "relaxed", "private"],
        "indoor_commercial": ["shopping", "browsing", "consumer-oriented"],
        "indoor_restaurant": ["dining", "social", "culinary"],
        "neon_night": ["vibrant", "energetic", "night-life"],
        "stadium_lighting": ["event-focused", "spectator-oriented", "performance-based"],
        "mixed_lighting": ["transitional", "adaptable", "variable"],
        "unknown": []
    }
}
|
object_categories.py
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Coarse object categories as lists of detector class IDs.
# NOTE(review): the IDs are presumably COCO-80 class indices — confirm
# against the detection model's label map before relying on them.
OBJECT_CATEGORIES = {
    "furniture": list(range(56, 62)),
    "electronics": list(range(62, 71)),
    "kitchen_items": list(range(39, 46)),
    "food": list(range(46, 56)),
    "vehicles": list(range(1, 9)),
    "personal_items": [24, 25, 26, 27, 28, 73, 78, 79],
}
|
object_template_fillers.py
ADDED
@@ -0,0 +1,78 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
# Phrase banks used to fill the {placeholder} slots of scene description
# templates. Each key names a template placeholder; each value is a list of
# interchangeable noun phrases a caller can sample from.
#
# FIX: the original literal defined four keys twice ("pedestrian_flow",
# "asian_elements", "architectural_elements", "crossing_pattern"). Python
# silently keeps only the LAST occurrence of a duplicate dict key, so the
# earlier filler lists were being discarded. The duplicates are merged below
# (all phrases from both occurrences retained, earlier occurrence first).
OBJECT_TEMPLATE_FILLERS = {
    "furniture": ["designer chairs", "wooden dining table", "stylish seating", "upholstered armchairs", "elegant dining furniture"],
    "design_elements": ["art pieces", "decorative wreaths", "statement lighting", "seasonal decorations", "sophisticated decor"],
    "lighting": ["pendant lights", "decorative fixtures", "geometric lighting", "modern chandeliers", "ambient illumination"],
    "table_setup": ["elegantly set table", "tabletop decorations", "seasonal centerpieces", "formal place settings", "floral arrangements"],
    "seating": ["upholstered chairs", "accent armchairs", "mixed seating styles", "designer dining chairs", "comfortable dining seats"],
    "table_description": ["solid wood table", "designer dining table", "expansive dining surface", "artisanal table", "statement dining table"],

    "storefront_features": ["multi-story shops", "illuminated signs", "merchandise displays", "compact storefronts", "vertical retail spaces"],
    # Merged: street-level flow phrases + intersection-crossing phrases.
    "pedestrian_flow": ["people walking", "shoppers", "pedestrians", "locals and tourists", "urban foot traffic",
                        "people crossing", "directional movement", "coordinated crossing", "timed pedestrian traffic", "intersection navigation"],
    # Merged: commercial-street phrases + night-market/temple phrases.
    "asian_elements": ["Asian language signage", "decorative lanterns", "local storefronts", "character-based text", "regional design elements",
                       "hanging red lanterns", "character-based signage", "ornate temple decorations", "traditional paper decorations", "stylized gateway arches"],
    "cultural_elements": ["red lanterns", "local typography", "distinctive architecture", "cultural symbols", "traditional decorations"],
    "signage": ["bright store signs", "multilingual text", "vertical signboards", "neon displays", "electronic advertisements"],
    "street_activities": ["shopping", "commuting", "socializing", "vendor transactions", "urban navigation"],

    "buildings": ["high-rise office buildings", "corporate towers", "skyscrapers", "financial institutions", "commercial headquarters"],
    "traffic_elements": ["vehicle lights", "trams/street cars", "lane markers", "traffic signals", "urban transit"],
    "skyscrapers": ["glass and steel buildings", "tall structures", "modern architecture", "office towers", "urban high-rises"],
    "road_features": ["wide avenues", "tram tracks", "traffic lanes", "median dividers", "urban throughways"],
    # Merged: modern-city phrases + traditional/temple phrases.
    "architectural_elements": ["contemporary buildings", "urban design", "varied architectural styles", "corporate architecture", "city planning features",
                               "tiered pagoda roofs", "ornate dragon sculptures", "stone guardian statues", "intricately carved railings", "traditional wooden beams"],
    "city_landmarks": ["distant bridge", "skyline features", "iconic structures", "urban monuments", "signature buildings"],

    # Merged: street-level crossing phrases + aerial-view crossing phrases.
    "crossing_pattern": ["zebra crosswalks", "pedestrian walkways", "crosswalk markings", "intersection design", "safety stripes",
                         "grid-like pedestrian crossings", "multi-directional crosswalks", "cross-shaped intersection design", "perpendicular crossing lanes", "zebra-striped crosswalks viewed from above"],
    "pedestrian_density": ["groups of people", "commuters", "diverse pedestrians", "urban crowds", "varying foot traffic"],
    "pedestrian_behavior": ["walking in different directions", "crossing together", "waiting for signals", "navigating intersections", "following traffic rules"],
    "traffic_pattern": ["four-way intersection", "crossroad", "junction", "multi-directional traffic", "regulated crossing"],

    "transit_vehicles": ["buses", "trams", "trains", "taxis", "shuttles"],
    "passenger_activity": ["boarding", "waiting", "exiting vehicles", "checking schedules", "navigating stations"],
    "transportation_modes": ["public transit", "private vehicles", "ride services", "light rail", "bus systems"],
    "passenger_needs": ["waiting areas", "information displays", "ticketing services", "transit connections", "seating"],
    "transit_infrastructure": ["stations", "platforms", "boarding areas", "transit lanes", "signaling systems"],
    "passenger_movement": ["transfers", "entrances and exits", "queueing", "platform access", "terminal navigation"],

    "retail_elements": ["storefronts", "display windows", "shopping bags", "merchandise", "retail signage"],
    "shopping_activity": ["browsing", "carrying purchases", "window shopping", "social shopping", "consumer activities"],
    "store_types": ["boutiques", "brand stores", "local shops", "chain retailers", "specialty stores"],
    "walkway_features": ["pedestrian paths", "shopping promenades", "retail corridors", "commercial walkways", "shopping streets"],
    "commercial_signage": ["brand logos", "sale announcements", "store names", "advertising displays", "digital signage"],
    "consumer_behavior": ["shopping in groups", "individual browsing", "carrying bags", "examining products", "moving between stores"],

    "beach_equipment": ["beach umbrellas", "surfboards", "beach towels", "sun protection", "recreational equipment"],
    "water_activities": ["water sports", "surfing", "beach recreation", "sun bathing", "coastal leisure"],
    "sports_equipment": ["game balls", "professional equipment", "athletic gear", "sports apparatus", "competition items"],
    "competitive_activities": ["team sports", "athletic contests", "competitive games", "sporting events", "professional matches"],
    "kitchen_equipment": ["professional appliances", "cooking stations", "preparation surfaces", "culinary tools", "industrial equipment"],
    "food_preparation": ["meal production", "culinary operations", "food service preparation", "commercial cooking", "kitchen workflow"],

    "pedestrian_pattern": ["scattered distribution of people", "organized flow of pedestrians", "clustered gatherings", "radial movement patterns", "linear procession of individuals"],
    "commercial_layout": ["parallel shopping streets", "interconnected shopping blocks", "radial marketplace design", "grid-like retail arrangement", "meandering commercial pathways"],
    "movement_pattern": ["circular crowd motion", "directional pedestrian flow", "scattered individual movement", "converging foot traffic", "diverging pedestrian patterns"],

    "stall_elements": ["food vendors with steaming woks", "trinket sellers with colorful displays", "lantern-lit stalls", "bamboo-framed shops", "canvas-covered market stands"],
    "cultural_lighting": ["paper lantern illumination", "neon character signs", "strung festival lights", "hanging light chains", "colorful shop front lighting"],
    "cultural_symbols": ["dharma wheels", "lotus motifs", "yin-yang symbols", "zodiac animal representations", "traditional calligraphy"],
    "architectural_style": ["Baroque facades", "Gothic spires", "Renaissance colonnades", "Neoclassical pediments", "Medieval archways"],
    "european_features": ["cobblestone paving", "ornate fountains", "bronze statuary", "wrought iron lampposts", "cafe terraces"],

    "lighting_effects": ["streetlamp pools of light", "neon sign glow", "illuminated window squares", "headlight streams", "traffic signal flashes"],
    "illuminated_elements": ["lit storefront windows", "glowing traffic signals", "illuminated advertising", "headlight-lit streets", "backlit silhouettes"],
    "neon_elements": ["colorful shop signs", "animated light displays", "illuminated brand logos", "glowing storefront outlines", "digital advertising screens"],
    "illuminated_signage": ["bright LED displays", "glowing brand names", "projected light advertisements", "illuminated menu boards", "digital information screens"],
    "colorful_lighting": ["multi-colored neon", "warm ambient illumination", "cool blue accent lights", "festive string lighting", "dynamic color-changing displays"],

    "transitional_elements": ["retractable glass walls", "indoor-outdoor bar counters", "terraced seating areas", "threshold planters", "partial canopy coverage"],
    "indoor_features": ["climate-controlled spaces", "soft seating arrangements", "interior decor accents", "mood lighting fixtures", "sound-dampened areas"],
    "outdoor_setting": ["sidewalk tables", "patio seating", "garden furniture", "open-air counters", "courtyard arrangements"],
    "seating_arrangement": ["tiered spectator stands", "premium viewing boxes", "courtside seating", "general admission benches", "stadium chair rows"],
    "playing_surface": ["marked court boundaries", "manicured field turf", "running tracks", "competition equipment", "sports field markers"],
    "construction_equipment": ["tower cranes", "excavators", "cement mixers", "scaffolding structures", "construction barriers"],
    "medical_elements": ["examination furniture", "monitoring equipment", "sanitation stations", "privacy screens", "medical supply carts"],
    "educational_furniture": ["student desks", "lecture podiums", "laboratory benches", "learning stations", "collaborative workspace tables"]
}
|
requirements.txt
CHANGED
@@ -6,3 +6,4 @@ pillow>=9.4.0
|
|
6 |
numpy>=1.23.5
|
7 |
matplotlib>=3.7.0
|
8 |
gradio>=3.32.0
|
|
|
|
6 |
numpy>=1.23.5
|
7 |
matplotlib>=3.7.0
|
8 |
gradio>=3.32.0
|
9 |
+
git+https://github.com/openai/CLIP.git
|
room_02.jpg
ADDED
![]() |
Git LFS Details
|
safety_templates.py
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Sentence templates for phrasing safety-related observations in generated
# scene descriptions. Each value is a str.format template whose single
# placeholder is filled by the caller (e.g. SceneDescriptor).
SAFETY_TEMPLATES = {
    "general": "Pay attention to {safety_element}.",
    "warning": "Be cautious of {hazard} in this environment.",
    "notice": "Note the presence of {element_of_interest}."
}
|
scene_analyzer.py
ADDED
@@ -0,0 +1,408 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import numpy as np
|
3 |
+
from typing import Dict, List, Tuple, Any, Optional
|
4 |
+
|
5 |
+
from spatial_analyzer import SpatialAnalyzer
|
6 |
+
from scene_description import SceneDescriptor
|
7 |
+
from enhance_scene_describer import EnhancedSceneDescriber
|
8 |
+
from clip_analyzer import CLIPAnalyzer
|
9 |
+
from scene_type import SCENE_TYPES
|
10 |
+
from object_categories import OBJECT_CATEGORIES
|
11 |
+
|
12 |
+
class SceneAnalyzer:
    """
    Core class for scene analysis and understanding based on object detection results.
    Analyzes detected objects, their relationships, and infers the scene type.

    Fuses two evidence sources: YOLO-style object detections (what objects
    are present, where) and optional CLIP image-level scene scores (holistic
    scene understanding). CLIP is optional; analysis degrades gracefully to
    detection-only scoring when it cannot be initialized.
    """
    def __init__(self, class_names: Dict[int, str] = None):
        """
        Initialize the scene analyzer with optional class name mappings.

        Args:
            class_names: Dictionary mapping class IDs to class names (optional)
        """
        self.class_names = class_names

        # Load the static scene-type and object-category definitions.
        self.SCENE_TYPES = SCENE_TYPES
        self.OBJECT_CATEGORIES = OBJECT_CATEGORIES

        # Initialize collaborating components, sharing the same taxonomy so
        # every stage reasons over identical scene/category definitions.
        self.spatial_analyzer = SpatialAnalyzer(class_names=class_names, object_categories=self.OBJECT_CATEGORIES)
        self.descriptor = SceneDescriptor(scene_types=self.SCENE_TYPES, object_categories=self.OBJECT_CATEGORIES)
        self.scene_describer = EnhancedSceneDescriber(scene_types=self.SCENE_TYPES)

        # Initialize the CLIP analyzer. CLIP is optional: if it cannot be
        # loaded (missing package, no weights), fall back to YOLO-only mode.
        try:
            self.clip_analyzer = CLIPAnalyzer()
            self.use_clip = True
        except Exception as e:
            print(f"Warning: Could not initialize CLIP analyzer: {e}")
            print("Scene analysis will proceed without CLIP. Install CLIP with 'pip install clip' for enhanced scene understanding.")
            self.use_clip = False

    def generate_scene_description(self,
                                   scene_type,
                                   detected_objects,
                                   confidence,
                                   lighting_info=None,
                                   functional_zones=None):
        """
        Generate a natural-language description of the scene.

        Args:
            scene_type: identified scene type
            detected_objects: list of detected object dicts
            confidence: scene classification confidence
            lighting_info: lighting-condition info (optional)
            functional_zones: functional-zone info (optional)

        Returns:
            str: the generated scene description
        """
        # Pure delegation to the enhanced describer component.
        return self.scene_describer.generate_description(
            scene_type,
            detected_objects,
            confidence,
            lighting_info,
            functional_zones
        )

    def _generate_scene_description(self, scene_type, detected_objects, confidence, lighting_info=None):
        """
        Legacy-style entry point that delegates to the new implementation,
        computing functional zones itself first.
        """
        # Derive functional-zone information before delegating (the public
        # method expects it as an argument).
        functional_zones = self.spatial_analyzer._identify_functional_zones(detected_objects, scene_type)

        # Use the enhanced scene description generator.
        return self.generate_scene_description(
            scene_type,
            detected_objects,
            confidence,
            lighting_info,
            functional_zones
        )

    def _define_image_regions(self):
        """Define regions of the image for spatial analysis (3x3 grid)"""
        # Each value is (x_min, y_min, x_max, y_max) in normalized [0, 1]
        # image coordinates.
        self.regions = {
            "top_left": (0, 0, 1/3, 1/3),
            "top_center": (1/3, 0, 2/3, 1/3),
            "top_right": (2/3, 0, 1, 1/3),
            "middle_left": (0, 1/3, 1/3, 2/3),
            "middle_center": (1/3, 1/3, 2/3, 2/3),
            "middle_right": (2/3, 1/3, 1, 2/3),
            "bottom_left": (0, 2/3, 1/3, 1),
            "bottom_center": (1/3, 2/3, 2/3, 1),
            "bottom_right": (2/3, 2/3, 1, 1)
        }


    def analyze(self, detection_result: Any, lighting_info: Optional[Dict] = None, class_confidence_threshold: float = 0.35, scene_confidence_threshold: float = 0.6) -> Dict:
        """
        Analyze detection results to determine scene type and provide understanding.

        Args:
            detection_result: Detection result from YOLOv8
            lighting_info: Optional lighting condition analysis results
            class_confidence_threshold: Minimum confidence to consider an object
            scene_confidence_threshold: Minimum confidence to determine a scene

        Returns:
            Dictionary with scene analysis results
        """
        # If no result or no detections, return empty analysis
        if detection_result is None or len(detection_result.boxes) == 0:
            return {
                "scene_type": "unknown",
                "confidence": 0,
                "description": "No objects detected in the image.",
                "objects_present": [],
                "object_count": 0,
                "regions": {},
                "possible_activities": [],
                "safety_concerns": [],
                "lighting_conditions": lighting_info or {"time_of_day": "unknown", "confidence": 0.0}
            }

        # Get class names from detection result if not already set
        if self.class_names is None:
            self.class_names = detection_result.names
            # Also update class names in spatial analyzer
            self.spatial_analyzer.class_names = self.class_names

        # Extract detected objects with confidence above threshold
        detected_objects = self.spatial_analyzer._extract_detected_objects(
            detection_result,
            confidence_threshold=class_confidence_threshold
        )

        # No objects above confidence threshold
        if not detected_objects:
            return {
                "scene_type": "unknown",
                "confidence": 0.0,
                "description": "No objects with sufficient confidence detected.",
                "objects_present": [],
                "object_count": 0,
                "regions": {},
                "possible_activities": [],
                "safety_concerns": [],
                "lighting_conditions": lighting_info or {"time_of_day": "unknown", "confidence": 0.0}
            }

        # Analyze object distribution in regions
        region_analysis = self.spatial_analyzer._analyze_regions(detected_objects)

        # Compute scene type scores based on object detection
        yolo_scene_scores = self._compute_scene_scores(detected_objects)

        # Analyze the image with CLIP (optional second evidence source).
        clip_scene_scores = {}
        clip_analysis = None
        if self.use_clip:
            try:
                # Grab the original (pre-annotation) image from the result.
                original_image = detection_result.orig_img

                # Use CLIP to analyze the image
                clip_analysis = self.clip_analyzer.analyze_image(original_image)

                # Get CLIP's per-scene-type scores.
                clip_scene_scores = clip_analysis.get("scene_scores", {})

                if "asian_commercial_street" in clip_scene_scores and clip_scene_scores["asian_commercial_street"] > 0.2:
                    # Use contrastive prompts to further separate indoor vs
                    # outdoor interpretations of the image.
                    comparative_results = self.clip_analyzer.calculate_similarity(
                        original_image,
                        self.clip_analyzer.comparative_prompts["indoor_vs_outdoor"]
                    )

                    # Aggregate the contrastive similarities by keyword.
                    indoor_score = sum(s for p, s in comparative_results.items() if "indoor" in p or "enclosed" in p)
                    outdoor_score = sum(s for p, s in comparative_results.items() if "outdoor" in p or "open-air" in p)

                    # If CLIP believes this is an outdoor scene while the
                    # lighting analysis claims indoor, trust CLIP and
                    # override the lighting result (mutates lighting_info
                    # in place — callers see the corrected dict).
                    if outdoor_score > indoor_score and lighting_info and lighting_info.get("is_indoor", False):
                        # Correct the lighting analysis result.
                        print(f"CLIP indicates outdoor commercial street (score: {outdoor_score:.2f} vs {indoor_score:.2f}), adjusting lighting analysis")
                        lighting_info["is_indoor"] = False
                        lighting_info["indoor_probability"] = 0.3
                        # Record the CLIP override in the lighting diagnostics.
                        if "diagnostics" not in lighting_info:
                            lighting_info["diagnostics"] = {}
                        lighting_info["diagnostics"]["clip_override"] = {
                            "reason": "CLIP detected outdoor commercial street",
                            "outdoor_score": float(outdoor_score),
                            "indoor_score": float(indoor_score)
                        }

                # If CLIP detected a lighting condition and no lighting_info
                # was provided, synthesize one from the CLIP output.
                if not lighting_info and "lighting_condition" in clip_analysis:
                    lighting_type, lighting_conf = clip_analysis["lighting_condition"]
                    lighting_info = {
                        "time_of_day": lighting_type,
                        "confidence": lighting_conf
                    }
            except Exception as e:
                # Best-effort: CLIP failure must not abort the YOLO-based analysis.
                print(f"Error in CLIP analysis: {e}")

        # Fuse the YOLO- and CLIP-derived scene scores.
        scene_scores = self._fuse_scene_scores(yolo_scene_scores, clip_scene_scores)

        # Determine best matching scene type
        best_scene, scene_confidence = self._determine_scene_type(scene_scores)

        # Generate possible activities based on scene
        activities = self.descriptor._infer_possible_activities(best_scene, detected_objects)

        # Identify potential safety concerns
        safety_concerns = self.descriptor._identify_safety_concerns(detected_objects, best_scene)

        # Calculate functional zones
        functional_zones = self.spatial_analyzer._identify_functional_zones(detected_objects, best_scene)

        # Generate scene description
        scene_description = self.generate_scene_description(
            best_scene,
            detected_objects,
            scene_confidence,
            lighting_info=lighting_info,
            functional_zones=functional_zones
        )

        # Return comprehensive analysis. Scene type/name fall back to
        # "unknown" when confidence is below scene_confidence_threshold.
        result = {
            "scene_type": best_scene if scene_confidence >= scene_confidence_threshold else "unknown",
            "scene_name": self.SCENE_TYPES.get(best_scene, {}).get("name", "Unknown")
                if scene_confidence >= scene_confidence_threshold else "Unknown Scene",
            "confidence": scene_confidence,
            "description": scene_description,
            "objects_present": [
                {"class_id": obj["class_id"],
                 "class_name": obj["class_name"],
                 "confidence": obj["confidence"]}
                for obj in detected_objects
            ],
            "object_count": len(detected_objects),
            "regions": region_analysis,
            "possible_activities": activities,
            "safety_concerns": safety_concerns,
            "functional_zones": functional_zones,
            "alternative_scenes": self.descriptor._get_alternative_scenes(scene_scores, scene_confidence_threshold, top_k=2),
            "lighting_conditions": lighting_info or {"time_of_day": "unknown", "confidence": 0}
        }

        # Attach CLIP-specific results when CLIP ran successfully.
        if clip_analysis and "error" not in clip_analysis:
            result["clip_analysis"] = {
                "top_scene": clip_analysis.get("top_scene", ("unknown", 0.0)),
                "cultural_analysis": clip_analysis.get("cultural_analysis", {})
            }

        return result

    def _compute_scene_scores(self, detected_objects: List[Dict]) -> Dict[str, float]:
        """
        Compute confidence scores for each scene type based on detected objects.

        Scoring: 70% from the fraction of required objects present, 30% from
        the fraction of optional objects present, plus up to 15% bonus for
        repeated key objects; scaled by the scene's optional "priority".

        Args:
            detected_objects: List of detected objects

        Returns:
            Dictionary mapping scene types to confidence scores
        """
        scene_scores = {}
        detected_class_ids = [obj["class_id"] for obj in detected_objects]
        detected_classes_set = set(detected_class_ids)

        # Count occurrence of each class
        class_counts = {}
        for obj in detected_objects:
            class_id = obj["class_id"]
            if class_id not in class_counts:
                class_counts[class_id] = 0
            class_counts[class_id] += 1

        # Evaluate each scene type
        for scene_type, scene_def in self.SCENE_TYPES.items():
            # Count required objects present
            required_objects = set(scene_def["required_objects"])
            required_present = required_objects.intersection(detected_classes_set)

            # Count optional objects present
            optional_objects = set(scene_def["optional_objects"])
            optional_present = optional_objects.intersection(detected_classes_set)

            # Skip if minimum required objects aren't present
            if len(required_present) < scene_def["minimum_required"]:
                scene_scores[scene_type] = 0
                continue

            # Base score from required objects
            required_ratio = len(required_present) / max(1, len(required_objects))
            required_score = required_ratio * 0.7  # 70% of score from required objects

            # Additional score from optional objects
            optional_ratio = len(optional_present) / max(1, len(optional_objects))
            optional_score = optional_ratio * 0.3  # 30% of score from optional objects

            # Bonus for having multiple instances of key objects
            multiple_bonus = 0.0
            for class_id in required_present:
                if class_counts.get(class_id, 0) > 1:
                    multiple_bonus += 0.05  # 5% bonus per additional key object type

            # Cap the bonus at 15%
            multiple_bonus = min(0.15, multiple_bonus)

            # Calculate final score
            final_score = required_score + optional_score + multiple_bonus

            # Optional per-scene weight multiplier from the scene definition.
            if "priority" in scene_def:
                final_score *= scene_def["priority"]

            # Normalize to 0-1 range
            scene_scores[scene_type] = min(1.0, final_score)

        return scene_scores

    def _determine_scene_type(self, scene_scores: Dict[str, float]) -> Tuple[str, float]:
        """
        Determine the most likely scene type based on scores.

        Args:
            scene_scores: Dictionary mapping scene types to confidence scores

        Returns:
            Tuple of (best_scene_type, confidence)
        """
        if not scene_scores:
            return "unknown", 0

        # Find scene with highest score
        best_scene = max(scene_scores, key=scene_scores.get)
        best_score = scene_scores[best_scene]

        return best_scene, best_score


    def _fuse_scene_scores(self, yolo_scene_scores: Dict[str, float], clip_scene_scores: Dict[str, float]) -> Dict[str, float]:
        """
        Fuse scene scores derived from YOLO object detection and CLIP analysis.

        Args:
            yolo_scene_scores: scene scores based on YOLO object detection
            clip_scene_scores: scene scores based on CLIP analysis

        Returns:
            Dict: fused scene scores
        """
        # If there are no CLIP scores, just return the YOLO scores.
        if not clip_scene_scores:
            return yolo_scene_scores

        # If there are no YOLO scores, just return the CLIP scores.
        if not yolo_scene_scores:
            return clip_scene_scores

        # Fuse the two score sets with per-scene-type weights.
        fused_scores = {}

        # Union of every scene type seen by either model.
        all_scene_types = set(list(yolo_scene_scores.keys()) + list(clip_scene_scores.keys()))

        for scene_type in all_scene_types:
            # Scores from each model (0.0 when the model did not score it).
            yolo_score = yolo_scene_scores.get(scene_type, 0.0)
            clip_score = clip_scene_scores.get(scene_type, 0.0)

            # Default weights.
            yolo_weight = 0.7  # YOLO provides more detailed object information
            clip_weight = 0.3  # CLIP provides better holistic scene understanding

            # Adjust weights for specific scene families:
            # culture-specific or unusually-laid-out scenes favor CLIP.
            if any(keyword in scene_type for keyword in ["asian", "cultural", "aerial"]):
                yolo_weight = 0.3
                clip_weight = 0.7

            # Indoor/home scenes: object detection is usually more accurate.
            elif any(keyword in scene_type for keyword in ["room", "kitchen", "office", "bedroom"]):
                yolo_weight = 0.8
                clip_weight = 0.2
            elif scene_type == "beach_water_recreation":
                yolo_weight = 0.8  # detecting specific items (e.g. surfboards) matters a lot
                clip_weight = 0.2
            elif scene_type == "sports_venue":
                yolo_weight = 0.7
                clip_weight = 0.3
            elif scene_type == "professional_kitchen":
                yolo_weight = 0.8  # detecting kitchen utensils matters a lot
                clip_weight = 0.2

            # Weighted combination.
            fused_scores[scene_type] = (yolo_score * yolo_weight) + (clip_score * clip_weight)

        return fused_scores
|
scene_description.py
ADDED
@@ -0,0 +1,401 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import json
|
3 |
+
from typing import Dict, List, Tuple, Any, Optional
|
4 |
+
|
5 |
+
from scene_type import SCENE_TYPES
|
6 |
+
from scene_detail_templates import SCENE_DETAIL_TEMPLATES
|
7 |
+
from object_template_fillers import OBJECT_TEMPLATE_FILLERS
|
8 |
+
from activity_templates import ACTIVITY_TEMPLATES
|
9 |
+
from safety_templates import SAFETY_TEMPLATES
|
10 |
+
from confifence_templates import CONFIDENCE_TEMPLATES
|
11 |
+
|
12 |
+
class SceneDescriptor:
|
13 |
+
"""
|
14 |
+
Generates natural language descriptions of scenes.
|
15 |
+
Handles scene descriptions, activity inference, and safety concerns identification.
|
16 |
+
"""
|
17 |
+
|
18 |
+
def __init__(self, scene_types=None, object_categories=None):
|
19 |
+
"""
|
20 |
+
Initialize the scene descriptor
|
21 |
+
|
22 |
+
Args:
|
23 |
+
scene_types: Dictionary of scene type definitions
|
24 |
+
"""
|
25 |
+
self.scene_types = scene_types or {}
|
26 |
+
self.SCENE_TYPES = scene_types or {}
|
27 |
+
|
28 |
+
if object_categories:
|
29 |
+
self.OBJECT_CATEGORIES = object_categories
|
30 |
+
else:
|
31 |
+
# 從 JSON 加載或使用默認值
|
32 |
+
self.OBJECT_CATEGORIES = self._load_json_data("object_categories") or {
|
33 |
+
"furniture": [56, 57, 58, 59, 60, 61],
|
34 |
+
"electronics": [62, 63, 64, 65, 66, 67, 68, 69, 70],
|
35 |
+
"kitchen_items": [39, 40, 41, 42, 43, 44, 45],
|
36 |
+
"food": [46, 47, 48, 49, 50, 51, 52, 53, 54, 55],
|
37 |
+
"vehicles": [1, 2, 3, 4, 5, 6, 7, 8],
|
38 |
+
"personal_items": [24, 25, 26, 27, 28, 73, 78, 79]
|
39 |
+
}
|
40 |
+
|
41 |
+
# 加載所有模板數據
|
42 |
+
self._load_templates()
|
43 |
+
|
44 |
+
def _load_templates(self):
|
45 |
+
"""Load all template data from script or fallback to imported defaults"""
|
46 |
+
self.confidence_templates = CONFIDENCE_TEMPLATES
|
47 |
+
self.scene_detail_templates = SCENE_DETAIL_TEMPLATES
|
48 |
+
self.object_template_fillers = OBJECT_TEMPLATE_FILLERS
|
49 |
+
self.safety_templates = SAFETY_TEMPLATES
|
50 |
+
self.activity_templates = ACTIVITY_TEMPLATES
|
51 |
+
|
52 |
+
|
53 |
+
def _initialize_fallback_templates(self):
|
54 |
+
"""Initialize fallback templates when no external data is available"""
|
55 |
+
# 只在無法從文件或導入加載時使用
|
56 |
+
self.confidence_templates = {
|
57 |
+
"high": "{description} {details}",
|
58 |
+
"medium": "This appears to be {description} {details}",
|
59 |
+
"low": "This might be {description}, but the confidence is low. {details}"
|
60 |
+
}
|
61 |
+
|
62 |
+
# 僅提供最基本的模板作為後備
|
63 |
+
self.scene_detail_templates = {
|
64 |
+
"default": ["A space with various objects."]
|
65 |
+
}
|
66 |
+
|
67 |
+
self.object_template_fillers = {
|
68 |
+
"default": ["various items"]
|
69 |
+
}
|
70 |
+
|
71 |
+
self.safety_templates = {
|
72 |
+
"general": "Pay attention to {safety_element}."
|
73 |
+
}
|
74 |
+
|
75 |
+
self.activity_templates = {
|
76 |
+
"default": ["General activity"]
|
77 |
+
}
|
78 |
+
|
79 |
+
def _get_alternative_scenes(self, scene_scores: Dict[str, float],
|
80 |
+
threshold: float, top_k: int = 2) -> List[Dict]:
|
81 |
+
"""
|
82 |
+
Get alternative scene interpretations with their scores.
|
83 |
+
|
84 |
+
Args:
|
85 |
+
scene_scores: Dictionary of scene type scores
|
86 |
+
threshold: Minimum confidence threshold
|
87 |
+
top_k: Number of alternatives to return
|
88 |
+
|
89 |
+
Returns:
|
90 |
+
List of dictionaries with alternative scenes
|
91 |
+
"""
|
92 |
+
# Sort scenes by score in descending order
|
93 |
+
sorted_scenes = sorted(scene_scores.items(), key=lambda x: x[1], reverse=True)
|
94 |
+
|
95 |
+
# Skip the first one (best match) and take the next top_k
|
96 |
+
alternatives = []
|
97 |
+
for scene_type, score in sorted_scenes[1:1+top_k]:
|
98 |
+
if score >= threshold:
|
99 |
+
alternatives.append({
|
100 |
+
"type": scene_type,
|
101 |
+
"name": self.SCENE_TYPES.get(scene_type, {}).get("name", "Unknown"),
|
102 |
+
"confidence": score
|
103 |
+
})
|
104 |
+
|
105 |
+
return alternatives
|
106 |
+
|
107 |
+
|
108 |
+
def _infer_possible_activities(self, scene_type: str, detected_objects: List[Dict]) -> List[str]:
|
109 |
+
"""
|
110 |
+
Infer possible activities based on scene type and detected objects.
|
111 |
+
|
112 |
+
Args:
|
113 |
+
scene_type: Identified scene type
|
114 |
+
detected_objects: List of detected objects
|
115 |
+
|
116 |
+
Returns:
|
117 |
+
List of possible activities
|
118 |
+
"""
|
119 |
+
activities = []
|
120 |
+
|
121 |
+
if scene_type.startswith("aerial_view_"):
|
122 |
+
if scene_type == "aerial_view_intersection":
|
123 |
+
# 使用預定義的十字路口活動
|
124 |
+
activities.extend(self.activity_templates.get("aerial_view_intersection", []))
|
125 |
+
|
126 |
+
# 添加與行人和車輛相關的特定活動
|
127 |
+
pedestrians = [obj for obj in detected_objects if obj["class_id"] == 0]
|
128 |
+
vehicles = [obj for obj in detected_objects if obj["class_id"] in [2, 5, 7]] # Car, bus, truck
|
129 |
+
|
130 |
+
if pedestrians and vehicles:
|
131 |
+
activities.append("Waiting for an opportunity to cross the street")
|
132 |
+
activities.append("Obeying traffic signals")
|
133 |
+
|
134 |
+
elif scene_type == "aerial_view_commercial_area":
|
135 |
+
activities.extend(self.activity_templates.get("aerial_view_commercial_area", []))
|
136 |
+
|
137 |
+
elif scene_type == "aerial_view_plaza":
|
138 |
+
activities.extend(self.activity_templates.get("aerial_view_plaza", []))
|
139 |
+
|
140 |
+
else:
|
141 |
+
# 處理其他未明確定義的空中視角場景
|
142 |
+
aerial_activities = [
|
143 |
+
"Street crossing",
|
144 |
+
"Waiting for signals",
|
145 |
+
"Following traffic rules",
|
146 |
+
"Pedestrian movement"
|
147 |
+
]
|
148 |
+
activities.extend(aerial_activities)
|
149 |
+
|
150 |
+
if scene_type in self.activity_templates:
|
151 |
+
activities.extend(self.activity_templates[scene_type])
|
152 |
+
elif "default" in self.activity_templates:
|
153 |
+
activities.extend(self.activity_templates["default"])
|
154 |
+
|
155 |
+
detected_class_ids = [obj["class_id"] for obj in detected_objects]
|
156 |
+
|
157 |
+
# Add activities based on specific object combinations
|
158 |
+
if 62 in detected_class_ids and 57 in detected_class_ids: # TV and sofa
|
159 |
+
activities.append("Watching shows or movies")
|
160 |
+
|
161 |
+
if 63 in detected_class_ids: # laptop
|
162 |
+
activities.append("Using a computer/laptop")
|
163 |
+
|
164 |
+
if 67 in detected_class_ids: # cell phone
|
165 |
+
activities.append("Using a mobile phone")
|
166 |
+
|
167 |
+
if 73 in detected_class_ids: # book
|
168 |
+
activities.append("Reading")
|
169 |
+
|
170 |
+
if any(food_id in detected_class_ids for food_id in [46, 47, 48, 49, 50, 51, 52, 53, 54, 55]):
|
171 |
+
activities.append("Eating or preparing food")
|
172 |
+
|
173 |
+
# Person-specific activities
|
174 |
+
if 0 in detected_class_ids: # Person
|
175 |
+
if any(vehicle in detected_class_ids for vehicle in [1, 2, 3, 5, 7]): # Vehicles
|
176 |
+
activities.append("Commuting or traveling")
|
177 |
+
|
178 |
+
if 16 in detected_class_ids: # Dog
|
179 |
+
activities.append("Walking a dog")
|
180 |
+
|
181 |
+
if 24 in detected_class_ids or 26 in detected_class_ids: # Backpack or handbag
|
182 |
+
activities.append("Carrying personal items")
|
183 |
+
|
184 |
+
# Remove duplicates
|
185 |
+
return list(set(activities))
|
186 |
+
|
187 |
+
def _identify_safety_concerns(self, detected_objects: List[Dict], scene_type: str) -> List[str]:
|
188 |
+
"""
|
189 |
+
Identify potential safety concerns based on objects and scene type.
|
190 |
+
|
191 |
+
Args:
|
192 |
+
detected_objects: List of detected objects
|
193 |
+
scene_type: Identified scene type
|
194 |
+
|
195 |
+
Returns:
|
196 |
+
List of potential safety concerns
|
197 |
+
"""
|
198 |
+
concerns = []
|
199 |
+
detected_class_ids = [obj["class_id"] for obj in detected_objects]
|
200 |
+
|
201 |
+
# ORIGINAL SAFETY CONCERNS LOGIC
|
202 |
+
|
203 |
+
# General safety concerns
|
204 |
+
if 42 in detected_class_ids or 43 in detected_class_ids: # Fork or knife
|
205 |
+
concerns.append("Sharp utensils present")
|
206 |
+
|
207 |
+
if 76 in detected_class_ids: # Scissors
|
208 |
+
concerns.append("Cutting tools present")
|
209 |
+
|
210 |
+
# Traffic-related concerns
|
211 |
+
if scene_type in ["city_street", "parking_lot"]:
|
212 |
+
if 0 in detected_class_ids: # Person
|
213 |
+
if any(vehicle in detected_class_ids for vehicle in [2, 3, 5, 7, 8]): # Vehicles
|
214 |
+
concerns.append("Pedestrians near vehicles")
|
215 |
+
|
216 |
+
if 9 in detected_class_ids: # Traffic light
|
217 |
+
concerns.append("Monitor traffic signals")
|
218 |
+
|
219 |
+
# Identify crowded scenes
|
220 |
+
person_count = detected_class_ids.count(0)
|
221 |
+
if person_count > 5:
|
222 |
+
concerns.append(f"Crowded area with multiple people ({person_count})")
|
223 |
+
|
224 |
+
# Scene-specific concerns
|
225 |
+
if scene_type == "kitchen":
|
226 |
+
if 68 in detected_class_ids or 69 in detected_class_ids: # Microwave or oven
|
227 |
+
concerns.append("Hot cooking equipment")
|
228 |
+
|
229 |
+
# Potentially unstable objects
|
230 |
+
for obj in detected_objects:
|
231 |
+
if obj["class_id"] in [39, 40, 41, 45]: # Bottle, wine glass, cup, bowl
|
232 |
+
if obj["region"] in ["top_left", "top_center", "top_right"] and obj["normalized_area"] > 0.05:
|
233 |
+
concerns.append(f"Elevated {obj['class_name']} might be unstable")
|
234 |
+
|
235 |
+
# NEW SAFETY CONCERNS LOGIC FOR ADDITIONAL SCENE TYPES
|
236 |
+
|
237 |
+
# Upscale dining safety concerns
|
238 |
+
if scene_type == "upscale_dining":
|
239 |
+
# Check for fragile items
|
240 |
+
if 40 in detected_class_ids: # Wine glass
|
241 |
+
concerns.append("Fragile glassware present")
|
242 |
+
|
243 |
+
# Check for lit candles (can't directly detect but can infer from context)
|
244 |
+
# Look for small bright spots that might be candles
|
245 |
+
if any(obj["class_id"] == 41 for obj in detected_objects): # Cup (which might include candle holders)
|
246 |
+
# We can't reliably detect candles, but if the scene appears to be formal dining,
|
247 |
+
# we can suggest this as a possibility
|
248 |
+
concerns.append("Possible lit candles or decorative items requiring care")
|
249 |
+
|
250 |
+
# Check for overcrowded table
|
251 |
+
table_objs = [obj for obj in detected_objects if obj["class_id"] == 60] # Dining table
|
252 |
+
if table_objs:
|
253 |
+
table_region = table_objs[0]["region"]
|
254 |
+
items_on_table = 0
|
255 |
+
|
256 |
+
for obj in detected_objects:
|
257 |
+
if obj["class_id"] in [39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55]:
|
258 |
+
if obj["region"] == table_region:
|
259 |
+
items_on_table += 1
|
260 |
+
|
261 |
+
if items_on_table > 8:
|
262 |
+
concerns.append("Dining table has multiple items which should be handled with care")
|
263 |
+
|
264 |
+
# Asian commercial street safety concerns
|
265 |
+
elif scene_type == "asian_commercial_street":
|
266 |
+
# Check for crowded walkways
|
267 |
+
if 0 in detected_class_ids: # Person
|
268 |
+
person_count = detected_class_ids.count(0)
|
269 |
+
if person_count > 3:
|
270 |
+
# Calculate person density (simplified)
|
271 |
+
person_positions = []
|
272 |
+
for obj in detected_objects:
|
273 |
+
if obj["class_id"] == 0:
|
274 |
+
person_positions.append(obj["normalized_center"])
|
275 |
+
|
276 |
+
if len(person_positions) >= 2:
|
277 |
+
# Calculate average distance between people
|
278 |
+
total_distance = 0
|
279 |
+
count = 0
|
280 |
+
for i in range(len(person_positions)):
|
281 |
+
for j in range(i+1, len(person_positions)):
|
282 |
+
p1 = person_positions[i]
|
283 |
+
p2 = person_positions[j]
|
284 |
+
distance = ((p2[0] - p1[0])**2 + (p2[1] - p1[1])**2)**0.5
|
285 |
+
total_distance += distance
|
286 |
+
count += 1
|
287 |
+
|
288 |
+
if count > 0:
|
289 |
+
avg_distance = total_distance / count
|
290 |
+
if avg_distance < 0.1: # Close proximity
|
291 |
+
concerns.append("Crowded walkway with limited personal space")
|
292 |
+
|
293 |
+
# Check for motorcycles/bicycles near pedestrians
|
294 |
+
if (1 in detected_class_ids or 3 in detected_class_ids) and 0 in detected_class_ids: # Bicycle/motorcycle and person
|
295 |
+
concerns.append("Two-wheeled vehicles in pedestrian areas")
|
296 |
+
|
297 |
+
# Check for potential trip hazards
|
298 |
+
# We can't directly detect this, but can infer from context
|
299 |
+
if scene_type == "asian_commercial_street" and "bottom" in " ".join([obj["region"] for obj in detected_objects if obj["class_id"] == 0]):
|
300 |
+
# If people are in bottom regions, they might be walking on uneven surfaces
|
301 |
+
concerns.append("Potential uneven walking surfaces in commercial area")
|
302 |
+
|
303 |
+
# Financial district safety concerns
|
304 |
+
elif scene_type == "financial_district":
|
305 |
+
# Check for heavy traffic conditions
|
306 |
+
vehicle_count = sum(1 for obj_id in detected_class_ids if obj_id in [2, 5, 7]) # Car, bus, truck
|
307 |
+
if vehicle_count > 5:
|
308 |
+
concerns.append("Heavy vehicle traffic in urban area")
|
309 |
+
|
310 |
+
# Check for pedestrians crossing busy streets
|
311 |
+
if 0 in detected_class_ids: # Person
|
312 |
+
person_count = detected_class_ids.count(0)
|
313 |
+
vehicle_nearby = any(vehicle in detected_class_ids for vehicle in [2, 3, 5, 7])
|
314 |
+
|
315 |
+
if person_count > 0 and vehicle_nearby:
|
316 |
+
concerns.append("Pedestrians navigating busy urban traffic")
|
317 |
+
|
318 |
+
# Check for traffic signals
|
319 |
+
if 9 in detected_class_ids: # Traffic light
|
320 |
+
concerns.append("Observe traffic signals when navigating this area")
|
321 |
+
else:
|
322 |
+
# If no traffic lights detected but it's a busy area, it's worth noting
|
323 |
+
if vehicle_count > 3:
|
324 |
+
concerns.append("Busy traffic area potentially without visible traffic signals in view")
|
325 |
+
|
326 |
+
# Time of day considerations
|
327 |
+
# We don't have direct time data, but can infer from vehicle lights
|
328 |
+
vehicle_objs = [obj for obj in detected_objects if obj["class_id"] in [2, 5, 7]]
|
329 |
+
if vehicle_objs and any("lighting_conditions" in obj for obj in detected_objects):
|
330 |
+
# If vehicles are present and it might be evening/night
|
331 |
+
concerns.append("Reduced visibility conditions during evening commute")
|
332 |
+
|
333 |
+
# Urban intersection safety concerns
|
334 |
+
elif scene_type == "urban_intersection":
|
335 |
+
# Check for pedestrians in crosswalks
|
336 |
+
pedestrian_objs = [obj for obj in detected_objects if obj["class_id"] == 0]
|
337 |
+
vehicle_objs = [obj for obj in detected_objects if obj["class_id"] in [2, 3, 5, 7]]
|
338 |
+
|
339 |
+
if pedestrian_objs:
|
340 |
+
# Calculate distribution of pedestrians to see if they're crossing
|
341 |
+
pedestrian_positions = [obj["normalized_center"] for obj in pedestrian_objs]
|
342 |
+
|
343 |
+
# Simplified check for pedestrians in crossing pattern
|
344 |
+
if len(pedestrian_positions) >= 3:
|
345 |
+
# Check if pedestrians are distributed across different regions
|
346 |
+
pedestrian_regions = set(obj["region"] for obj in pedestrian_objs)
|
347 |
+
if len(pedestrian_regions) >= 2:
|
348 |
+
concerns.append("Multiple pedestrians crossing the intersection")
|
349 |
+
|
350 |
+
# Check for traffic signal observation
|
351 |
+
if 9 in detected_class_ids: # Traffic light
|
352 |
+
concerns.append("Observe traffic signals when crossing")
|
353 |
+
|
354 |
+
# Check for busy intersection
|
355 |
+
if len(vehicle_objs) > 3:
|
356 |
+
concerns.append("Busy intersection with multiple vehicles")
|
357 |
+
|
358 |
+
# Check for pedestrians potentially jay-walking
|
359 |
+
if pedestrian_objs and not 9 in detected_class_ids: # People but no traffic lights
|
360 |
+
concerns.append("Pedestrians should use designated crosswalks")
|
361 |
+
|
362 |
+
# Visibility concerns based on lighting
|
363 |
+
# This would be better with actual lighting data
|
364 |
+
pedestrian_count = len(pedestrian_objs)
|
365 |
+
if pedestrian_count > 5:
|
366 |
+
concerns.append("High pedestrian density at crossing points")
|
367 |
+
|
368 |
+
# Transit hub safety concerns
|
369 |
+
elif scene_type == "transit_hub":
|
370 |
+
# These would be for transit areas like train stations or bus terminals
|
371 |
+
if 0 in detected_class_ids: # Person
|
372 |
+
person_count = detected_class_ids.count(0)
|
373 |
+
if person_count > 8:
|
374 |
+
concerns.append("Crowded transit area requiring careful navigation")
|
375 |
+
|
376 |
+
# Check for luggage/bags that could be trip hazards
|
377 |
+
if 24 in detected_class_ids or 28 in detected_class_ids: # Backpack or suitcase
|
378 |
+
concerns.append("Luggage and personal items may create obstacles")
|
379 |
+
|
380 |
+
# Public transportation vehicles
|
381 |
+
if any(vehicle in detected_class_ids for vehicle in [5, 6, 7]): # Bus, train, truck
|
382 |
+
concerns.append("Stay clear of arriving and departing transit vehicles")
|
383 |
+
|
384 |
+
# Shopping district safety concerns
|
385 |
+
elif scene_type == "shopping_district":
|
386 |
+
# Check for crowded shopping areas
|
387 |
+
if 0 in detected_class_ids: # Person
|
388 |
+
person_count = detected_class_ids.count(0)
|
389 |
+
if person_count > 5:
|
390 |
+
concerns.append("Crowded shopping area with multiple people")
|
391 |
+
|
392 |
+
# Check for shopping bags and personal items
|
393 |
+
if 24 in detected_class_ids or 26 in detected_class_ids: # Backpack or handbag
|
394 |
+
concerns.append("Mind personal belongings in busy retail environment")
|
395 |
+
|
396 |
+
# Check for store entrances/exits which might have automatic doors
|
397 |
+
# We can't directly detect this, but can infer from context
|
398 |
+
if scene_type == "shopping_district" and 0 in detected_class_ids:
|
399 |
+
concerns.append("Be aware of store entrances and exits with potential automatic doors")
|
400 |
+
|
401 |
+
return concerns
|
scene_detail_templates.py
ADDED
@@ -0,0 +1,203 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
# Natural-language sentence templates keyed by scene type.
# Each scene type maps to a list of alternative sentence skeletons; the
# {placeholder} slots are filled in elsewhere (presumably from the object
# template filler tables -- confirm in the scene description generator).
SCENE_DETAIL_TEMPLATES = {
    # --- Indoor home / office scenes ---
    "living_room": [
        "The space is arranged for relaxation with {furniture}.",
        "There is {electronics} for entertainment.",
        "The room has a seating area with {seating}."
    ],
    "bedroom": [
        "The room contains {bed_type} in the {bed_location}.",
        "This sleeping area has {bed_description}.",
        "A personal space with {bed_type} and {extras}."
    ],
    "dining_area": [
        "A space set up for meals with {table_setup}.",
        "The dining area contains {table_description}.",
        "A place for eating with {dining_items}."
    ],
    "kitchen": [
        "A food preparation area with {appliances}.",
        "The kitchen contains {kitchen_items}.",
        "A cooking space equipped with {cooking_equipment}."
    ],
    "office_workspace": [
        "A work environment with {office_equipment}.",
        "A space designed for productivity with {desk_setup}.",
        "A workspace containing {computer_equipment}."
    ],
    # --- Street / urban scenes ---
    "city_street": [
        "An urban thoroughfare with {traffic_description}.",
        "A street scene with {people_and_vehicles}.",
        "A city path with {street_elements}."
    ],
    "park_area": [
        "An outdoor recreational space with {park_features}.",
        "A leisure area featuring {outdoor_elements}.",
        "A public outdoor space with {park_description}."
    ],
    "retail_store": [
        "A shopping environment with {store_elements}.",
        "A commercial space where {shopping_activity}.",
        "A retail area containing {store_items}."
    ],
    "upscale_dining": [
        "The space features {furniture} with {design_elements} for an elegant dining experience.",
        "This sophisticated dining area includes {lighting} illuminating {table_setup}.",
        "A stylish dining environment with {seating} arranged around {table_description}."
    ],
    "asian_commercial_street": [
        "A vibrant street lined with {storefront_features} and filled with {pedestrian_flow}.",
        "This urban commercial area displays {asian_elements} with {cultural_elements}.",
        "A lively shopping street characterized by {signage} and busy with {street_activities}."
    ],
    "financial_district": [
        "A canyon of {buildings} with {traffic_elements} moving through the urban landscape.",
        "This business district features {skyscrapers} along {road_features}.",
        "A downtown corridor with {architectural_elements} framing views of {city_landmarks}."
    ],
    "urban_intersection": [
        "A busy crossroad with {crossing_pattern} where {pedestrian_behavior} is observed.",
        "This urban junction features {pedestrian_density} navigating the {traffic_pattern}.",
        "A well-marked intersection designed for {pedestrian_flow} across multiple directions."
    ],
    "transit_hub": [
        "A transportation nexus where {transit_vehicles} arrive and depart amid {passenger_activity}.",
        "This transit center accommodates {transportation_modes} with facilities for {passenger_needs}.",
        "A busy transport hub featuring {transit_infrastructure} and areas for {passenger_movement}."
    ],
    "shopping_district": [
        "A commercial zone filled with {retail_elements} and {shopping_activity}.",
        "This shopping area features {store_types} along {walkway_features}.",
        "A retail district characterized by {commercial_signage} and {consumer_behavior}."
    ],
    # --- Transit and recreation scenes (fixed wording, no placeholders in some) ---
    "bus_stop": [
        "Passengers waiting at a roadside stop served by {transit_vehicles}.",
        "A designated bus stop with shelters and {passenger_activity}.",
        "Commuters boarding or alighting from {transit_vehicles} at the curb."
    ],
    "bus_station": [
        "Multiple buses parked in a terminal where {passenger_activity}.",
        "A busy station hub featuring {transit_vehicles} and traveler luggage.",
        "A transit center with waiting areas and various {transportation_modes}."
    ],
    "zoo": [
        "Enclosures showcasing elephants, zebras, and giraffes with visitors observing.",
        "A wildlife exhibit area where families watch animal displays.",
        "A recreational space featuring large animal exhibits and strolling guests."
    ],
    "harbor": [
        "Boats docked along the waterfront with nearby vehicular traffic.",
        "A maritime area where vessels anchor beside roads busy with cars and motorcycles.",
        "A coastal dock featuring moored boats and passing traffic elements."
    ],
    "playground": [
        "An open play area equipped with balls and recreational structures.",
        "People engaging in games and sports in a communal space.",
        "A leisure area featuring playground equipment and active participants."
    ],
    "sports_field": [
        "An athletic field marked for various ball games and matches.",
        "Players using equipment like bats, gloves, and rackets on a grassy pitch.",
        "A designated sports area with goalposts or markings for competitive play."
    ],
    "narrow_commercial_alley": [
        "A tight alley lined with {storefront_features} and light vehicles.",
        "Pedestrians navigate a confined lane flanked by shops and {street_activities}.",
        "An urban passage featuring {storefront_features} with {people_and_vehicles}."
    ],
    "daytime_shopping_street": [
        "A bustling street during daytime with {storefront_features} and {pedestrian_flow}.",
        "Shoppers and vehicles move along a retail strip marked by {signage}.",
        "An open commercial avenue filled with {people_and_vehicles} amid shops."
    ],
    "urban_pedestrian_crossing": [
        "A marked crosswalk with {crossing_pattern} under {lighting_modifier} sky.",
        "Pedestrians use designated crossing with {traffic_pattern} at the intersection.",
        "People waiting at a signal-controlled crossing next to {street_elements}."
    ],
    # --- Aerial (bird's-eye) views ---
    "aerial_view_intersection": [
        "The crossing pattern shows {crossing_pattern} with {pedestrian_flow} across multiple directions.",
        "From above, this intersection reveals {traffic_pattern} with {pedestrian_density} navigating through defined paths.",
        "This bird's-eye view shows {street_elements} converging at a junction where {pedestrian_behavior} is visible."
    ],
    "aerial_view_commercial_area": [
        "From above, this commercial zone shows {storefront_features} with {pedestrian_flow} moving between establishments.",
        "This overhead view reveals {shopping_activity} amid {walkway_features} connecting different businesses.",
        "The aerial perspective captures {retail_elements} organized along {commercial_layout} with visible customer activity."
    ],
    "aerial_view_plaza": [
        "This overhead view of the plaza shows {pedestrian_pattern} across an open public space.",
        "From above, the plaza reveals {gathering_features} where people congregate in {movement_pattern}.",
        "The aerial perspective captures {urban_elements} arranged around a central area where {public_activity} occurs."
    ],
    # --- Cultural and night scenes ---
    "asian_night_market": [
        "This bustling night market features {stall_elements} illuminated by {lighting_features} with crowds enjoying {food_elements}.",
        "Rows of {vendor_stalls} line this vibrant market where {nighttime_activity} continues under {cultural_lighting}.",
        "The market atmosphere is created by {asian_elements} and {night_market_sounds} amid {evening_crowd_behavior}."
    ],
    "asian_temple_area": [
        "This sacred space features {architectural_elements} displaying {cultural_symbols} with visitors engaging in {ritual_activities}.",
        "The temple area contains {religious_structures} adorned with {decorative_features} where people practice {cultural_practices}.",
        "Traditional {temple_architecture} creates a spiritual atmosphere enhanced by {sensory_elements} and {visitor_activities}."
    ],
    "european_plaza": [
        "This historic plaza is framed by {architectural_style} surrounding an open space where {public_activities} take place.",
        "The European square features {historic_elements} and {urban_design} creating a space for {social_behaviors}.",
        "Classical {european_features} define this public space where {tourist_activities} blend with {local_customs}."
    ],
    "nighttime_street": [
        "The night transforms this street with {lighting_effects} casting {shadow_patterns} across {urban_features}.",
        "After dark, this urban corridor is defined by {illuminated_elements} with {evening_activities} visible in the artificial light.",
        "The nocturnal street scene captures {light_sources} creating contrast between {lit_areas} and {shadowed_zones}."
    ],
    "nighttime_commercial_district": [
        "After sunset, this commercial area comes alive with {illuminated_signage} and {evening_activities} under {colorful_lighting}.",
        "The district's nighttime character is defined by {neon_elements} highlighting {storefront_features} amid {night_crowd_behavior}.",
        "Evening transforms this zone through {light_displays} that accentuate {building_features} and frame {nightlife_activities}."
    ],
    # --- Mixed / specialized venues ---
    "indoor_outdoor_cafe": [
        "This cafe blends indoor comfort with outdoor atmosphere through {transitional_elements} connecting {indoor_features} with {outdoor_setting}.",
        "Customers enjoy both {interior_amenities} and {exterior_features} in this space that bridges indoor comfort and outdoor ambiance.",
        "The cafe design creates flow between {inside_elements} and {outside_spaces} allowing patrons to experience {dual_environment_benefits}."
    ],
    "transit_station_platform": [
        "This transit platform combines covered areas with open sections where {passenger_activities} occur while awaiting {transportation_types}.",
        "The station design balances {sheltered_elements} with {exposed_areas} for passengers engaged in {waiting_behaviors}.",
        "Commuters navigate between {indoor_facilities} and {platform_features} while {transit_routines} unfold around arriving vehicles."
    ],
    "sports_stadium": [
        "This athletic venue features {seating_arrangement} surrounding {playing_surface} where {sporting_activities} take place.",
        "The stadium design incorporates {spectator_facilities} overlooking {competition_space} designed for {sports_events}.",
        "Fans occupy {viewing_areas} arranged to maximize visibility of {field_elements} where athletes engage in {game_activities}."
    ],
    "construction_site": [
        "This development area shows {construction_equipment} amid {building_materials} where workers conduct {construction_activities}.",
        "The construction process is visible through {work_elements} positioned around {structural_components} in various stages of completion.",
        "Workers utilize {site_equipment} to transform {raw_materials} following {construction_process} stages."
    ],
    "medical_facility": [
        "This healthcare environment features {medical_elements} arranged to support {clinical_activities} in a {facility_design}.",
        "The medical space incorporates {healthcare_features} where {patient_interactions} occur in a controlled environment.",
        "Professional medical staff utilize {equipment_types} while conducting {care_procedures} in specialized {treatment_spaces}."
    ],
    "educational_setting": [
        "This learning environment contains {educational_furniture} arranged to facilitate {learning_activities} through {instructional_design}.",
        "The educational space features {classroom_elements} organized for {teaching_methods} and {student_engagement}.",
        "Students and educators interact within {learning_spaces} equipped with {educational_tools} supporting {knowledge_transfer}."
    ],
    "beach_water_recreation": [
        "A coastal recreation area with {beach_equipment} and people enjoying {water_activities}.",
        "This shoreline space features {beach_equipment} where visitors engage in {water_activities}.",
        "An outdoor water recreation zone with {beach_equipment} set up for {water_activities}."
    ],
    "sports_venue": [
        "A professional sports facility with {sports_equipment} arranged for {competitive_activities}.",
        "This athletics venue features {sports_equipment} with spaces designated for {competitive_activities}.",
        "A specialized sports arena containing {sports_equipment} designed for {competitive_activities}."
    ],
    "professional_kitchen": [
        "A commercial cooking space with {kitchen_equipment} organized for {food_preparation}.",
        "This professional culinary area contains {kitchen_equipment} arranged in stations for {food_preparation}.",
        "An industrial kitchen featuring {kitchen_equipment} designed for efficient {food_preparation}."
    ],
}
|
scene_type.py
ADDED
@@ -0,0 +1,394 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
# Scene definitions keyed by scene id. Each entry lists the COCO class ids
# that must appear ("required_objects") and may appear ("optional_objects"),
# how many of the required ids suffice for a match ("minimum_required"),
# and a human-readable description. Some aerial/pattern scenes carry extra
# metadata ("viewpoint_indicator", "key_features", "pattern_detection",
# "detection_priority") consumed by the scene/spatial analyzers.
#
# FIX: the key "aerial_view_intersection" was previously defined twice in
# this literal; the later (richer) definition silently overwrote the first.
# The two entries are merged into the single definition below.
SCENE_TYPES = {
    "living_room": {
        "name": "Living Room",
        "required_objects": [57, 62],  # couch, tv
        "optional_objects": [56, 60, 73, 75],  # chair, dining table, book, vase
        "minimum_required": 2,
        "description": "A living room area with furniture for relaxation and entertainment"
    },
    "bedroom": {
        "name": "Bedroom",
        "required_objects": [59],  # bed
        "optional_objects": [56, 60, 73, 74, 75],  # chair, dining table, book, clock, vase
        "minimum_required": 1,
        "description": "A bedroom with sleeping furniture"
    },
    "dining_area": {
        "name": "Dining Area",
        "required_objects": [60],  # dining table
        "optional_objects": [56, 39, 41, 42, 43, 44, 45],  # chair, bottle, cup, fork, knife, spoon, bowl
        "minimum_required": 1,
        "description": "A dining area for meals"
    },
    "kitchen": {
        "name": "Kitchen",
        "required_objects": [72, 68, 69, 71],  # refrigerator, microwave, oven, sink
        "optional_objects": [39, 41, 42, 43, 44, 45],  # bottle, cup, fork, knife, spoon, bowl
        "minimum_required": 1,
        "description": "A kitchen area for food preparation"
    },
    "office_workspace": {
        "name": "Office Workspace",
        "required_objects": [56, 63, 66, 64, 73],  # chair, laptop, keyboard, mouse, book
        "optional_objects": [60, 74, 75, 67],  # dining table, clock, vase, cell phone
        "minimum_required": 2,
        "description": "A workspace with computer equipment for office work"
    },
    "meeting_room": {
        "name": "Meeting Room",
        "required_objects": [56, 60],  # chair, dining table
        "optional_objects": [63, 62, 67],  # laptop, tv, cell phone
        "minimum_required": 2,
        "description": "A room set up for meetings with multiple seating"
    },
    "city_street": {
        "name": "City Street",
        "required_objects": [0, 1, 2, 3, 5, 7, 9],  # person, bicycle, car, motorcycle, bus, truck, traffic light
        "optional_objects": [10, 11, 12, 24, 25, 26, 28],  # fire hydrant, stop sign, parking meter, backpack, umbrella, handbag, suitcase
        "minimum_required": 2,
        "description": "A city street with traffic and pedestrians"
    },
    "parking_lot": {
        "name": "Parking Lot",
        "required_objects": [2, 3, 5, 7],  # car, motorcycle, bus, truck
        "optional_objects": [0, 11, 12],  # person, stop sign, parking meter
        "minimum_required": 3,
        "description": "A parking area with multiple vehicles"
    },
    "park_area": {
        "name": "Park or Recreation Area",
        "required_objects": [0, 13],  # person, bench
        "optional_objects": [1, 14, 16, 25, 33],  # bicycle, bird, dog, umbrella, kite
        "minimum_required": 2,
        "description": "An outdoor recreational area for leisure activities"
    },
    "retail_store": {
        "name": "Retail Store",
        "required_objects": [0, 24, 26, 28],  # person, backpack, handbag, suitcase
        "optional_objects": [39, 45, 67],  # bottle, bowl, cell phone
        "minimum_required": 2,
        "description": "A retail environment with shoppers and merchandise"
    },
    "supermarket": {
        "name": "Supermarket",
        "required_objects": [0, 24, 39, 46, 47, 49],  # person, backpack, bottle, banana, apple, orange
        "optional_objects": [26, 37, 45, 48, 51, 52, 53, 54, 55],  # handbag, surfboard, bowl, sandwich, carrot, hot dog, pizza, donut, cake
        "minimum_required": 3,
        "description": "A supermarket with food items and shoppers"
    },
    "classroom": {
        "name": "Classroom",
        "required_objects": [56, 60, 73],  # chair, dining table, book
        "optional_objects": [63, 66, 67],  # laptop, keyboard, cell phone
        "minimum_required": 2,
        "description": "A classroom environment set up for educational activities"
    },
    "conference_room": {
        "name": "Conference Room",
        "required_objects": [56, 60, 63],  # chair, dining table, laptop
        "optional_objects": [62, 67, 73],  # tv, cell phone, book
        "minimum_required": 2,
        "description": "A conference room designed for meetings and presentations"
    },
    "cafe": {
        "name": "Cafe",
        "required_objects": [56, 60, 41],  # chair, dining table, cup
        "optional_objects": [39, 40, 63, 67, 73],  # bottle, wine glass, laptop, cell phone, book
        "minimum_required": 2,
        "description": "A cafe setting with seating and beverages"
    },
    "library": {
        "name": "Library",
        "required_objects": [56, 60, 73],  # chair, dining table, book
        "optional_objects": [63, 67, 75],  # laptop, cell phone, vase
        "minimum_required": 2,
        "description": "A library with books and reading areas"
    },
    "gym": {
        "name": "Gym",
        "required_objects": [0, 32],  # person, sports ball
        "optional_objects": [24, 25, 28, 38],  # backpack, umbrella, suitcase, tennis racket
        "minimum_required": 1,
        "description": "A gym or fitness area for physical activities"
    },
    "beach": {
        "name": "Beach",
        "required_objects": [0, 25, 29, 33, 37],  # person, umbrella, frisbee, kite, surfboard
        "optional_objects": [1, 24, 26, 38],  # bicycle, backpack, handbag, tennis racket
        "minimum_required": 2,
        "description": "A beach area with people and recreational items"
    },
    "restaurant": {
        "name": "Restaurant",
        "required_objects": [56, 60, 41, 42, 43, 44, 45],  # chair, dining table, cup, fork, knife, spoon, bowl
        "optional_objects": [39, 40, 48, 49, 50, 51, 52, 53, 54, 55],  # bottle, wine glass, sandwich, orange, broccoli, carrot, hot dog, pizza, donut, cake
        "minimum_required": 3,
        "description": "A restaurant setting for dining with tables and eating utensils"
    },
    "train_station": {
        "name": "Train Station",
        "required_objects": [0, 6],  # person, train
        "optional_objects": [1, 2, 24, 28, 67],  # bicycle, car, backpack, suitcase, cell phone
        "minimum_required": 1,
        "description": "A train station with train and passengers"
    },
    "airport": {
        "name": "Airport",
        "required_objects": [0, 4, 28],  # person, airplane, suitcase
        "optional_objects": [24, 25, 26, 67],  # backpack, umbrella, handbag, cell phone
        "minimum_required": 2,
        "description": "An airport with planes and travelers carrying luggage"
    },
    "upscale_dining": {
        "name": "Upscale Dining Area",
        "required_objects": [56, 60, 40, 41],  # chair, dining table, wine glass, cup
        "optional_objects": [39, 42, 43, 44, 45, 62, 75],  # bottle, fork, knife, spoon, bowl, tv, vase
        "minimum_required": 2,
        "description": "An elegantly designed dining space with refined furniture and decorative elements"
    },
    "asian_commercial_street": {
        "name": "Asian Commercial Street",
        "required_objects": [0, 67],  # person, cell phone
        "optional_objects": [1, 2, 3, 24, 25, 26, 28],  # bicycle, car, motorcycle, backpack, umbrella, handbag, suitcase
        "minimum_required": 1,
        "description": "A bustling commercial street with shops, signage, and pedestrians in an Asian urban setting"
    },
    "financial_district": {
        "name": "Financial District",
        "required_objects": [2, 5, 7, 9],  # car, bus, truck, traffic light
        "optional_objects": [0, 1, 3, 8],  # person, bicycle, motorcycle, boat
        "minimum_required": 2,
        "description": "A major thoroughfare in a business district with high-rise buildings and traffic"
    },
    "urban_intersection": {
        "name": "Urban Intersection",
        "required_objects": [0, 9],  # person, traffic light
        "optional_objects": [1, 2, 3, 5, 7],  # bicycle, car, motorcycle, bus, truck
        "minimum_required": 1,
        "description": "A busy urban crossroad with pedestrian crossings and multiple traffic flows"
    },
    "transit_hub": {
        "name": "Transit Hub",
        "required_objects": [0, 5, 6, 7],  # person, bus, train, truck
        "optional_objects": [1, 2, 3, 9, 24, 28],  # bicycle, car, motorcycle, traffic light, backpack, suitcase
        "minimum_required": 2,
        "description": "A transportation center where multiple modes of transit converge"
    },
    "shopping_district": {
        "name": "Shopping District",
        "required_objects": [0, 24, 26],  # person, backpack, handbag
        "optional_objects": [1, 2, 3, 25, 27, 28, 39, 67],  # bicycle, car, motorcycle, umbrella, tie, suitcase, bottle, cell phone
        "minimum_required": 2,
        "description": "A retail-focused area with shops, pedestrians, and commercial activity"
    },
    "bus_stop": {
        "name": "Bus Stop",
        "required_objects": [0, 5],  # person, bus
        "optional_objects": [1, 2, 7, 24],  # bicycle, car, truck, backpack
        "minimum_required": 2,
        "description": "A roadside bus stop with waiting passengers and buses"
    },
    "bus_station": {
        "name": "Bus Station",
        "required_objects": [0, 5, 7],  # person, bus, truck
        "optional_objects": [24, 28, 67],  # backpack, suitcase, cell phone
        "minimum_required": 2,
        "description": "A bus terminal with multiple buses and travelers"
    },
    "zoo": {
        "name": "Zoo",
        "required_objects": [20, 22, 23],  # elephant, zebra, giraffe
        "optional_objects": [0, 14, 16],  # person, bird, dog
        "minimum_required": 2,
        "description": "A zoo environment featuring large animal exhibits and visitors"
    },
    "harbor": {
        "name": "Harbor",
        "required_objects": [8],  # boat
        "optional_objects": [0, 2, 3, 39],  # person, car, motorcycle, bottle
        "minimum_required": 1,
        "description": "A harbor area with boats docked and surrounding traffic"
    },
    "playground": {
        "name": "Playground",
        "required_objects": [0, 32],  # person, sports ball
        "optional_objects": [33, 24, 1],  # kite, backpack, bicycle
        "minimum_required": 1,
        "description": "An outdoor playground with people playing sports and games"
    },
    "sports_field": {
        "name": "Sports Field",
        "required_objects": [32],  # sports ball
        "optional_objects": [38, 34, 35],  # tennis racket, baseball bat, baseball glove
        "minimum_required": 1,
        "description": "A sports field set up for various ball games"
    },
    "narrow_commercial_alley": {
        "name": "Narrow Commercial Alley",
        "required_objects": [0, 3],  # person, motorcycle
        "optional_objects": [2, 7, 24, 26],  # car, truck, backpack, handbag
        "minimum_required": 2,
        "description": "A tight urban alley lined with shops, with pedestrians and light vehicles"
    },
    "daytime_shopping_street": {
        "name": "Daytime Shopping Street",
        "required_objects": [0, 2],  # person, car
        "optional_objects": [1, 3, 24, 26],  # bicycle, motorcycle, backpack, handbag
        "minimum_required": 2,
        "description": "A busy pedestrian street during daytime, featuring shops, vehicles, and shoppers"
    },
    "urban_pedestrian_crossing": {
        "name": "Urban Pedestrian Crossing",
        "required_objects": [0, 9],  # person, traffic light
        "optional_objects": [2, 3, 5],  # car, motorcycle, bus
        "minimum_required": 1,
        "description": "A city street crossing with pedestrians and traffic signals"
    },
    # NOTE: single merged definition (was duplicated further down; the
    # richer entry with viewpoint metadata is the one that took effect).
    "aerial_view_intersection": {
        "name": "Aerial View Intersection",
        "required_objects": [0, 9],  # person, traffic light
        "optional_objects": [1, 2, 3, 5, 7],  # bicycle, car, motorcycle, bus, truck
        "minimum_required": 1,
        "description": "An intersection viewed from above, showing crossing patterns and pedestrian movement",
        "viewpoint_indicator": "aerial",  # viewing side
        "key_features": ["crosswalk_pattern", "pedestrian_flow", "intersection_layout"],  # key features
        "detection_priority": 10  # priority when several scene types match
    },
    "aerial_view_commercial_area": {
        "name": "Aerial View Commercial Area",
        "required_objects": [0, 2],  # person, car
        "optional_objects": [1, 3, 5, 7, 24, 26],  # bicycle, motorcycle, bus, truck, backpack, handbag
        "minimum_required": 2,
        "description": "A commercial or shopping area viewed from above showing pedestrians and urban layout"
    },
    "aerial_view_plaza": {
        "name": "Aerial View Plaza",
        "required_objects": [0],  # person
        "optional_objects": [1, 2, 24, 25, 26],  # bicycle, car, backpack, umbrella, handbag
        "minimum_required": 1,
        "description": "An urban plaza or public square viewed from above with pedestrian activity"
    },

    # specific cultural items
    "asian_night_market": {
        "name": "Asian Night Market",
        "required_objects": [0, 67],  # person, cell phone
        "optional_objects": [1, 3, 24, 26, 39, 41],  # bicycle, motorcycle, backpack, handbag, bottle, cup
        "minimum_required": 1,
        "description": "A vibrant night market scene typical in Asian cities with food stalls and crowds"
    },
    "asian_temple_area": {
        "name": "Asian Temple Area",
        "required_objects": [0],  # person
        "optional_objects": [24, 25, 26, 67, 75],  # backpack, umbrella, handbag, cell phone, vase
        "minimum_required": 1,
        "description": "A traditional Asian temple complex with visitors and cultural elements"
    },
    "european_plaza": {
        "name": "European Plaza",
        "required_objects": [0],  # person
        "optional_objects": [1, 2, 4, 9, 24, 26, 67],  # bicycle, car, airplane, traffic light, backpack, handbag, cell phone
        "minimum_required": 1,
        "description": "A European-style city plaza with historic architecture and pedestrian activity"
    },

    # specific time-of-day items
    "nighttime_street": {
        "name": "Nighttime Street",
        "required_objects": [0, 9],  # person, traffic light
        "optional_objects": [1, 2, 3, 5, 7, 67],  # bicycle, car, motorcycle, bus, truck, cell phone
        "minimum_required": 1,
        "description": "An urban street at night with artificial lighting and nighttime activity"
    },
    "nighttime_commercial_district": {
        "name": "Nighttime Commercial District",
        "required_objects": [0, 67],  # person, cell phone
        "optional_objects": [1, 2, 3, 24, 26],  # bicycle, car, motorcycle, backpack, handbag
        "minimum_required": 1,
        "description": "A commercial district illuminated at night with neon signs and evening activity"
    },

    # mixed environment items
    "indoor_outdoor_cafe": {
        "name": "Indoor-Outdoor Cafe",
        "required_objects": [56, 60, 41],  # chair, dining table, cup
        "optional_objects": [39, 40, 63, 67, 73],  # bottle, wine glass, laptop, cell phone, book
        "minimum_required": 2,
        "description": "A cafe setting with both indoor elements and outdoor patio or sidewalk seating"
    },
    "transit_station_platform": {
        "name": "Transit Station Platform",
        "required_objects": [0],  # person
        "optional_objects": [5, 6, 7, 24, 28, 67],  # bus, train, truck, backpack, suitcase, cell phone
        "minimum_required": 1,
        "description": "A transit platform with waiting passengers and arriving/departing vehicles"
    },
    "sports_stadium": {
        "name": "Sports Stadium",
        "required_objects": [0, 32],  # person, sports ball
        "optional_objects": [24, 38, 39, 41, 67],  # backpack, tennis racket, bottle, cup, cell phone
        "minimum_required": 1,
        "description": "A sports stadium or arena with spectators and athletic activities"
    },
    "construction_site": {
        "name": "Construction Site",
        "required_objects": [0, 7],  # person, truck
        "optional_objects": [2, 3, 11, 76, 77, 78],  # car, motorcycle, fire hydrant, scissors, teddy bear, hair drier
        "minimum_required": 1,
        "description": "A construction site with workers, equipment, and building materials"
    },
    "medical_facility": {
        "name": "Medical Facility",
        "required_objects": [0, 56, 60],  # person, chair, dining table
        "optional_objects": [63, 64, 66, 67, 73],  # laptop, mouse, keyboard, cell phone, book
        "minimum_required": 2,
        "description": "A medical facility such as hospital, clinic or doctor's office with medical staff and patients"
    },
    "educational_setting": {
        "name": "Educational Setting",
        "required_objects": [0, 56, 60, 73],  # person, chair, dining table, book
        "optional_objects": [63, 64, 66, 67, 74],  # laptop, mouse, keyboard, cell phone, clock
        "minimum_required": 2,
        "description": "An educational environment such as classroom, lecture hall or study area"
    },
    "perpendicular_crosswalk_intersection": {
        "name": "Perpendicular Crosswalk Intersection",
        "required_objects": [0],  # person
        "optional_objects": [1, 2, 3, 5, 7, 9],  # bicycle, car, motorcycle, bus, truck, traffic light
        "minimum_required": 1,
        "description": "An intersection with perpendicular crosswalks where pedestrians cross in multiple directions",
        "viewpoint_indicator": "aerial",
        "key_features": ["perpendicular_crosswalks", "pedestrian_crossing", "multi_directional_movement"],
        "pattern_detection": True,  # requires the specific spatial pattern
        "detection_priority": 15
    },
    "beach_water_recreation": {
        "name": "Beach/Water Recreation Area",
        "required_objects": [0, 37],  # person, surfboard
        "optional_objects": [25, 33, 1, 8, 29, 24, 26, 39, 41],  # umbrella, kite, bicycle, boat, frisbee, backpack, handbag, bottle, cup
        "minimum_required": 2,
        "description": "A beach or water recreation area with water sports equipment and beach accessories"
    },
    "sports_venue": {
        "name": "Sports Venue",
        "required_objects": [0, 32],  # person, sports ball
        "optional_objects": [34, 35, 38, 25, 24, 26, 39, 41],  # baseball bat, baseball glove, tennis racket, umbrella, backpack, handbag, bottle, cup
        "minimum_required": 2,
        "description": "A professional sports venue with specialized sports equipment and spectator areas"
    },
    "professional_kitchen": {
        "name": "Professional Kitchen",
        "required_objects": [43, 44, 45],  # knife, spoon, bowl
        "optional_objects": [42, 39, 41, 68, 69, 71, 72, 0],  # fork, bottle, cup, microwave, oven, sink, refrigerator, person
        "minimum_required": 3,
        "description": "A commercial kitchen with professional cooking equipment and food preparation areas"
    },
}
|
spatial_analyzer.py
ADDED
@@ -0,0 +1,1444 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
import os
import numpy as np
from typing import Dict, List, Tuple, Any, Optional

from scene_type import SCENE_TYPES
# FIX: the module added in this commit is `enhance_scene_describer.py`;
# importing `enhance_descriptor` would raise ModuleNotFoundError at import
# time. Import from the module that actually exists.
from enhance_scene_describer import EnhancedSceneDescriber
|
8 |
+
|
9 |
+
class SpatialAnalyzer:
|
10 |
+
"""
|
11 |
+
Analyzes spatial relationships between objects in an image.
|
12 |
+
Handles region assignment, object positioning, and functional zone identification.
|
13 |
+
"""
|
14 |
+
|
15 |
+
def __init__(self, class_names: Dict[int, str] = None, object_categories=None):
|
16 |
+
"""Initialize the spatial analyzer with image regions"""
|
17 |
+
# Define regions of the image (3x3 grid)
|
18 |
+
self.regions = {
|
19 |
+
"top_left": (0, 0, 1/3, 1/3),
|
20 |
+
"top_center": (1/3, 0, 2/3, 1/3),
|
21 |
+
"top_right": (2/3, 0, 1, 1/3),
|
22 |
+
"middle_left": (0, 1/3, 1/3, 2/3),
|
23 |
+
"middle_center": (1/3, 1/3, 2/3, 2/3),
|
24 |
+
"middle_right": (2/3, 1/3, 1, 2/3),
|
25 |
+
"bottom_left": (0, 2/3, 1/3, 1),
|
26 |
+
"bottom_center": (1/3, 2/3, 2/3, 1),
|
27 |
+
"bottom_right": (2/3, 2/3, 1, 1)
|
28 |
+
}
|
29 |
+
|
30 |
+
self.class_names = class_names
|
31 |
+
self.OBJECT_CATEGORIES = object_categories or {}
|
32 |
+
self.enhance_descriptor = EnhancedSceneDescriber(scene_types=SCENE_TYPES)
|
33 |
+
|
34 |
+
# Distances thresholds for proximity analysis (normalized)
|
35 |
+
self.proximity_threshold = 0.2
|
36 |
+
|
37 |
+
|
38 |
+
def _determine_region(self, x: float, y: float) -> str:
|
39 |
+
"""
|
40 |
+
Determine which region a point falls into.
|
41 |
+
|
42 |
+
Args:
|
43 |
+
x: Normalized x-coordinate (0-1)
|
44 |
+
y: Normalized y-coordinate (0-1)
|
45 |
+
|
46 |
+
Returns:
|
47 |
+
Region name
|
48 |
+
"""
|
49 |
+
for region_name, (x1, y1, x2, y2) in self.regions.items():
|
50 |
+
if x1 <= x < x2 and y1 <= y < y2:
|
51 |
+
return region_name
|
52 |
+
|
53 |
+
return "unknown"
|
54 |
+
|
55 |
+
def _analyze_regions(self, detected_objects: List[Dict]) -> Dict:
|
56 |
+
"""
|
57 |
+
Analyze object distribution across image regions.
|
58 |
+
|
59 |
+
Args:
|
60 |
+
detected_objects: List of detected objects with position information
|
61 |
+
|
62 |
+
Returns:
|
63 |
+
Dictionary with region analysis
|
64 |
+
"""
|
65 |
+
# Count objects in each region
|
66 |
+
region_counts = {region: 0 for region in self.regions.keys()}
|
67 |
+
region_objects = {region: [] for region in self.regions.keys()}
|
68 |
+
|
69 |
+
for obj in detected_objects:
|
70 |
+
region = obj["region"]
|
71 |
+
if region in region_counts:
|
72 |
+
region_counts[region] += 1
|
73 |
+
region_objects[region].append({
|
74 |
+
"class_id": obj["class_id"],
|
75 |
+
"class_name": obj["class_name"]
|
76 |
+
})
|
77 |
+
|
78 |
+
# Determine main focus regions (top 1-2 regions by object count)
|
79 |
+
sorted_regions = sorted(region_counts.items(), key=lambda x: x[1], reverse=True)
|
80 |
+
main_regions = [region for region, count in sorted_regions if count > 0][:2]
|
81 |
+
|
82 |
+
return {
|
83 |
+
"counts": region_counts,
|
84 |
+
"main_focus": main_regions,
|
85 |
+
"objects_by_region": region_objects
|
86 |
+
}
|
87 |
+
|
88 |
+
def _extract_detected_objects(self, detection_result: Any, confidence_threshold: float = 0.25) -> List[Dict]:
|
89 |
+
"""
|
90 |
+
Extract detected objects from detection result with position information.
|
91 |
+
|
92 |
+
Args:
|
93 |
+
detection_result: Detection result from YOLOv8
|
94 |
+
confidence_threshold: Minimum confidence threshold
|
95 |
+
|
96 |
+
Returns:
|
97 |
+
List of dictionaries with detected object information
|
98 |
+
"""
|
99 |
+
boxes = detection_result.boxes.xyxy.cpu().numpy()
|
100 |
+
classes = detection_result.boxes.cls.cpu().numpy().astype(int)
|
101 |
+
confidences = detection_result.boxes.conf.cpu().numpy()
|
102 |
+
|
103 |
+
# Image dimensions
|
104 |
+
img_height, img_width = detection_result.orig_shape[:2]
|
105 |
+
|
106 |
+
detected_objects = []
|
107 |
+
for box, class_id, confidence in zip(boxes, classes, confidences):
|
108 |
+
# Skip objects with confidence below threshold
|
109 |
+
if confidence < confidence_threshold:
|
110 |
+
continue
|
111 |
+
|
112 |
+
x1, y1, x2, y2 = box
|
113 |
+
width = x2 - x1
|
114 |
+
height = y2 - y1
|
115 |
+
|
116 |
+
# Center point
|
117 |
+
center_x = (x1 + x2) / 2
|
118 |
+
center_y = (y1 + y2) / 2
|
119 |
+
|
120 |
+
# Normalized positions (0-1)
|
121 |
+
norm_x = center_x / img_width
|
122 |
+
norm_y = center_y / img_height
|
123 |
+
norm_width = width / img_width
|
124 |
+
norm_height = height / img_height
|
125 |
+
|
126 |
+
# Area calculation
|
127 |
+
area = width * height
|
128 |
+
norm_area = area / (img_width * img_height)
|
129 |
+
|
130 |
+
# Region determination
|
131 |
+
object_region = self._determine_region(norm_x, norm_y)
|
132 |
+
|
133 |
+
detected_objects.append({
|
134 |
+
"class_id": int(class_id),
|
135 |
+
"class_name": self.class_names[int(class_id)],
|
136 |
+
"confidence": float(confidence),
|
137 |
+
"box": [float(x1), float(y1), float(x2), float(y2)],
|
138 |
+
"center": [float(center_x), float(center_y)],
|
139 |
+
"normalized_center": [float(norm_x), float(norm_y)],
|
140 |
+
"size": [float(width), float(height)],
|
141 |
+
"normalized_size": [float(norm_width), float(norm_height)],
|
142 |
+
"area": float(area),
|
143 |
+
"normalized_area": float(norm_area),
|
144 |
+
"region": object_region
|
145 |
+
})
|
146 |
+
|
147 |
+
return detected_objects
|
148 |
+
|
149 |
+
|
150 |
+
def _detect_scene_viewpoint(self, detected_objects: List[Dict]) -> Dict:
|
151 |
+
"""
|
152 |
+
檢測場景視角並識別特殊場景模式。
|
153 |
+
|
154 |
+
Args:
|
155 |
+
detected_objects: 檢測到的物體列表
|
156 |
+
|
157 |
+
Returns:
|
158 |
+
Dict: 包含視角和場景模式信息的字典
|
159 |
+
"""
|
160 |
+
if not detected_objects:
|
161 |
+
return {"viewpoint": "eye_level", "patterns": []}
|
162 |
+
|
163 |
+
# 從物體位置中提取信息
|
164 |
+
patterns = []
|
165 |
+
|
166 |
+
# 檢測行人位置模式
|
167 |
+
pedestrian_objs = [obj for obj in detected_objects if obj["class_id"] == 0]
|
168 |
+
|
169 |
+
# 檢查是否有足夠的行人來識別模式
|
170 |
+
if len(pedestrian_objs) >= 4:
|
171 |
+
pedestrian_positions = [obj["normalized_center"] for obj in pedestrian_objs]
|
172 |
+
|
173 |
+
# 檢測十字交叉模式
|
174 |
+
if self._detect_cross_pattern(pedestrian_positions):
|
175 |
+
patterns.append("crosswalk_intersection")
|
176 |
+
|
177 |
+
# 檢測多方向行人流
|
178 |
+
directions = self._analyze_movement_directions(pedestrian_positions)
|
179 |
+
if len(directions) >= 2:
|
180 |
+
patterns.append("multi_directional_movement")
|
181 |
+
|
182 |
+
# 檢查物體的大小一致性 - 在空中俯視圖中,物體大小通常更一致
|
183 |
+
if len(detected_objects) >= 5:
|
184 |
+
sizes = [obj.get("normalized_area", 0) for obj in detected_objects]
|
185 |
+
size_variance = np.var(sizes) / (np.mean(sizes) ** 2) # 標準化變異數,不會受到平均值影響
|
186 |
+
|
187 |
+
if size_variance < 0.3: # 低變異表示大小一致
|
188 |
+
patterns.append("consistent_object_size")
|
189 |
+
|
190 |
+
# 基本視角檢測
|
191 |
+
viewpoint = self.enhance_descriptor._detect_viewpoint(detected_objects)
|
192 |
+
|
193 |
+
# 根據檢測到的模式增強視角判斷
|
194 |
+
if "crosswalk_intersection" in patterns and viewpoint != "aerial":
|
195 |
+
# 如果檢測到斑馬線交叉但視角判斷不是空中視角,優先採用模式判斷
|
196 |
+
viewpoint = "aerial"
|
197 |
+
|
198 |
+
return {
|
199 |
+
"viewpoint": viewpoint,
|
200 |
+
"patterns": patterns
|
201 |
+
}
|
202 |
+
|
203 |
+
def _detect_cross_pattern(self, positions):
|
204 |
+
"""
|
205 |
+
檢測位置中的十字交叉模式
|
206 |
+
|
207 |
+
Args:
|
208 |
+
positions: 位置列表 [[x1, y1], [x2, y2], ...]
|
209 |
+
|
210 |
+
Returns:
|
211 |
+
bool: 是否檢測到十字交叉模式
|
212 |
+
"""
|
213 |
+
if len(positions) < 8: # 需要足夠多的點
|
214 |
+
return False
|
215 |
+
|
216 |
+
# 提取 x 和 y 坐標
|
217 |
+
x_coords = [pos[0] for pos in positions]
|
218 |
+
y_coords = [pos[1] for pos in positions]
|
219 |
+
|
220 |
+
# 檢測 x 和 y 方向的聚類
|
221 |
+
x_clusters = []
|
222 |
+
y_clusters = []
|
223 |
+
|
224 |
+
# 簡化的聚類分析
|
225 |
+
x_mean = np.mean(x_coords)
|
226 |
+
y_mean = np.mean(y_coords)
|
227 |
+
|
228 |
+
# 計算在中心線附近的點
|
229 |
+
near_x_center = sum(1 for x in x_coords if abs(x - x_mean) < 0.1)
|
230 |
+
near_y_center = sum(1 for y in y_coords if abs(y - y_mean) < 0.1)
|
231 |
+
|
232 |
+
# 如果有足夠的點在中心線附近,可能是十字交叉
|
233 |
+
return near_x_center >= 3 and near_y_center >= 3
|
234 |
+
|
235 |
+
def _analyze_movement_directions(self, positions):
|
236 |
+
"""
|
237 |
+
分析位置中的移動方向
|
238 |
+
|
239 |
+
Args:
|
240 |
+
positions: 位置列表 [[x1, y1], [x2, y2], ...]
|
241 |
+
|
242 |
+
Returns:
|
243 |
+
list: 檢測到的主要方向
|
244 |
+
"""
|
245 |
+
if len(positions) < 6:
|
246 |
+
return []
|
247 |
+
|
248 |
+
# extract x 和 y 坐標
|
249 |
+
x_coords = [pos[0] for pos in positions]
|
250 |
+
y_coords = [pos[1] for pos in positions]
|
251 |
+
|
252 |
+
directions = []
|
253 |
+
|
254 |
+
# horizontal move (left --> right)
|
255 |
+
x_std = np.std(x_coords)
|
256 |
+
x_range = max(x_coords) - min(x_coords)
|
257 |
+
|
258 |
+
# vertical move(up --> down)
|
259 |
+
y_std = np.std(y_coords)
|
260 |
+
y_range = max(y_coords) - min(y_coords)
|
261 |
+
|
262 |
+
# 足夠大的範圍表示該方向有運動
|
263 |
+
if x_range > 0.4:
|
264 |
+
directions.append("horizontal")
|
265 |
+
if y_range > 0.4:
|
266 |
+
directions.append("vertical")
|
267 |
+
|
268 |
+
return directions
|
269 |
+
|
270 |
+
def _identify_functional_zones(self, detected_objects: List[Dict], scene_type: str) -> Dict:
    """
    Identify functional zones within the scene with improved detection for
    different viewpoints and cultural contexts.

    Args:
        detected_objects: List of detected objects
        scene_type: Identified scene type

    Returns:
        Dictionary of functional zones with their descriptions
    """
    # Group objects by category and region
    category_regions = {}

    for obj in detected_objects:
        # Find the first category whose class-id set contains this object.
        category = "other"
        for cat_name, cat_ids in self.OBJECT_CATEGORIES.items():
            if obj["class_id"] in cat_ids:
                category = cat_name
                break

        # Add to the category -> region -> objects mapping.
        if category not in category_regions:
            category_regions[category] = {}

        region = obj["region"]
        if region not in category_regions[category]:
            category_regions[category][region] = []

        category_regions[category][region].append(obj)

    # Identify zones based on object groupings
    zones = {}

    # BUG FIX: _detect_scene_viewpoint returns a dict
    # {"viewpoint": ..., "patterns": ...}. The original code compared that
    # dict against the string "aerial" (always False) and passed it to
    # _identify_intersection_zones, which expects a plain string.
    viewpoint_info = self._detect_scene_viewpoint(detected_objects)
    if isinstance(viewpoint_info, dict):
        viewpoint = viewpoint_info.get("viewpoint", "eye_level")
    else:
        viewpoint = viewpoint_info

    # Choose the zone identification strategy based on scene type and viewpoint.
    if scene_type in ["living_room", "bedroom", "dining_area", "kitchen", "office_workspace", "meeting_room"]:
        # Indoor scenes
        zones.update(self._identify_indoor_zones(category_regions, detected_objects, scene_type))
    elif scene_type in ["city_street", "parking_lot", "park_area"]:
        # Outdoor general scenes
        zones.update(self._identify_outdoor_general_zones(category_regions, detected_objects, scene_type))
    elif "aerial" in scene_type or viewpoint == "aerial":
        # Aerial viewpoint scenes
        zones.update(self._identify_aerial_view_zones(category_regions, detected_objects, scene_type))
    elif "asian" in scene_type:
        # Asian cultural context scenes
        zones.update(self._identify_asian_cultural_zones(category_regions, detected_objects, scene_type))
    elif scene_type == "urban_intersection":
        # Specific urban intersection logic
        zones.update(self._identify_intersection_zones(category_regions, detected_objects, viewpoint))
    elif scene_type == "financial_district":
        # Financial district specific logic
        zones.update(self._identify_financial_district_zones(category_regions, detected_objects))
    elif scene_type == "upscale_dining":
        # Upscale dining specific logic
        zones.update(self._identify_upscale_dining_zones(category_regions, detected_objects))
    else:
        # Default zone identification for other scene types
        zones.update(self._identify_default_zones(category_regions, detected_objects))

    # If no zones were identified, fall back to the default approach.
    if not zones:
        zones.update(self._identify_default_zones(category_regions, detected_objects))

    return zones
|
340 |
+
|
341 |
+
def _identify_indoor_zones(self, category_regions: Dict, detected_objects: List[Dict], scene_type: str) -> Dict:
    """
    Identify functional zones for indoor scenes.

    Builds zones (social, entertainment, dining, workspace, plus
    bedroom/kitchen specific ones) by grouping already-categorized objects
    per region and picking the region holding the most of each kind.

    Args:
        category_regions: Objects grouped by category and region
        detected_objects: List of detected objects
        scene_type: Specific indoor scene type

    Returns:
        Dict: Indoor functional zones keyed by zone name; each value has
        "region", "objects" and "description".
    """
    zones = {}

    # Seating/social zone: region with the most furniture (needs >= 2 items).
    if "furniture" in category_regions:
        furniture_regions = category_regions["furniture"]
        main_furniture_region = max(furniture_regions.items(),
                                   key=lambda x: len(x[1]),
                                   default=(None, []))

        if main_furniture_region[0] is not None and len(main_furniture_region[1]) >= 2:
            zone_objects = [obj["class_name"] for obj in main_furniture_region[1]]
            zones["social_zone"] = {
                "region": main_furniture_region[0],
                "objects": zone_objects,
                "description": f"Social or seating area with {', '.join(zone_objects)}"
            }

    # Entertainment zone: any electronics anywhere in the scene.
    if "electronics" in category_regions:
        electronics_items = []
        for region_objects in category_regions["electronics"].values():
            electronics_items.extend([obj["class_name"] for obj in region_objects])

        if electronics_items:
            zones["entertainment_zone"] = {
                # _find_main_region picks the dominant region for the category.
                "region": self._find_main_region(category_regions.get("electronics", {})),
                "objects": electronics_items,
                "description": f"Entertainment or media area with {', '.join(electronics_items)}"
            }

    # Dining/food zone: merge "kitchen_items" and "food" categories by region.
    food_zone_categories = ["kitchen_items", "food"]
    food_items = []
    food_regions = {}

    for category in food_zone_categories:
        if category in category_regions:
            for region, objects in category_regions[category].items():
                if region not in food_regions:
                    food_regions[region] = []
                food_regions[region].extend(objects)
                food_items.extend([obj["class_name"] for obj in objects])

    if food_items:
        main_food_region = max(food_regions.items(),
                              key=lambda x: len(x[1]),
                              default=(None, []))

        if main_food_region[0] is not None:
            zones["dining_zone"] = {
                "region": main_food_region[0],
                "objects": list(set(food_items)),
                # Only the first three distinct names go into the description.
                "description": f"Dining or food preparation area with {', '.join(list(set(food_items))[:3])}"
            }

    # Work/study zone - detected even when scene_type is not explicitly office.
    work_items = []
    work_regions = {}

    for obj in detected_objects:
        # NOTE(review): ids look like COCO indices — confirm against the model.
        if obj["class_id"] in [56, 60, 63, 64, 66, 73]:  # chair, table, laptop, mouse, keyboard, book
            region = obj["region"]
            if region not in work_regions:
                work_regions[region] = []
            work_regions[region].append(obj)
            work_items.append(obj["class_name"])

    # Laptop/keyboard + table/chair combinations suggest a workspace.
    has_laptop = any(obj["class_id"] == 63 for obj in detected_objects)
    has_keyboard = any(obj["class_id"] == 66 for obj in detected_objects)
    has_table = any(obj["class_id"] == 60 for obj in detected_objects)
    has_chair = any(obj["class_id"] == 56 for obj in detected_objects)

    # Electronics together with furniture implies a workspace.
    workspace_detected = (has_laptop or has_keyboard) and (has_table or has_chair)

    if (workspace_detected or scene_type in ["office_workspace", "meeting_room"]) and work_items:
        main_work_region = max(work_regions.items(),
                              key=lambda x: len(x[1]),
                              default=(None, []))

        if main_work_region[0] is not None:
            zones["workspace_zone"] = {
                "region": main_work_region[0],
                "objects": list(set(work_items)),
                "description": f"Work or study area with {', '.join(list(set(work_items))[:3])}"
            }

    # Bedroom-specific zones: first detected bed anchors the sleeping zone.
    if scene_type == "bedroom":
        bed_objects = [obj for obj in detected_objects if obj["class_id"] == 59]  # Bed
        if bed_objects:
            bed_region = bed_objects[0]["region"]
            zones["sleeping_zone"] = {
                "region": bed_region,
                "objects": ["bed"],
                "description": "Sleeping area with bed"
            }

    # Kitchen-specific zones: cluster detected appliances by region.
    if scene_type == "kitchen":
        # Look for appliances (refrigerator, oven, microwave, sink)
        appliance_ids = [68, 69, 71, 72]  # microwave, oven, sink, refrigerator
        appliance_objects = [obj for obj in detected_objects if obj["class_id"] in appliance_ids]

        if appliance_objects:
            appliance_regions = {}
            for obj in appliance_objects:
                region = obj["region"]
                if region not in appliance_regions:
                    appliance_regions[region] = []
                appliance_regions[region].append(obj)

            if appliance_regions:
                main_appliance_region = max(appliance_regions.items(),
                                           key=lambda x: len(x[1]),
                                           default=(None, []))

                if main_appliance_region[0] is not None:
                    appliance_names = [obj["class_name"] for obj in main_appliance_region[1]]
                    zones["kitchen_appliance_zone"] = {
                        "region": main_appliance_region[0],
                        "objects": appliance_names,
                        "description": f"Kitchen appliance area with {', '.join(appliance_names)}"
                    }

    return zones
|
480 |
+
|
481 |
+
def _identify_intersection_zones(self, category_regions: Dict, detected_objects: List[Dict], viewpoint: str) -> Dict:
    """
    Identify functional zones for urban intersections with enhanced spatial awareness.

    Delegates pedestrian/vehicle analysis to the dedicated helpers and adds a
    traffic-control zone per region that contains traffic signals.

    NOTE(review): the `viewpoint` parameter is not referenced in this body —
    confirm whether it is still needed by callers.

    Args:
        category_regions: Objects grouped by category and region (unused here)
        detected_objects: List of detected objects
        viewpoint: Detected viewpoint

    Returns:
        Dict: Refined intersection functional zones
    """
    zones = {}

    # Get pedestrians, vehicles and traffic signals by class id.
    pedestrian_objs = [obj for obj in detected_objects if obj["class_id"] == 0]
    vehicle_objs = [obj for obj in detected_objects if obj["class_id"] in [1, 2, 3, 5, 7]]  # bicycle, car, motorcycle, bus, truck
    traffic_light_objs = [obj for obj in detected_objects if obj["class_id"] == 9]

    # Per-region object distribution used by the pattern helpers.
    regions_distribution = self._create_distribution_map(detected_objects)

    # Analyze pedestrian crossing patterns.
    crossing_zones = self._analyze_crossing_patterns(pedestrian_objs, traffic_light_objs, regions_distribution)
    zones.update(crossing_zones)

    # Analyze vehicle traffic zones with directional awareness.
    traffic_zones = self._analyze_traffic_zones(vehicle_objs, regions_distribution)
    zones.update(traffic_zones)

    # Identify traffic control zones based on signal placement.
    if traffic_light_objs:
        # Group traffic lights by region for better organization.
        signal_regions = {}
        for obj in traffic_light_objs:
            region = obj["region"]
            if region not in signal_regions:
                signal_regions[region] = []
            signal_regions[region].append(obj)

        # Create one traffic control zone per region that has signals.
        for idx, (region, signals) in enumerate(signal_regions.items()):
            # Translate the grid region into a compass direction, if any.
            direction = self._get_directional_description(region)

            zones[f"traffic_control_zone_{idx+1}"] = {
                "region": region,
                "objects": ["traffic light"] * len(signals),
                "description": f"Traffic control area with {len(signals)} traffic signals" +
                               (f" in {direction} area" if direction else "")
            }

    return zones
|
534 |
+
|
535 |
+
def _analyze_crossing_patterns(self, pedestrians: List[Dict], traffic_lights: List[Dict],
|
536 |
+
region_distribution: Dict) -> Dict:
|
537 |
+
"""
|
538 |
+
Analyze pedestrian crossing patterns to identify crosswalk zones.
|
539 |
+
|
540 |
+
Args:
|
541 |
+
pedestrians: List of pedestrian objects
|
542 |
+
traffic_lights: List of traffic light objects
|
543 |
+
region_distribution: Distribution of objects by region
|
544 |
+
|
545 |
+
Returns:
|
546 |
+
Dict: Identified crossing zones
|
547 |
+
"""
|
548 |
+
crossing_zones = {}
|
549 |
+
|
550 |
+
if not pedestrians:
|
551 |
+
return crossing_zones
|
552 |
+
|
553 |
+
# Group pedestrians by region
|
554 |
+
pedestrian_regions = {}
|
555 |
+
for p in pedestrians:
|
556 |
+
region = p["region"]
|
557 |
+
if region not in pedestrian_regions:
|
558 |
+
pedestrian_regions[region] = []
|
559 |
+
pedestrian_regions[region].append(p)
|
560 |
+
|
561 |
+
# Sort regions by pedestrian count to find main crossing areas
|
562 |
+
sorted_regions = sorted(pedestrian_regions.items(), key=lambda x: len(x[1]), reverse=True)
|
563 |
+
|
564 |
+
# Create crossing zones for regions with pedestrians
|
565 |
+
for idx, (region, peds) in enumerate(sorted_regions[:2]): # Focus on top 2 regions
|
566 |
+
# Check if there are traffic lights nearby to indicate a crosswalk
|
567 |
+
has_nearby_signals = any(t["region"] == region for t in traffic_lights)
|
568 |
+
|
569 |
+
# Create crossing zone with descriptive naming
|
570 |
+
zone_name = f"crossing_zone_{idx+1}"
|
571 |
+
direction = self._get_directional_description(region)
|
572 |
+
|
573 |
+
description = f"Pedestrian crossing area with {len(peds)} "
|
574 |
+
description += "person" if len(peds) == 1 else "people"
|
575 |
+
if direction:
|
576 |
+
description += f" in {direction} direction"
|
577 |
+
if has_nearby_signals:
|
578 |
+
description += " near traffic signals"
|
579 |
+
|
580 |
+
crossing_zones[zone_name] = {
|
581 |
+
"region": region,
|
582 |
+
"objects": ["pedestrian"] * len(peds),
|
583 |
+
"description": description
|
584 |
+
}
|
585 |
+
|
586 |
+
return crossing_zones
|
587 |
+
|
588 |
+
def _analyze_traffic_zones(self, vehicles: List[Dict], region_distribution: Dict) -> Dict:
|
589 |
+
"""
|
590 |
+
Analyze vehicle distribution to identify traffic zones with directional awareness.
|
591 |
+
|
592 |
+
Args:
|
593 |
+
vehicles: List of vehicle objects
|
594 |
+
region_distribution: Distribution of objects by region
|
595 |
+
|
596 |
+
Returns:
|
597 |
+
Dict: Identified traffic zones
|
598 |
+
"""
|
599 |
+
traffic_zones = {}
|
600 |
+
|
601 |
+
if not vehicles:
|
602 |
+
return traffic_zones
|
603 |
+
|
604 |
+
# Group vehicles by region
|
605 |
+
vehicle_regions = {}
|
606 |
+
for v in vehicles:
|
607 |
+
region = v["region"]
|
608 |
+
if region not in vehicle_regions:
|
609 |
+
vehicle_regions[region] = []
|
610 |
+
vehicle_regions[region].append(v)
|
611 |
+
|
612 |
+
# Create traffic zones for regions with vehicles
|
613 |
+
main_traffic_region = max(vehicle_regions.items(), key=lambda x: len(x[1]), default=(None, []))
|
614 |
+
|
615 |
+
if main_traffic_region[0] is not None:
|
616 |
+
region = main_traffic_region[0]
|
617 |
+
vehicles_in_region = main_traffic_region[1]
|
618 |
+
|
619 |
+
# Get a list of vehicle types for description
|
620 |
+
vehicle_types = [v["class_name"] for v in vehicles_in_region]
|
621 |
+
unique_types = list(set(vehicle_types))
|
622 |
+
|
623 |
+
# Get directional description
|
624 |
+
direction = self._get_directional_description(region)
|
625 |
+
|
626 |
+
# Create descriptive zone
|
627 |
+
traffic_zones["vehicle_zone"] = {
|
628 |
+
"region": region,
|
629 |
+
"objects": vehicle_types,
|
630 |
+
"description": f"Vehicle traffic area with {', '.join(unique_types[:3])}" +
|
631 |
+
(f" in {direction} area" if direction else "")
|
632 |
+
}
|
633 |
+
|
634 |
+
# If vehicles are distributed across multiple regions, create secondary zones
|
635 |
+
if len(vehicle_regions) > 1:
|
636 |
+
# Get second most populated region
|
637 |
+
sorted_regions = sorted(vehicle_regions.items(), key=lambda x: len(x[1]), reverse=True)
|
638 |
+
if len(sorted_regions) > 1:
|
639 |
+
second_region, second_vehicles = sorted_regions[1]
|
640 |
+
direction = self._get_directional_description(second_region)
|
641 |
+
vehicle_types = [v["class_name"] for v in second_vehicles]
|
642 |
+
unique_types = list(set(vehicle_types))
|
643 |
+
|
644 |
+
traffic_zones["secondary_vehicle_zone"] = {
|
645 |
+
"region": second_region,
|
646 |
+
"objects": vehicle_types,
|
647 |
+
"description": f"Secondary traffic area with {', '.join(unique_types[:2])}" +
|
648 |
+
(f" in {direction} direction" if direction else "")
|
649 |
+
}
|
650 |
+
|
651 |
+
return traffic_zones
|
652 |
+
|
653 |
+
def _get_directional_description(self, region: str) -> str:
|
654 |
+
"""
|
655 |
+
Convert region name to a directional description.
|
656 |
+
|
657 |
+
Args:
|
658 |
+
region: Region name from the grid
|
659 |
+
|
660 |
+
Returns:
|
661 |
+
str: Directional description
|
662 |
+
"""
|
663 |
+
if "top" in region and "left" in region:
|
664 |
+
return "northwest"
|
665 |
+
elif "top" in region and "right" in region:
|
666 |
+
return "northeast"
|
667 |
+
elif "bottom" in region and "left" in region:
|
668 |
+
return "southwest"
|
669 |
+
elif "bottom" in region and "right" in region:
|
670 |
+
return "southeast"
|
671 |
+
elif "top" in region:
|
672 |
+
return "north"
|
673 |
+
elif "bottom" in region:
|
674 |
+
return "south"
|
675 |
+
elif "left" in region:
|
676 |
+
return "west"
|
677 |
+
elif "right" in region:
|
678 |
+
return "east"
|
679 |
+
else:
|
680 |
+
return "central"
|
681 |
+
|
682 |
+
def _create_distribution_map(self, detected_objects: List[Dict]) -> Dict:
|
683 |
+
"""
|
684 |
+
Create a distribution map of objects across regions for spatial analysis.
|
685 |
+
|
686 |
+
Args:
|
687 |
+
detected_objects: List of detected objects
|
688 |
+
|
689 |
+
Returns:
|
690 |
+
Dict: Distribution map of objects by region and class
|
691 |
+
"""
|
692 |
+
distribution = {}
|
693 |
+
|
694 |
+
# Initialize all regions
|
695 |
+
for region in self.regions.keys():
|
696 |
+
distribution[region] = {
|
697 |
+
"total": 0,
|
698 |
+
"objects": {},
|
699 |
+
"density": 0
|
700 |
+
}
|
701 |
+
|
702 |
+
# Populate the distribution
|
703 |
+
for obj in detected_objects:
|
704 |
+
region = obj["region"]
|
705 |
+
class_id = obj["class_id"]
|
706 |
+
class_name = obj["class_name"]
|
707 |
+
|
708 |
+
distribution[region]["total"] += 1
|
709 |
+
|
710 |
+
if class_id not in distribution[region]["objects"]:
|
711 |
+
distribution[region]["objects"][class_id] = {
|
712 |
+
"name": class_name,
|
713 |
+
"count": 0,
|
714 |
+
"positions": []
|
715 |
+
}
|
716 |
+
|
717 |
+
distribution[region]["objects"][class_id]["count"] += 1
|
718 |
+
|
719 |
+
# Store position for spatial relationship analysis
|
720 |
+
if "normalized_center" in obj:
|
721 |
+
distribution[region]["objects"][class_id]["positions"].append(obj["normalized_center"])
|
722 |
+
|
723 |
+
# Calculate object density for each region
|
724 |
+
for region, data in distribution.items():
|
725 |
+
# Assuming all regions are equal size in the grid
|
726 |
+
data["density"] = data["total"] / 1
|
727 |
+
|
728 |
+
return distribution
|
729 |
+
|
730 |
+
def _identify_asian_cultural_zones(self, category_regions: Dict, detected_objects: List[Dict], scene_type: str) -> Dict:
|
731 |
+
"""
|
732 |
+
Identify functional zones for scenes with Asian cultural context.
|
733 |
+
|
734 |
+
Args:
|
735 |
+
category_regions: Objects grouped by category and region
|
736 |
+
detected_objects: List of detected objects
|
737 |
+
scene_type: Specific scene type
|
738 |
+
|
739 |
+
Returns:
|
740 |
+
Dict: Asian cultural functional zones
|
741 |
+
"""
|
742 |
+
zones = {}
|
743 |
+
|
744 |
+
# Identify storefront zone
|
745 |
+
storefront_items = []
|
746 |
+
storefront_regions = {}
|
747 |
+
|
748 |
+
# Since storefronts aren't directly detectable, infer from context
|
749 |
+
# For example, look for regions with signs, people, and smaller objects
|
750 |
+
sign_regions = set()
|
751 |
+
for obj in detected_objects:
|
752 |
+
if obj["class_id"] == 0: # Person
|
753 |
+
region = obj["region"]
|
754 |
+
if region not in storefront_regions:
|
755 |
+
storefront_regions[region] = []
|
756 |
+
storefront_regions[region].append(obj)
|
757 |
+
|
758 |
+
# Add regions with people as potential storefront areas
|
759 |
+
sign_regions.add(region)
|
760 |
+
|
761 |
+
# Use the areas with most people as storefront zones
|
762 |
+
if storefront_regions:
|
763 |
+
main_storefront_regions = sorted(storefront_regions.items(),
|
764 |
+
key=lambda x: len(x[1]),
|
765 |
+
reverse=True)[:2] # Top 2 regions
|
766 |
+
|
767 |
+
for idx, (region, objs) in enumerate(main_storefront_regions):
|
768 |
+
zones[f"commercial_zone_{idx+1}"] = {
|
769 |
+
"region": region,
|
770 |
+
"objects": [obj["class_name"] for obj in objs],
|
771 |
+
"description": f"Asian commercial storefront with pedestrian activity"
|
772 |
+
}
|
773 |
+
|
774 |
+
# Identify pedestrian pathway - enhanced to better detect linear pathways
|
775 |
+
pathway_items = []
|
776 |
+
pathway_regions = {}
|
777 |
+
|
778 |
+
# Extract people for pathway analysis
|
779 |
+
people_objs = [obj for obj in detected_objects if obj["class_id"] == 0]
|
780 |
+
|
781 |
+
# Analyze if people form a line (typical of shopping streets)
|
782 |
+
people_positions = [obj["normalized_center"] for obj in people_objs]
|
783 |
+
|
784 |
+
structured_path = False
|
785 |
+
if len(people_positions) >= 3:
|
786 |
+
# Check if people are arranged along a similar y-coordinate (horizontal path)
|
787 |
+
y_coords = [pos[1] for pos in people_positions]
|
788 |
+
y_mean = sum(y_coords) / len(y_coords)
|
789 |
+
y_variance = sum((y - y_mean)**2 for y in y_coords) / len(y_coords)
|
790 |
+
|
791 |
+
horizontal_path = y_variance < 0.05 # Low variance indicates horizontal alignment
|
792 |
+
|
793 |
+
# Check if people are arranged along a similar x-coordinate (vertical path)
|
794 |
+
x_coords = [pos[0] for pos in people_positions]
|
795 |
+
x_mean = sum(x_coords) / len(x_coords)
|
796 |
+
x_variance = sum((x - x_mean)**2 for x in x_coords) / len(x_coords)
|
797 |
+
|
798 |
+
vertical_path = x_variance < 0.05 # Low variance indicates vertical alignment
|
799 |
+
|
800 |
+
structured_path = horizontal_path or vertical_path
|
801 |
+
path_direction = "horizontal" if horizontal_path else "vertical" if vertical_path else "meandering"
|
802 |
+
|
803 |
+
# Collect pathway objects (people, bicycles, motorcycles in middle area)
|
804 |
+
for obj in detected_objects:
|
805 |
+
if obj["class_id"] in [0, 1, 3]: # Person, bicycle, motorcycle
|
806 |
+
y_pos = obj["normalized_center"][1]
|
807 |
+
# Group by vertical position (middle of image likely pathway)
|
808 |
+
if 0.25 <= y_pos <= 0.75:
|
809 |
+
region = obj["region"]
|
810 |
+
if region not in pathway_regions:
|
811 |
+
pathway_regions[region] = []
|
812 |
+
pathway_regions[region].append(obj)
|
813 |
+
pathway_items.append(obj["class_name"])
|
814 |
+
|
815 |
+
if pathway_items:
|
816 |
+
path_desc = "Pedestrian walkway with people moving through the commercial area"
|
817 |
+
if structured_path:
|
818 |
+
path_desc = f"{path_direction.capitalize()} pedestrian walkway with organized foot traffic"
|
819 |
+
|
820 |
+
zones["pedestrian_pathway"] = {
|
821 |
+
"region": "middle_center", # Assumption: pathway often in middle
|
822 |
+
"objects": list(set(pathway_items)),
|
823 |
+
"description": path_desc
|
824 |
+
}
|
825 |
+
|
826 |
+
# Identify vendor zone (small stalls/shops - inferred from context)
|
827 |
+
has_small_objects = any(obj["class_id"] in [24, 26, 39, 41] for obj in detected_objects) # bags, bottles, cups
|
828 |
+
has_people = any(obj["class_id"] == 0 for obj in detected_objects)
|
829 |
+
|
830 |
+
if has_small_objects and has_people:
|
831 |
+
# Likely vendor areas are where people and small objects cluster
|
832 |
+
small_obj_regions = {}
|
833 |
+
|
834 |
+
for obj in detected_objects:
|
835 |
+
if obj["class_id"] in [24, 26, 39, 41, 67]: # bags, bottles, cups, phones
|
836 |
+
region = obj["region"]
|
837 |
+
if region not in small_obj_regions:
|
838 |
+
small_obj_regions[region] = []
|
839 |
+
small_obj_regions[region].append(obj)
|
840 |
+
|
841 |
+
if small_obj_regions:
|
842 |
+
main_vendor_region = max(small_obj_regions.items(),
|
843 |
+
key=lambda x: len(x[1]),
|
844 |
+
default=(None, []))
|
845 |
+
|
846 |
+
if main_vendor_region[0] is not None:
|
847 |
+
vendor_items = [obj["class_name"] for obj in main_vendor_region[1]]
|
848 |
+
zones["vendor_zone"] = {
|
849 |
+
"region": main_vendor_region[0],
|
850 |
+
"objects": list(set(vendor_items)),
|
851 |
+
"description": "Vendor or market stall area with small merchandise"
|
852 |
+
}
|
853 |
+
|
854 |
+
# For night markets, identify illuminated zones
|
855 |
+
if scene_type == "asian_night_market":
|
856 |
+
# Night markets typically have bright spots for food stalls
|
857 |
+
# This would be enhanced with lighting analysis integration
|
858 |
+
zones["food_stall_zone"] = {
|
859 |
+
"region": "middle_center",
|
860 |
+
"objects": ["inferred food stalls"],
|
861 |
+
"description": "Food stall area typical of Asian night markets"
|
862 |
+
}
|
863 |
+
|
864 |
+
return zones
|
865 |
+
|
866 |
+
def _identify_upscale_dining_zones(self, category_regions: Dict, detected_objects: List[Dict]) -> Dict:
|
867 |
+
"""
|
868 |
+
Identify functional zones for upscale dining settings.
|
869 |
+
|
870 |
+
Args:
|
871 |
+
category_regions: Objects grouped by category and region
|
872 |
+
detected_objects: List of detected objects
|
873 |
+
|
874 |
+
Returns:
|
875 |
+
Dict: Upscale dining functional zones
|
876 |
+
"""
|
877 |
+
zones = {}
|
878 |
+
|
879 |
+
# Identify dining table zone
|
880 |
+
dining_items = []
|
881 |
+
dining_regions = {}
|
882 |
+
|
883 |
+
for obj in detected_objects:
|
884 |
+
if obj["class_id"] in [40, 41, 42, 43, 44, 45, 60]: # Wine glass, cup, fork, knife, spoon, bowl, table
|
885 |
+
region = obj["region"]
|
886 |
+
if region not in dining_regions:
|
887 |
+
dining_regions[region] = []
|
888 |
+
dining_regions[region].append(obj)
|
889 |
+
dining_items.append(obj["class_name"])
|
890 |
+
|
891 |
+
if dining_items:
|
892 |
+
main_dining_region = max(dining_regions.items(),
|
893 |
+
key=lambda x: len(x[1]),
|
894 |
+
default=(None, []))
|
895 |
+
|
896 |
+
if main_dining_region[0] is not None:
|
897 |
+
zones["formal_dining_zone"] = {
|
898 |
+
"region": main_dining_region[0],
|
899 |
+
"objects": list(set(dining_items)),
|
900 |
+
"description": f"Formal dining area with {', '.join(list(set(dining_items))[:3])}"
|
901 |
+
}
|
902 |
+
|
903 |
+
# Identify decorative zone with enhanced detection
|
904 |
+
decor_items = []
|
905 |
+
decor_regions = {}
|
906 |
+
|
907 |
+
# Look for decorative elements (vases, wine glasses, unused dishes)
|
908 |
+
for obj in detected_objects:
|
909 |
+
if obj["class_id"] in [75, 40]: # Vase, wine glass
|
910 |
+
region = obj["region"]
|
911 |
+
if region not in decor_regions:
|
912 |
+
decor_regions[region] = []
|
913 |
+
decor_regions[region].append(obj)
|
914 |
+
decor_items.append(obj["class_name"])
|
915 |
+
|
916 |
+
if decor_items:
|
917 |
+
main_decor_region = max(decor_regions.items(),
|
918 |
+
key=lambda x: len(x[1]),
|
919 |
+
default=(None, []))
|
920 |
+
|
921 |
+
if main_decor_region[0] is not None:
|
922 |
+
zones["decorative_zone"] = {
|
923 |
+
"region": main_decor_region[0],
|
924 |
+
"objects": list(set(decor_items)),
|
925 |
+
"description": f"Decorative area with {', '.join(list(set(decor_items)))}"
|
926 |
+
}
|
927 |
+
|
928 |
+
# Identify seating arrangement zone
|
929 |
+
chairs = [obj for obj in detected_objects if obj["class_id"] == 56] # chairs
|
930 |
+
if len(chairs) >= 2:
|
931 |
+
chair_regions = {}
|
932 |
+
for obj in chairs:
|
933 |
+
region = obj["region"]
|
934 |
+
if region not in chair_regions:
|
935 |
+
chair_regions[region] = []
|
936 |
+
chair_regions[region].append(obj)
|
937 |
+
|
938 |
+
if chair_regions:
|
939 |
+
main_seating_region = max(chair_regions.items(),
|
940 |
+
key=lambda x: len(x[1]),
|
941 |
+
default=(None, []))
|
942 |
+
|
943 |
+
if main_seating_region[0] is not None:
|
944 |
+
zones["dining_seating_zone"] = {
|
945 |
+
"region": main_seating_region[0],
|
946 |
+
"objects": ["chair"] * len(main_seating_region[1]),
|
947 |
+
"description": f"Formal dining seating arrangement with {len(main_seating_region[1])} chairs"
|
948 |
+
}
|
949 |
+
|
950 |
+
# Identify serving area (if different from dining area)
|
951 |
+
serving_items = []
|
952 |
+
serving_regions = {}
|
953 |
+
|
954 |
+
# Serving areas might have bottles, bowls, containers
|
955 |
+
for obj in detected_objects:
|
956 |
+
if obj["class_id"] in [39, 45]: # Bottle, bowl
|
957 |
+
# Check if it's in a different region from the main dining table
|
958 |
+
if "formal_dining_zone" in zones and obj["region"] != zones["formal_dining_zone"]["region"]:
|
959 |
+
region = obj["region"]
|
960 |
+
if region not in serving_regions:
|
961 |
+
serving_regions[region] = []
|
962 |
+
serving_regions[region].append(obj)
|
963 |
+
serving_items.append(obj["class_name"])
|
964 |
+
|
965 |
+
if serving_items:
|
966 |
+
main_serving_region = max(serving_regions.items(),
|
967 |
+
key=lambda x: len(x[1]),
|
968 |
+
default=(None, []))
|
969 |
+
|
970 |
+
if main_serving_region[0] is not None:
|
971 |
+
zones["serving_zone"] = {
|
972 |
+
"region": main_serving_region[0],
|
973 |
+
"objects": list(set(serving_items)),
|
974 |
+
"description": f"Serving or sideboard area with {', '.join(list(set(serving_items)))}"
|
975 |
+
}
|
976 |
+
|
977 |
+
return zones
|
978 |
+
|
979 |
+
def _identify_financial_district_zones(self, category_regions: Dict, detected_objects: List[Dict]) -> Dict:
|
980 |
+
"""
|
981 |
+
Identify functional zones for financial district scenes.
|
982 |
+
|
983 |
+
Args:
|
984 |
+
category_regions: Objects grouped by category and region
|
985 |
+
detected_objects: List of detected objects
|
986 |
+
|
987 |
+
Returns:
|
988 |
+
Dict: Financial district functional zones
|
989 |
+
"""
|
990 |
+
zones = {}
|
991 |
+
|
992 |
+
# Identify traffic zone
|
993 |
+
traffic_items = []
|
994 |
+
traffic_regions = {}
|
995 |
+
|
996 |
+
for obj in detected_objects:
|
997 |
+
if obj["class_id"] in [1, 2, 3, 5, 6, 7, 9]: # Various vehicles and traffic lights
|
998 |
+
region = obj["region"]
|
999 |
+
if region not in traffic_regions:
|
1000 |
+
traffic_regions[region] = []
|
1001 |
+
traffic_regions[region].append(obj)
|
1002 |
+
traffic_items.append(obj["class_name"])
|
1003 |
+
|
1004 |
+
if traffic_items:
|
1005 |
+
main_traffic_region = max(traffic_regions.items(),
|
1006 |
+
key=lambda x: len(x[1]),
|
1007 |
+
default=(None, []))
|
1008 |
+
|
1009 |
+
if main_traffic_region[0] is not None:
|
1010 |
+
zones["traffic_zone"] = {
|
1011 |
+
"region": main_traffic_region[0],
|
1012 |
+
"objects": list(set(traffic_items)),
|
1013 |
+
"description": f"Urban traffic area with {', '.join(list(set(traffic_items))[:3])}"
|
1014 |
+
}
|
1015 |
+
|
1016 |
+
# Building zones on the sides (inferred from scene context)
|
1017 |
+
# Enhanced to check if there are actual regions that might contain buildings
|
1018 |
+
# Check for regions without vehicles or pedestrians - likely building areas
|
1019 |
+
left_side_regions = ["top_left", "middle_left", "bottom_left"]
|
1020 |
+
right_side_regions = ["top_right", "middle_right", "bottom_right"]
|
1021 |
+
|
1022 |
+
# Check left side
|
1023 |
+
left_building_evidence = True
|
1024 |
+
for region in left_side_regions:
|
1025 |
+
# If many vehicles or people in this region, less likely to be buildings
|
1026 |
+
vehicle_in_region = any(obj["region"] == region and obj["class_id"] in [1, 2, 3, 5, 7]
|
1027 |
+
for obj in detected_objects)
|
1028 |
+
people_in_region = any(obj["region"] == region and obj["class_id"] == 0
|
1029 |
+
for obj in detected_objects)
|
1030 |
+
|
1031 |
+
if vehicle_in_region or people_in_region:
|
1032 |
+
left_building_evidence = False
|
1033 |
+
break
|
1034 |
+
|
1035 |
+
# Check right side
|
1036 |
+
right_building_evidence = True
|
1037 |
+
for region in right_side_regions:
|
1038 |
+
# If many vehicles or people in this region, less likely to be buildings
|
1039 |
+
vehicle_in_region = any(obj["region"] == region and obj["class_id"] in [1, 2, 3, 5, 7]
|
1040 |
+
for obj in detected_objects)
|
1041 |
+
people_in_region = any(obj["region"] == region and obj["class_id"] == 0
|
1042 |
+
for obj in detected_objects)
|
1043 |
+
|
1044 |
+
if vehicle_in_region or people_in_region:
|
1045 |
+
right_building_evidence = False
|
1046 |
+
break
|
1047 |
+
|
1048 |
+
# Add building zones if evidence supports them
|
1049 |
+
if left_building_evidence:
|
1050 |
+
zones["building_zone_left"] = {
|
1051 |
+
"region": "middle_left",
|
1052 |
+
"objects": ["building"], # Inferred
|
1053 |
+
"description": "Tall buildings line the left side of the street"
|
1054 |
+
}
|
1055 |
+
|
1056 |
+
if right_building_evidence:
|
1057 |
+
zones["building_zone_right"] = {
|
1058 |
+
"region": "middle_right",
|
1059 |
+
"objects": ["building"], # Inferred
|
1060 |
+
"description": "Tall buildings line the right side of the street"
|
1061 |
+
}
|
1062 |
+
|
1063 |
+
# Identify pedestrian zone if people are present
|
1064 |
+
people_objs = [obj for obj in detected_objects if obj["class_id"] == 0]
|
1065 |
+
if people_objs:
|
1066 |
+
people_regions = {}
|
1067 |
+
for obj in people_objs:
|
1068 |
+
region = obj["region"]
|
1069 |
+
if region not in people_regions:
|
1070 |
+
people_regions[region] = []
|
1071 |
+
people_regions[region].append(obj)
|
1072 |
+
|
1073 |
+
if people_regions:
|
1074 |
+
main_pedestrian_region = max(people_regions.items(),
|
1075 |
+
key=lambda x: len(x[1]),
|
1076 |
+
default=(None, []))
|
1077 |
+
|
1078 |
+
if main_pedestrian_region[0] is not None:
|
1079 |
+
zones["pedestrian_zone"] = {
|
1080 |
+
"region": main_pedestrian_region[0],
|
1081 |
+
"objects": ["person"] * len(main_pedestrian_region[1]),
|
1082 |
+
"description": f"Pedestrian area with {len(main_pedestrian_region[1])} people navigating the financial district"
|
1083 |
+
}
|
1084 |
+
|
1085 |
+
return zones
|
1086 |
+
|
1087 |
+
def _identify_aerial_view_zones(self, category_regions: Dict, detected_objects: List[Dict], scene_type: str) -> Dict:
|
1088 |
+
"""
|
1089 |
+
Identify functional zones for scenes viewed from an aerial perspective.
|
1090 |
+
|
1091 |
+
Args:
|
1092 |
+
category_regions: Objects grouped by category and region
|
1093 |
+
detected_objects: List of detected objects
|
1094 |
+
scene_type: Specific scene type
|
1095 |
+
|
1096 |
+
Returns:
|
1097 |
+
Dict: Aerial view functional zones
|
1098 |
+
"""
|
1099 |
+
zones = {}
|
1100 |
+
|
1101 |
+
# For aerial views, we focus on patterns and flows rather than specific zones
|
1102 |
+
|
1103 |
+
# Identify pedestrian patterns
|
1104 |
+
people_objs = [obj for obj in detected_objects if obj["class_id"] == 0]
|
1105 |
+
if people_objs:
|
1106 |
+
# Convert positions to arrays for pattern analysis
|
1107 |
+
positions = np.array([obj["normalized_center"] for obj in people_objs])
|
1108 |
+
|
1109 |
+
if len(positions) >= 3:
|
1110 |
+
# Calculate distribution metrics
|
1111 |
+
x_coords = positions[:, 0]
|
1112 |
+
y_coords = positions[:, 1]
|
1113 |
+
|
1114 |
+
x_mean = np.mean(x_coords)
|
1115 |
+
y_mean = np.mean(y_coords)
|
1116 |
+
x_std = np.std(x_coords)
|
1117 |
+
y_std = np.std(y_coords)
|
1118 |
+
|
1119 |
+
# Determine if people are organized in a linear pattern
|
1120 |
+
if x_std < 0.1 or y_std < 0.1:
|
1121 |
+
# Linear distribution along one axis
|
1122 |
+
pattern_direction = "vertical" if x_std < y_std else "horizontal"
|
1123 |
+
|
1124 |
+
zones["pedestrian_pattern"] = {
|
1125 |
+
"region": "central",
|
1126 |
+
"objects": ["person"] * len(people_objs),
|
1127 |
+
"description": f"Aerial view shows a {pattern_direction} pedestrian movement pattern"
|
1128 |
+
}
|
1129 |
+
else:
|
1130 |
+
# More dispersed pattern
|
1131 |
+
zones["pedestrian_distribution"] = {
|
1132 |
+
"region": "wide",
|
1133 |
+
"objects": ["person"] * len(people_objs),
|
1134 |
+
"description": f"Aerial view shows pedestrians distributed across the area"
|
1135 |
+
}
|
1136 |
+
|
1137 |
+
# Identify vehicle patterns for traffic analysis
|
1138 |
+
vehicle_objs = [obj for obj in detected_objects if obj["class_id"] in [1, 2, 3, 5, 6, 7]]
|
1139 |
+
if vehicle_objs:
|
1140 |
+
# Convert positions to arrays for pattern analysis
|
1141 |
+
positions = np.array([obj["normalized_center"] for obj in vehicle_objs])
|
1142 |
+
|
1143 |
+
if len(positions) >= 2:
|
1144 |
+
# Calculate distribution metrics
|
1145 |
+
x_coords = positions[:, 0]
|
1146 |
+
y_coords = positions[:, 1]
|
1147 |
+
|
1148 |
+
x_mean = np.mean(x_coords)
|
1149 |
+
y_mean = np.mean(y_coords)
|
1150 |
+
x_std = np.std(x_coords)
|
1151 |
+
y_std = np.std(y_coords)
|
1152 |
+
|
1153 |
+
# Determine if vehicles are organized in lanes
|
1154 |
+
if x_std < y_std * 0.5:
|
1155 |
+
# Vehicles aligned vertically - indicates north-south traffic
|
1156 |
+
zones["vertical_traffic_flow"] = {
|
1157 |
+
"region": "central_vertical",
|
1158 |
+
"objects": [obj["class_name"] for obj in vehicle_objs[:5]],
|
1159 |
+
"description": "North-south traffic flow visible from aerial view"
|
1160 |
+
}
|
1161 |
+
elif y_std < x_std * 0.5:
|
1162 |
+
# Vehicles aligned horizontally - indicates east-west traffic
|
1163 |
+
zones["horizontal_traffic_flow"] = {
|
1164 |
+
"region": "central_horizontal",
|
1165 |
+
"objects": [obj["class_name"] for obj in vehicle_objs[:5]],
|
1166 |
+
"description": "East-west traffic flow visible from aerial view"
|
1167 |
+
}
|
1168 |
+
else:
|
1169 |
+
# Vehicles in multiple directions - indicates intersection
|
1170 |
+
zones["intersection_traffic"] = {
|
1171 |
+
"region": "central",
|
1172 |
+
"objects": [obj["class_name"] for obj in vehicle_objs[:5]],
|
1173 |
+
"description": "Multi-directional traffic at intersection visible from aerial view"
|
1174 |
+
}
|
1175 |
+
|
1176 |
+
# For intersection specific aerial views, identify crossing patterns
|
1177 |
+
if "intersection" in scene_type:
|
1178 |
+
# Check for traffic signals
|
1179 |
+
traffic_light_objs = [obj for obj in detected_objects if obj["class_id"] == 9]
|
1180 |
+
if traffic_light_objs:
|
1181 |
+
zones["traffic_control_pattern"] = {
|
1182 |
+
"region": "intersection",
|
1183 |
+
"objects": ["traffic light"] * len(traffic_light_objs),
|
1184 |
+
"description": f"Intersection traffic control with {len(traffic_light_objs)} signals visible from above"
|
1185 |
+
}
|
1186 |
+
|
1187 |
+
# Crosswalks are inferred from context in aerial views
|
1188 |
+
zones["crossing_pattern"] = {
|
1189 |
+
"region": "central",
|
1190 |
+
"objects": ["inferred crosswalk"],
|
1191 |
+
"description": "Crossing pattern visible from aerial perspective"
|
1192 |
+
}
|
1193 |
+
|
1194 |
+
# For plaza aerial views, identify gathering patterns
|
1195 |
+
if "plaza" in scene_type:
|
1196 |
+
# Plazas typically have central open area with people
|
1197 |
+
if people_objs:
|
1198 |
+
# Check if people are clustered in central region
|
1199 |
+
central_people = [obj for obj in people_objs
|
1200 |
+
if "middle" in obj["region"]]
|
1201 |
+
|
1202 |
+
if central_people:
|
1203 |
+
zones["central_gathering"] = {
|
1204 |
+
"region": "middle_center",
|
1205 |
+
"objects": ["person"] * len(central_people),
|
1206 |
+
"description": f"Central plaza gathering area with {len(central_people)} people viewed from above"
|
1207 |
+
}
|
1208 |
+
|
1209 |
+
return zones
|
1210 |
+
|
1211 |
+
def _identify_outdoor_general_zones(self, category_regions: Dict, detected_objects: List[Dict], scene_type: str) -> Dict:
|
1212 |
+
"""
|
1213 |
+
Identify functional zones for general outdoor scenes.
|
1214 |
+
|
1215 |
+
Args:
|
1216 |
+
category_regions: Objects grouped by category and region
|
1217 |
+
detected_objects: List of detected objects
|
1218 |
+
scene_type: Specific outdoor scene type
|
1219 |
+
|
1220 |
+
Returns:
|
1221 |
+
Dict: Outdoor functional zones
|
1222 |
+
"""
|
1223 |
+
zones = {}
|
1224 |
+
|
1225 |
+
# Identify pedestrian zones
|
1226 |
+
people_objs = [obj for obj in detected_objects if obj["class_id"] == 0]
|
1227 |
+
if people_objs:
|
1228 |
+
people_regions = {}
|
1229 |
+
for obj in people_objs:
|
1230 |
+
region = obj["region"]
|
1231 |
+
if region not in people_regions:
|
1232 |
+
people_regions[region] = []
|
1233 |
+
people_regions[region].append(obj)
|
1234 |
+
|
1235 |
+
if people_regions:
|
1236 |
+
# Find main pedestrian areas
|
1237 |
+
main_people_regions = sorted(people_regions.items(),
|
1238 |
+
key=lambda x: len(x[1]),
|
1239 |
+
reverse=True)[:2] # Top 2 regions
|
1240 |
+
|
1241 |
+
for idx, (region, objs) in enumerate(main_people_regions):
|
1242 |
+
if len(objs) > 0:
|
1243 |
+
zones[f"pedestrian_zone_{idx+1}"] = {
|
1244 |
+
"region": region,
|
1245 |
+
"objects": ["person"] * len(objs),
|
1246 |
+
"description": f"Pedestrian area with {len(objs)} {'people' if len(objs) > 1 else 'person'}"
|
1247 |
+
}
|
1248 |
+
|
1249 |
+
# Identify vehicle zones for streets and parking lots
|
1250 |
+
vehicle_objs = [obj for obj in detected_objects if obj["class_id"] in [1, 2, 3, 5, 6, 7]]
|
1251 |
+
if vehicle_objs:
|
1252 |
+
vehicle_regions = {}
|
1253 |
+
for obj in vehicle_objs:
|
1254 |
+
region = obj["region"]
|
1255 |
+
if region not in vehicle_regions:
|
1256 |
+
vehicle_regions[region] = []
|
1257 |
+
vehicle_regions[region].append(obj)
|
1258 |
+
|
1259 |
+
if vehicle_regions:
|
1260 |
+
main_vehicle_region = max(vehicle_regions.items(),
|
1261 |
+
key=lambda x: len(x[1]),
|
1262 |
+
default=(None, []))
|
1263 |
+
|
1264 |
+
if main_vehicle_region[0] is not None:
|
1265 |
+
vehicle_types = [obj["class_name"] for obj in main_vehicle_region[1]]
|
1266 |
+
zones["vehicle_zone"] = {
|
1267 |
+
"region": main_vehicle_region[0],
|
1268 |
+
"objects": vehicle_types,
|
1269 |
+
"description": f"Traffic area with {', '.join(list(set(vehicle_types))[:3])}"
|
1270 |
+
}
|
1271 |
+
|
1272 |
+
# For park areas, identify recreational zones
|
1273 |
+
if scene_type == "park_area":
|
1274 |
+
# Look for recreational objects (sports balls, kites, etc.)
|
1275 |
+
rec_items = []
|
1276 |
+
rec_regions = {}
|
1277 |
+
|
1278 |
+
for obj in detected_objects:
|
1279 |
+
if obj["class_id"] in [32, 33, 34, 35, 38]: # sports ball, kite, baseball bat, glove, tennis racket
|
1280 |
+
region = obj["region"]
|
1281 |
+
if region not in rec_regions:
|
1282 |
+
rec_regions[region] = []
|
1283 |
+
rec_regions[region].append(obj)
|
1284 |
+
rec_items.append(obj["class_name"])
|
1285 |
+
|
1286 |
+
if rec_items:
|
1287 |
+
main_rec_region = max(rec_regions.items(),
|
1288 |
+
key=lambda x: len(x[1]),
|
1289 |
+
default=(None, []))
|
1290 |
+
|
1291 |
+
if main_rec_region[0] is not None:
|
1292 |
+
zones["recreational_zone"] = {
|
1293 |
+
"region": main_rec_region[0],
|
1294 |
+
"objects": list(set(rec_items)),
|
1295 |
+
"description": f"Recreational area with {', '.join(list(set(rec_items)))}"
|
1296 |
+
}
|
1297 |
+
|
1298 |
+
# For parking lots, identify parking zones
|
1299 |
+
if scene_type == "parking_lot":
|
1300 |
+
# Look for parked cars with consistent spacing
|
1301 |
+
car_objs = [obj for obj in detected_objects if obj["class_id"] == 2] # cars
|
1302 |
+
|
1303 |
+
if len(car_objs) >= 3:
|
1304 |
+
# Check if cars are arranged in patterns (simplified)
|
1305 |
+
car_positions = [obj["normalized_center"] for obj in car_objs]
|
1306 |
+
|
1307 |
+
# Check for row patterns by analyzing vertical positions
|
1308 |
+
y_coords = [pos[1] for pos in car_positions]
|
1309 |
+
y_clusters = {}
|
1310 |
+
|
1311 |
+
# Simplified clustering - group cars by similar y-coordinates
|
1312 |
+
for i, y in enumerate(y_coords):
|
1313 |
+
assigned = False
|
1314 |
+
for cluster_y in y_clusters.keys():
|
1315 |
+
if abs(y - cluster_y) < 0.1: # Within 10% of image height
|
1316 |
+
y_clusters[cluster_y].append(i)
|
1317 |
+
assigned = True
|
1318 |
+
break
|
1319 |
+
|
1320 |
+
if not assigned:
|
1321 |
+
y_clusters[y] = [i]
|
1322 |
+
|
1323 |
+
# If we have row patterns
|
1324 |
+
if max(len(indices) for indices in y_clusters.values()) >= 2:
|
1325 |
+
zones["parking_row"] = {
|
1326 |
+
"region": "central",
|
1327 |
+
"objects": ["car"] * len(car_objs),
|
1328 |
+
"description": f"Organized parking area with vehicles arranged in rows"
|
1329 |
+
}
|
1330 |
+
else:
|
1331 |
+
zones["parking_area"] = {
|
1332 |
+
"region": "wide",
|
1333 |
+
"objects": ["car"] * len(car_objs),
|
1334 |
+
"description": f"Parking area with {len(car_objs)} vehicles"
|
1335 |
+
}
|
1336 |
+
|
1337 |
+
return zones
|
1338 |
+
|
1339 |
+
def _identify_default_zones(self, category_regions: Dict, detected_objects: List[Dict]) -> Dict:
|
1340 |
+
"""
|
1341 |
+
Identify general functional zones when no specific scene type is matched.
|
1342 |
+
|
1343 |
+
Args:
|
1344 |
+
category_regions: Objects grouped by category and region
|
1345 |
+
detected_objects: List of detected objects
|
1346 |
+
|
1347 |
+
Returns:
|
1348 |
+
Dict: Default functional zones
|
1349 |
+
"""
|
1350 |
+
zones = {}
|
1351 |
+
|
1352 |
+
# Group objects by category and find main concentrations
|
1353 |
+
for category, regions in category_regions.items():
|
1354 |
+
if not regions:
|
1355 |
+
continue
|
1356 |
+
|
1357 |
+
# Find region with most objects in this category
|
1358 |
+
main_region = max(regions.items(),
|
1359 |
+
key=lambda x: len(x[1]),
|
1360 |
+
default=(None, []))
|
1361 |
+
|
1362 |
+
if main_region[0] is None or len(main_region[1]) < 2:
|
1363 |
+
continue
|
1364 |
+
|
1365 |
+
# Create zone based on object category
|
1366 |
+
zone_objects = [obj["class_name"] for obj in main_region[1]]
|
1367 |
+
|
1368 |
+
# Skip if too few objects
|
1369 |
+
if len(zone_objects) < 2:
|
1370 |
+
continue
|
1371 |
+
|
1372 |
+
# Create appropriate zone name and description based on category
|
1373 |
+
if category == "furniture":
|
1374 |
+
zones["furniture_zone"] = {
|
1375 |
+
"region": main_region[0],
|
1376 |
+
"objects": zone_objects,
|
1377 |
+
"description": f"Area with furniture including {', '.join(zone_objects[:3])}"
|
1378 |
+
}
|
1379 |
+
elif category == "electronics":
|
1380 |
+
zones["electronics_zone"] = {
|
1381 |
+
"region": main_region[0],
|
1382 |
+
"objects": zone_objects,
|
1383 |
+
"description": f"Area with electronic devices including {', '.join(zone_objects[:3])}"
|
1384 |
+
}
|
1385 |
+
elif category == "kitchen_items":
|
1386 |
+
zones["dining_zone"] = {
|
1387 |
+
"region": main_region[0],
|
1388 |
+
"objects": zone_objects,
|
1389 |
+
"description": f"Dining or food area with {', '.join(zone_objects[:3])}"
|
1390 |
+
}
|
1391 |
+
elif category == "vehicles":
|
1392 |
+
zones["vehicle_zone"] = {
|
1393 |
+
"region": main_region[0],
|
1394 |
+
"objects": zone_objects,
|
1395 |
+
"description": f"Area with vehicles including {', '.join(zone_objects[:3])}"
|
1396 |
+
}
|
1397 |
+
elif category == "personal_items":
|
1398 |
+
zones["personal_items_zone"] = {
|
1399 |
+
"region": main_region[0],
|
1400 |
+
"objects": zone_objects,
|
1401 |
+
"description": f"Area with personal items including {', '.join(zone_objects[:3])}"
|
1402 |
+
}
|
1403 |
+
|
1404 |
+
# Check for people groups
|
1405 |
+
people_objs = [obj for obj in detected_objects if obj["class_id"] == 0]
|
1406 |
+
if len(people_objs) >= 2:
|
1407 |
+
people_regions = {}
|
1408 |
+
for obj in people_objs:
|
1409 |
+
region = obj["region"]
|
1410 |
+
if region not in people_regions:
|
1411 |
+
people_regions[region] = []
|
1412 |
+
people_regions[region].append(obj)
|
1413 |
+
|
1414 |
+
if people_regions:
|
1415 |
+
main_people_region = max(people_regions.items(),
|
1416 |
+
key=lambda x: len(x[1]),
|
1417 |
+
default=(None, []))
|
1418 |
+
|
1419 |
+
if main_people_region[0] is not None:
|
1420 |
+
zones["people_zone"] = {
|
1421 |
+
"region": main_people_region[0],
|
1422 |
+
"objects": ["person"] * len(main_people_region[1]),
|
1423 |
+
"description": f"Area with {len(main_people_region[1])} people"
|
1424 |
+
}
|
1425 |
+
|
1426 |
+
return zones
|
1427 |
+
|
1428 |
+
def _find_main_region(self, region_objects_dict: Dict) -> str:
|
1429 |
+
"""Find the main region with the most objects"""
|
1430 |
+
if not region_objects_dict:
|
1431 |
+
return "unknown"
|
1432 |
+
|
1433 |
+
return max(region_objects_dict.items(),
|
1434 |
+
key=lambda x: len(x[1]),
|
1435 |
+
default=("unknown", []))[0]
|
1436 |
+
|
1437 |
+
def _find_main_region(self, region_objects_dict: Dict) -> str:
|
1438 |
+
"""Find the main region with the most objects"""
|
1439 |
+
if not region_objects_dict:
|
1440 |
+
return "unknown"
|
1441 |
+
|
1442 |
+
return max(region_objects_dict.items(),
|
1443 |
+
key=lambda x: len(x[1]),
|
1444 |
+
default=("unknown", []))[0]
|
street_04.jpg
ADDED
![]() |
Git LFS Details
|
style.py
CHANGED
@@ -1,7 +1,9 @@
|
|
|
|
1 |
class Style:
|
|
|
2 |
@staticmethod
|
3 |
def get_css():
|
4 |
-
|
5 |
css = """
|
6 |
/* Base styles and typography */
|
7 |
body {
|
@@ -13,20 +15,20 @@ class Style:
|
|
13 |
justify-content: center;
|
14 |
min-height: 100vh;
|
15 |
}
|
16 |
-
|
17 |
/* Typography improvements */
|
18 |
h1, h2, h3, h4, h5, h6, p, span, div, label, button {
|
19 |
font-family: Arial, sans-serif;
|
20 |
}
|
21 |
-
|
22 |
/* Container styling */
|
23 |
.gradio-container {
|
24 |
max-width: 1200px !important;
|
25 |
-
margin:
|
26 |
padding: 1rem;
|
27 |
width: 100%;
|
28 |
}
|
29 |
-
|
30 |
/* Header area styling with gradient background */
|
31 |
.app-header {
|
32 |
text-align: center;
|
@@ -37,7 +39,7 @@ class Style:
|
|
37 |
box-shadow: 0 2px 10px rgba(0, 0, 0, 0.05);
|
38 |
width: 100%;
|
39 |
}
|
40 |
-
|
41 |
.app-title {
|
42 |
color: #2D3748;
|
43 |
font-size: 2.5rem;
|
@@ -47,21 +49,21 @@ class Style:
|
|
47 |
-webkit-text-fill-color: transparent;
|
48 |
font-weight: bold;
|
49 |
}
|
50 |
-
|
51 |
.app-subtitle {
|
52 |
color: #4A5568;
|
53 |
font-size: 1.2rem;
|
54 |
font-weight: normal;
|
55 |
margin-top: 0.25rem;
|
56 |
}
|
57 |
-
|
58 |
.app-divider {
|
59 |
width: 80px;
|
60 |
height: 3px;
|
61 |
background: linear-gradient(90deg, #38b2ac, #4299e1);
|
62 |
margin: 1rem auto;
|
63 |
}
|
64 |
-
|
65 |
/* Panel styling - gradient background */
|
66 |
.input-panel, .output-panel {
|
67 |
background: white;
|
@@ -70,20 +72,20 @@ class Style:
|
|
70 |
box-shadow: 0 2px 8px rgba(0, 0, 0, 0.08);
|
71 |
margin: 0 auto 1rem auto;
|
72 |
}
|
73 |
-
|
74 |
-
/*
|
75 |
-
.
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
margin-top: 0.5rem;
|
81 |
-
text-align: center;
|
82 |
-
padding: 0.8rem;
|
83 |
-
background: linear-gradient(to right, #e6f3fc, #f0f9ff);
|
84 |
-
border-radius: 8px;
|
85 |
}
|
86 |
-
|
|
|
|
|
|
|
|
|
|
|
87 |
/* How-to-use section with gradient background */
|
88 |
.how-to-use {
|
89 |
background: linear-gradient(135deg, #f8fafc, #e8f4fd);
|
@@ -93,7 +95,7 @@ class Style:
|
|
93 |
box-shadow: 0 2px 8px rgba(0, 0, 0, 0.05);
|
94 |
color: #2d3748;
|
95 |
}
|
96 |
-
|
97 |
/* Detection button styling */
|
98 |
.detect-btn {
|
99 |
background: linear-gradient(90deg, #38b2ac, #4299e1) !important;
|
@@ -108,41 +110,40 @@ class Style:
|
|
108 |
margin: 1rem auto !important;
|
109 |
font-family: Arial, sans-serif !important;
|
110 |
}
|
111 |
-
|
112 |
.detect-btn:hover {
|
113 |
transform: translateY(-2px) !important;
|
114 |
box-shadow: 0 4px 8px rgba(0, 0, 0, 0.2) !important;
|
115 |
}
|
116 |
-
|
117 |
.detect-btn:active {
|
118 |
transform: translateY(1px) !important;
|
119 |
box-shadow: 0 2px 4px rgba(0, 0, 0, 0.2) !important;
|
120 |
}
|
121 |
-
|
122 |
/* JSON display improvements */
|
123 |
-
.json-display
|
124 |
-
|
125 |
-
|
126 |
-
padding: 1rem;
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
box-shadow: inset 0 0 4px rgba(0, 0, 0, 0.1);
|
132 |
}
|
133 |
-
|
134 |
.json-key {
|
135 |
color: #e53e3e;
|
136 |
}
|
137 |
-
|
138 |
.json-value {
|
139 |
color: #2b6cb0;
|
140 |
}
|
141 |
-
|
142 |
.json-string {
|
143 |
color: #38a169;
|
144 |
}
|
145 |
-
|
146 |
/* Chart/plot styling improvements */
|
147 |
.plot-container {
|
148 |
background: white;
|
@@ -150,32 +151,39 @@ class Style:
|
|
150 |
padding: 0.5rem;
|
151 |
box-shadow: 0 2px 6px rgba(0, 0, 0, 0.05);
|
152 |
}
|
153 |
-
|
154 |
/* Larger font for plots */
|
155 |
.plot-container text {
|
156 |
font-family: Arial, sans-serif !important;
|
157 |
font-size: 14px !important;
|
158 |
}
|
159 |
-
|
160 |
/* Title styling for charts */
|
161 |
.plot-title {
|
162 |
font-family: Arial, sans-serif !important;
|
163 |
font-size: 16px !important;
|
164 |
font-weight: bold !important;
|
165 |
}
|
166 |
-
|
167 |
/* Tab styling with subtle gradient */
|
168 |
.tabs {
|
169 |
width: 100%;
|
170 |
display: flex;
|
171 |
justify-content: center;
|
172 |
}
|
173 |
-
|
174 |
.tabs > div:first-child {
|
175 |
background: linear-gradient(to right, #f8fafc, #e8f4fd) !important;
|
176 |
border-radius: 8px 8px 0 0;
|
177 |
}
|
178 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
179 |
/* Footer styling with gradient background */
|
180 |
.footer {
|
181 |
text-align: center;
|
@@ -188,7 +196,7 @@ class Style:
|
|
188 |
box-shadow: 0 2px 8px rgba(0, 0, 0, 0.05);
|
189 |
width: 100%;
|
190 |
}
|
191 |
-
|
192 |
/* Ensure centering works for all elements */
|
193 |
.container, .gr-container, .gr-row, .gr-col {
|
194 |
display: flex;
|
@@ -197,86 +205,175 @@ class Style:
|
|
197 |
justify-content: center;
|
198 |
width: 100%;
|
199 |
}
|
200 |
-
|
201 |
-
/*
|
202 |
-
|
203 |
width: 100% !important;
|
204 |
max-width: 100% !important;
|
|
|
205 |
box-sizing: border-box !important;
|
206 |
}
|
207 |
-
|
208 |
-
|
|
|
209 |
width: 100% !important;
|
210 |
-
|
|
|
|
|
|
|
211 |
font-family: 'Arial', sans-serif !important;
|
212 |
font-size: 14px !important;
|
213 |
-
line-height: 1.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
214 |
padding: 16px !important;
|
|
|
215 |
white-space: pre-wrap !important;
|
216 |
-
|
217 |
border-radius: 8px !important;
|
218 |
-
min-height:
|
219 |
-
resize: none !important;
|
220 |
overflow-y: auto !important;
|
221 |
border: 1px solid #e2e8f0 !important;
|
|
|
222 |
display: block !important;
|
|
|
|
|
|
|
223 |
}
|
224 |
-
|
225 |
-
/*
|
226 |
-
.
|
227 |
-
width: 100% !important;
|
228 |
-
margin-top: 1.5rem;
|
229 |
-
background: linear-gradient(135deg, #f8fafc, #e8f4fd);
|
230 |
-
border-radius: 10px;
|
231 |
-
padding: 1rem;
|
232 |
-
box-shadow: 0 2px 8px rgba(0, 0, 0, 0.05);
|
233 |
-
}
|
234 |
-
|
235 |
-
/* 確保結果詳情面板內的元素寬度可以適應面板 */
|
236 |
-
.result-details-box > * {
|
237 |
width: 100% !important;
|
238 |
max-width: 100% !important;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
239 |
}
|
240 |
-
|
241 |
-
/*
|
242 |
-
.result-
|
243 |
width: 100% !important;
|
244 |
-
|
|
|
|
|
|
|
|
|
|
|
245 |
}
|
246 |
-
|
247 |
-
/*
|
248 |
-
.
|
249 |
-
|
250 |
-
|
251 |
-
|
252 |
padding: 0 !important;
|
|
|
253 |
}
|
254 |
-
|
255 |
-
/*
|
256 |
-
.
|
257 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
258 |
}
|
259 |
-
|
260 |
-
/*
|
261 |
.plot-column, .stats-column {
|
262 |
display: flex;
|
263 |
flex-direction: column;
|
264 |
padding: 1rem;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
265 |
}
|
266 |
-
|
267 |
-
/*
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
268 |
@media (max-width: 768px) {
|
269 |
.app-title {
|
270 |
font-size: 2rem;
|
271 |
}
|
272 |
-
|
273 |
.app-subtitle {
|
274 |
font-size: 1rem;
|
275 |
}
|
276 |
-
|
277 |
.gradio-container {
|
278 |
padding: 0.5rem;
|
279 |
}
|
|
|
|
|
|
|
|
|
|
|
280 |
}
|
281 |
"""
|
282 |
return css
|
|
|
1 |
+
|
2 |
class Style:
|
3 |
+
|
4 |
@staticmethod
|
5 |
def get_css():
|
6 |
+
|
7 |
css = """
|
8 |
/* Base styles and typography */
|
9 |
body {
|
|
|
15 |
justify-content: center;
|
16 |
min-height: 100vh;
|
17 |
}
|
18 |
+
|
19 |
/* Typography improvements */
|
20 |
h1, h2, h3, h4, h5, h6, p, span, div, label, button {
|
21 |
font-family: Arial, sans-serif;
|
22 |
}
|
23 |
+
|
24 |
/* Container styling */
|
25 |
.gradio-container {
|
26 |
max-width: 1200px !important;
|
27 |
+
margin: auto !important;
|
28 |
padding: 1rem;
|
29 |
width: 100%;
|
30 |
}
|
31 |
+
|
32 |
/* Header area styling with gradient background */
|
33 |
.app-header {
|
34 |
text-align: center;
|
|
|
39 |
box-shadow: 0 2px 10px rgba(0, 0, 0, 0.05);
|
40 |
width: 100%;
|
41 |
}
|
42 |
+
|
43 |
.app-title {
|
44 |
color: #2D3748;
|
45 |
font-size: 2.5rem;
|
|
|
49 |
-webkit-text-fill-color: transparent;
|
50 |
font-weight: bold;
|
51 |
}
|
52 |
+
|
53 |
.app-subtitle {
|
54 |
color: #4A5568;
|
55 |
font-size: 1.2rem;
|
56 |
font-weight: normal;
|
57 |
margin-top: 0.25rem;
|
58 |
}
|
59 |
+
|
60 |
.app-divider {
|
61 |
width: 80px;
|
62 |
height: 3px;
|
63 |
background: linear-gradient(90deg, #38b2ac, #4299e1);
|
64 |
margin: 1rem auto;
|
65 |
}
|
66 |
+
|
67 |
/* Panel styling - gradient background */
|
68 |
.input-panel, .output-panel {
|
69 |
background: white;
|
|
|
72 |
box-shadow: 0 2px 8px rgba(0, 0, 0, 0.08);
|
73 |
margin: 0 auto 1rem auto;
|
74 |
}
|
75 |
+
|
76 |
+
/* 修改輸出面板確保內容能夠完整顯示 */
|
77 |
+
.output-panel {
|
78 |
+
display: flex;
|
79 |
+
flex-direction: column;
|
80 |
+
width: 100%;
|
81 |
+
padding: 0 !important;
|
|
|
|
|
|
|
|
|
|
|
82 |
}
|
83 |
+
|
84 |
+
/* 確保輸出面板內的元素寬度可以適應面板 */
|
85 |
+
.output-panel > * {
|
86 |
+
width: 100%;
|
87 |
+
}
|
88 |
+
|
89 |
/* How-to-use section with gradient background */
|
90 |
.how-to-use {
|
91 |
background: linear-gradient(135deg, #f8fafc, #e8f4fd);
|
|
|
95 |
box-shadow: 0 2px 8px rgba(0, 0, 0, 0.05);
|
96 |
color: #2d3748;
|
97 |
}
|
98 |
+
|
99 |
/* Detection button styling */
|
100 |
.detect-btn {
|
101 |
background: linear-gradient(90deg, #38b2ac, #4299e1) !important;
|
|
|
110 |
margin: 1rem auto !important;
|
111 |
font-family: Arial, sans-serif !important;
|
112 |
}
|
113 |
+
|
114 |
.detect-btn:hover {
|
115 |
transform: translateY(-2px) !important;
|
116 |
box-shadow: 0 4px 8px rgba(0, 0, 0, 0.2) !important;
|
117 |
}
|
118 |
+
|
119 |
.detect-btn:active {
|
120 |
transform: translateY(1px) !important;
|
121 |
box-shadow: 0 2px 4px rgba(0, 0, 0, 0.2) !important;
|
122 |
}
|
123 |
+
|
124 |
/* JSON display improvements */
|
125 |
+
.json-display {
|
126 |
+
width: 98% !important;
|
127 |
+
margin: 0.5rem auto 1.5rem auto !important;
|
128 |
+
padding: 1rem !important;
|
129 |
+
border-radius: 8px !important;
|
130 |
+
background-color: white !important;
|
131 |
+
border: 1px solid #E2E8F0 !important;
|
132 |
+
box-shadow: inset 0 1px 2px rgba(0, 0, 0, 0.05) !important;
|
|
|
133 |
}
|
134 |
+
|
135 |
.json-key {
|
136 |
color: #e53e3e;
|
137 |
}
|
138 |
+
|
139 |
.json-value {
|
140 |
color: #2b6cb0;
|
141 |
}
|
142 |
+
|
143 |
.json-string {
|
144 |
color: #38a169;
|
145 |
}
|
146 |
+
|
147 |
/* Chart/plot styling improvements */
|
148 |
.plot-container {
|
149 |
background: white;
|
|
|
151 |
padding: 0.5rem;
|
152 |
box-shadow: 0 2px 6px rgba(0, 0, 0, 0.05);
|
153 |
}
|
154 |
+
|
155 |
/* Larger font for plots */
|
156 |
.plot-container text {
|
157 |
font-family: Arial, sans-serif !important;
|
158 |
font-size: 14px !important;
|
159 |
}
|
160 |
+
|
161 |
/* Title styling for charts */
|
162 |
.plot-title {
|
163 |
font-family: Arial, sans-serif !important;
|
164 |
font-size: 16px !important;
|
165 |
font-weight: bold !important;
|
166 |
}
|
167 |
+
|
168 |
/* Tab styling with subtle gradient */
|
169 |
.tabs {
|
170 |
width: 100%;
|
171 |
display: flex;
|
172 |
justify-content: center;
|
173 |
}
|
174 |
+
|
175 |
.tabs > div:first-child {
|
176 |
background: linear-gradient(to right, #f8fafc, #e8f4fd) !important;
|
177 |
border-radius: 8px 8px 0 0;
|
178 |
}
|
179 |
+
|
180 |
+
/* Tab content styling - 確保內容區域有足夠寬度 */
|
181 |
+
.tab-content {
|
182 |
+
width: 100% !important;
|
183 |
+
box-sizing: border-box !important;
|
184 |
+
padding: 0 !important;
|
185 |
+
}
|
186 |
+
|
187 |
/* Footer styling with gradient background */
|
188 |
.footer {
|
189 |
text-align: center;
|
|
|
196 |
box-shadow: 0 2px 8px rgba(0, 0, 0, 0.05);
|
197 |
width: 100%;
|
198 |
}
|
199 |
+
|
200 |
/* Ensure centering works for all elements */
|
201 |
.container, .gr-container, .gr-row, .gr-col {
|
202 |
display: flex;
|
|
|
205 |
justify-content: center;
|
206 |
width: 100%;
|
207 |
}
|
208 |
+
|
209 |
+
/* 統一文本框樣式,確保寬度一致 */
|
210 |
+
.gr-textbox, .gr-textarea, .gr-text-input {
|
211 |
width: 100% !important;
|
212 |
max-width: 100% !important;
|
213 |
+
min-width: 100% !important;
|
214 |
box-sizing: border-box !important;
|
215 |
}
|
216 |
+
|
217 |
+
/* 確保文本區域可以適應容器寬度 */
|
218 |
+
textarea.gr-textarea, .gr-textbox textarea, .gr-text-input textarea {
|
219 |
width: 100% !important;
|
220 |
+
max-width: 100% !important;
|
221 |
+
min-width: 100% !important;
|
222 |
+
box-sizing: border-box !important;
|
223 |
+
padding: 16px !important;
|
224 |
font-family: 'Arial', sans-serif !important;
|
225 |
font-size: 14px !important;
|
226 |
+
line-height: 1.6 !important;
|
227 |
+
white-space: pre-wrap !important;
|
228 |
+
word-wrap: break-word !important;
|
229 |
+
word-break: normal !important;
|
230 |
+
}
|
231 |
+
|
232 |
+
/* 特別針對場景描述文本框樣式增強 */
|
233 |
+
#scene-description-text, #detection-details {
|
234 |
+
width: 100% !important;
|
235 |
+
min-width: 100% !important;
|
236 |
+
box-sizing: border-box !important;
|
237 |
padding: 16px !important;
|
238 |
+
line-height: 1.8 !important;
|
239 |
white-space: pre-wrap !important;
|
240 |
+
word-wrap: break-word !important;
|
241 |
border-radius: 8px !important;
|
242 |
+
min-height: 250px !important;
|
|
|
243 |
overflow-y: auto !important;
|
244 |
border: 1px solid #e2e8f0 !important;
|
245 |
+
background-color: white !important;
|
246 |
display: block !important;
|
247 |
+
font-family: 'Arial', sans-serif !important;
|
248 |
+
font-size: 14px !important;
|
249 |
+
margin: 0 !important;
|
250 |
}
|
251 |
+
|
252 |
+
/* 針對場景描述容器的樣式 */
|
253 |
+
.scene-description-container {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
254 |
width: 100% !important;
|
255 |
max-width: 100% !important;
|
256 |
+
box-sizing: border-box !important;
|
257 |
+
padding: 0 !important;
|
258 |
+
margin: 0 !important;
|
259 |
+
}
|
260 |
+
|
261 |
+
/* Scene Understanding Tab 特定樣式 */
|
262 |
+
.scene-understanding-tab .result-details-box {
|
263 |
+
display: flex !important;
|
264 |
+
flex-direction: column !important;
|
265 |
+
align-items: stretch !important;
|
266 |
+
width: 100% !important;
|
267 |
+
box-sizing: border-box !important;
|
268 |
+
padding: 0 !important;
|
269 |
}
|
270 |
+
|
271 |
+
/* 結果容器樣式 */
|
272 |
+
.result-container {
|
273 |
width: 100% !important;
|
274 |
+
padding: 1rem !important;
|
275 |
+
border-radius: 8px !important;
|
276 |
+
border: 1px solid #E2E8F0 !important;
|
277 |
+
margin-bottom: 1.5rem !important;
|
278 |
+
background-color: #F8FAFC !important;
|
279 |
+
box-sizing: border-box !important;
|
280 |
}
|
281 |
+
|
282 |
+
/* 結果文本框的樣式 */
|
283 |
+
.wide-result-text {
|
284 |
+
width: 100% !important;
|
285 |
+
min-width: 100% !important;
|
286 |
+
box-sizing: border-box !important;
|
287 |
padding: 0 !important;
|
288 |
+
margin: 0 !important;
|
289 |
}
|
290 |
+
|
291 |
+
/* 片段標題樣式 */
|
292 |
+
.section-heading {
|
293 |
+
font-size: 1.25rem !important;
|
294 |
+
font-weight: 600 !important;
|
295 |
+
color: #2D3748 !important;
|
296 |
+
margin: 1rem auto !important;
|
297 |
+
padding: 0.75rem 1rem !important;
|
298 |
+
background: linear-gradient(to right, #e6f3fc, #f0f9ff) !important;
|
299 |
+
border-radius: 8px !important;
|
300 |
+
width: 98% !important;
|
301 |
+
display: inline-block !important;
|
302 |
+
box-sizing: border-box !important;
|
303 |
+
text-align: center !important;
|
304 |
+
overflow: visible !important;
|
305 |
+
line-height: 1.5 !important;
|
306 |
+
box-shadow: 0 1px 3px rgba(0, 0, 0, 0.1) !important;
|
307 |
+
}
|
308 |
+
|
309 |
+
/* JSON 顯示區域樣式 */
|
310 |
+
.json-box {
|
311 |
+
width: 100% !important;
|
312 |
+
min-height: 200px !important;
|
313 |
+
overflow-y: auto !important;
|
314 |
+
background: white !important;
|
315 |
+
padding: 1rem !important;
|
316 |
+
border-radius: 8px !important;
|
317 |
+
box-shadow: inset 0 0 6px rgba(0, 0, 0, 0.1) !important;
|
318 |
+
font-family: monospace !important;
|
319 |
+
box-sizing: border-box !important;
|
320 |
}
|
321 |
+
|
322 |
+
/* 欄佈局調整 */
|
323 |
.plot-column, .stats-column {
|
324 |
display: flex;
|
325 |
flex-direction: column;
|
326 |
padding: 1rem;
|
327 |
+
box-sizing: border-box !important;
|
328 |
+
width: 100% !important;
|
329 |
+
}
|
330 |
+
|
331 |
+
/* statistics plot */
|
332 |
+
.large-plot-container {
|
333 |
+
width: 100% !important;
|
334 |
+
min-height: 400px !important;
|
335 |
+
box-sizing: border-box !important;
|
336 |
+
}
|
337 |
+
|
338 |
+
/* 增強 JSON 顯示 */
|
339 |
+
.enhanced-json-display {
|
340 |
+
background: white !important;
|
341 |
+
border-radius: 8px !important;
|
342 |
+
padding: 1rem !important;
|
343 |
+
box-shadow: inset 0 0 6px rgba(0, 0, 0, 0.1) !important;
|
344 |
+
width: 100% !important;
|
345 |
+
min-height: 300px !important;
|
346 |
+
max-height: 500px !important;
|
347 |
+
overflow-y: auto !important;
|
348 |
+
font-family: monospace !important;
|
349 |
+
box-sizing: border-box !important;
|
350 |
}
|
351 |
+
|
352 |
+
/* 確保全寬元素真正占滿整個寬度 */
|
353 |
+
.full-width-element {
|
354 |
+
width: 100% !important;
|
355 |
+
max-width: 100% !important;
|
356 |
+
box-sizing: border-box !important;
|
357 |
+
}
|
358 |
+
|
359 |
+
/* 響應式調整 */
|
360 |
@media (max-width: 768px) {
|
361 |
.app-title {
|
362 |
font-size: 2rem;
|
363 |
}
|
364 |
+
|
365 |
.app-subtitle {
|
366 |
font-size: 1rem;
|
367 |
}
|
368 |
+
|
369 |
.gradio-container {
|
370 |
padding: 0.5rem;
|
371 |
}
|
372 |
+
|
373 |
+
/* 在小螢幕上調整文本區域的高度 */
|
374 |
+
#scene-description-text, #detection-details {
|
375 |
+
min-height: 150px !important;
|
376 |
+
}
|
377 |
}
|
378 |
"""
|
379 |
return css
|
viewpoint_templates.py
ADDED
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
VIEWPOINT_TEMPLATES = {
|
3 |
+
"eye_level": {
|
4 |
+
"prefix": "From a standard eye-level perspective, ",
|
5 |
+
"observation": "the scene shows {scene_elements} arranged in a typical front-facing view."
|
6 |
+
},
|
7 |
+
"aerial": {
|
8 |
+
"prefix": "From an aerial perspective, ",
|
9 |
+
"observation": "the scene shows {scene_elements} as viewed from above, revealing the spatial layout."
|
10 |
+
},
|
11 |
+
"elevated": {
|
12 |
+
"prefix": "From an elevated viewpoint, ",
|
13 |
+
"observation": "the scene presents {scene_elements} with a slight downward angle."
|
14 |
+
},
|
15 |
+
"low_angle": {
|
16 |
+
"prefix": "From a low angle, ",
|
17 |
+
"observation": "the scene depicts {scene_elements} from below, emphasizing vertical elements."
|
18 |
+
}
|
19 |
+
}
|
visualization_helper.py
CHANGED
@@ -74,7 +74,7 @@ class VisualizationHelper:
|
|
74 |
for box, cls, conf in zip(boxes, classes, confs):
|
75 |
x1, y1, x2, y2 = box
|
76 |
cls_id = int(cls)
|
77 |
-
|
78 |
if filter_classes and cls_id not in filter_classes:
|
79 |
continue
|
80 |
|
|
|
74 |
for box, cls, conf in zip(boxes, classes, confs):
|
75 |
x1, y1, x2, y2 = box
|
76 |
cls_id = int(cls)
|
77 |
+
|
78 |
if filter_classes and cls_id not in filter_classes:
|
79 |
continue
|
80 |
|