# Scene analysis module (Hugging Face Space, runs on ZeroGPU).
import os
from collections import Counter
from typing import Any, Dict, List, Optional, Tuple

import numpy as np

from clip_analyzer import CLIPAnalyzer
from enhance_scene_describer import EnhancedSceneDescriber
from object_categories import OBJECT_CATEGORIES
from scene_description import SceneDescriptor
from scene_type import SCENE_TYPES
from spatial_analyzer import SpatialAnalyzer
class SceneAnalyzer:
    """
    Core class for scene analysis and understanding based on object detection results.

    Fuses YOLO object detections with optional CLIP scene scores to infer the
    scene type, derive functional zones, activities, and safety concerns, and
    generate a natural-language description of the scene.
    """

    def __init__(self, class_names: Optional[Dict[int, str]] = None):
        """
        Initialize the scene analyzer with optional class name mappings.

        Args:
            class_names: Dictionary mapping class IDs to class names. If None,
                the mapping is taken from the first detection result passed to
                ``analyze``.
        """
        self.class_names = class_names

        # Load scene type and object category definitions shared by all components.
        self.SCENE_TYPES = SCENE_TYPES
        self.OBJECT_CATEGORIES = OBJECT_CATEGORIES

        # Initialize helper components, forwarding the shared definitions.
        self.spatial_analyzer = SpatialAnalyzer(class_names=class_names, object_categories=self.OBJECT_CATEGORIES)
        self.descriptor = SceneDescriptor(scene_types=self.SCENE_TYPES, object_categories=self.OBJECT_CATEGORIES)
        self.scene_describer = EnhancedSceneDescriber(scene_types=self.SCENE_TYPES)

        # CLIP is optional: on any initialization failure, fall back to
        # YOLO-only scene analysis instead of aborting.
        try:
            self.clip_analyzer = CLIPAnalyzer()
            self.use_clip = True
        except Exception as e:
            print(f"Warning: Could not initialize CLIP analyzer: {e}")
            print("Scene analysis will proceed without CLIP. Install CLIP with 'pip install clip' for enhanced scene understanding.")
            self.use_clip = False

    def generate_scene_description(self,
                                   scene_type,
                                   detected_objects,
                                   confidence,
                                   lighting_info=None,
                                   functional_zones=None):
        """
        Generate a natural-language scene description.

        Args:
            scene_type: Identified scene type key.
            detected_objects: List of detected object dictionaries.
            confidence: Scene classification confidence.
            lighting_info: Optional lighting condition information.
            functional_zones: Optional functional zone information.

        Returns:
            str: The generated scene description.
        """
        return self.scene_describer.generate_description(
            scene_type,
            detected_objects,
            confidence,
            lighting_info,
            functional_zones
        )

    def _generate_scene_description(self, scene_type, detected_objects, confidence, lighting_info=None):
        """
        Legacy wrapper kept for backward compatibility; delegates to
        ``generate_scene_description`` after computing functional zones.
        """
        # Derive the functional zone information before delegating.
        functional_zones = self.spatial_analyzer._identify_functional_zones(detected_objects, scene_type)
        return self.generate_scene_description(
            scene_type,
            detected_objects,
            confidence,
            lighting_info,
            functional_zones
        )

    def _define_image_regions(self):
        """Define regions of the image for spatial analysis (3x3 grid).

        Populates ``self.regions`` with (x1, y1, x2, y2) bounds in normalized
        [0, 1] coordinates. NOTE(review): not invoked from ``__init__`` in this
        file — callers must invoke it explicitly before using ``self.regions``.
        """
        self.regions = {
            "top_left": (0, 0, 1/3, 1/3),
            "top_center": (1/3, 0, 2/3, 1/3),
            "top_right": (2/3, 0, 1, 1/3),
            "middle_left": (0, 1/3, 1/3, 2/3),
            "middle_center": (1/3, 1/3, 2/3, 2/3),
            "middle_right": (2/3, 1/3, 1, 2/3),
            "bottom_left": (0, 2/3, 1/3, 1),
            "bottom_center": (1/3, 2/3, 2/3, 1),
            "bottom_right": (2/3, 2/3, 1, 1)
        }

    def _empty_analysis(self, description: str, lighting_info: Optional[Dict]) -> Dict:
        """Build the result payload returned when no usable objects are found.

        Args:
            description: Human-readable reason for the empty result.
            lighting_info: Optional lighting analysis to pass through.

        Returns:
            Dict with the same keys as a minimal ``analyze`` result.
        """
        return {
            "scene_type": "unknown",
            "confidence": 0,
            "description": description,
            "objects_present": [],
            "object_count": 0,
            "regions": {},
            "possible_activities": [],
            "safety_concerns": [],
            "lighting_conditions": lighting_info or {"time_of_day": "unknown", "confidence": 0}
        }

    def analyze(self, detection_result: Any, lighting_info: Optional[Dict] = None, class_confidence_threshold: float = 0.35, scene_confidence_threshold: float = 0.6) -> Dict:
        """
        Analyze detection results to determine scene type and provide understanding.

        Args:
            detection_result: Detection result from YOLOv8.
            lighting_info: Optional lighting condition analysis results.
            class_confidence_threshold: Minimum confidence to consider an object.
            scene_confidence_threshold: Minimum confidence to determine a scene.

        Returns:
            Dictionary with scene analysis results.
        """
        # If no result or no detections, return an empty analysis.
        if detection_result is None or len(detection_result.boxes) == 0:
            return self._empty_analysis("No objects detected in the image.", lighting_info)

        # Get class names from the detection result if not already set.
        if self.class_names is None:
            self.class_names = detection_result.names
            # Keep the spatial analyzer's mapping in sync.
            self.spatial_analyzer.class_names = self.class_names

        # Extract detected objects with confidence above the threshold.
        detected_objects = self.spatial_analyzer._extract_detected_objects(
            detection_result,
            confidence_threshold=class_confidence_threshold
        )

        # No objects above the confidence threshold.
        if not detected_objects:
            return self._empty_analysis("No objects with sufficient confidence detected.", lighting_info)

        # Analyze object distribution across image regions.
        region_analysis = self.spatial_analyzer._analyze_regions(detected_objects)

        # Compute scene type scores based on object detection.
        yolo_scene_scores = self._compute_scene_scores(detected_objects)

        # Optionally analyze the image with CLIP.
        clip_scene_scores = {}
        clip_analysis = None
        if self.use_clip:
            try:
                # Run CLIP on the original image from the detector.
                original_image = detection_result.orig_img
                clip_analysis = self.clip_analyzer.analyze_image(original_image)
                clip_scene_scores = clip_analysis.get("scene_scores", {})

                # If CLIP suggests an Asian commercial street, run a
                # comparative indoor-vs-outdoor probe to possibly override
                # the lighting analysis's indoor verdict.
                if "asian_commercial_street" in clip_scene_scores and clip_scene_scores["asian_commercial_street"] > 0.2:
                    comparative_results = self.clip_analyzer.calculate_similarity(
                        original_image,
                        self.clip_analyzer.comparative_prompts["indoor_vs_outdoor"]
                    )

                    # Aggregate the comparative prompt scores by keyword.
                    indoor_score = sum(s for p, s in comparative_results.items() if "indoor" in p or "enclosed" in p)
                    outdoor_score = sum(s for p, s in comparative_results.items() if "outdoor" in p or "open-air" in p)

                    # CLIP says outdoor while lighting analysis says indoor:
                    # trust CLIP and correct the lighting result.
                    if outdoor_score > indoor_score and lighting_info and lighting_info.get("is_indoor", False):
                        print(f"CLIP indicates outdoor commercial street (score: {outdoor_score:.2f} vs {indoor_score:.2f}), adjusting lighting analysis")
                        lighting_info["is_indoor"] = False
                        lighting_info["indoor_probability"] = 0.3

                        # Record the CLIP override in the lighting diagnostics.
                        if "diagnostics" not in lighting_info:
                            lighting_info["diagnostics"] = {}
                        lighting_info["diagnostics"]["clip_override"] = {
                            "reason": "CLIP detected outdoor commercial street",
                            "outdoor_score": float(outdoor_score),
                            "indoor_score": float(indoor_score)
                        }

                # If CLIP detected lighting conditions but no lighting_info was provided.
                if not lighting_info and "lighting_condition" in clip_analysis:
                    lighting_type, lighting_conf = clip_analysis["lighting_condition"]
                    lighting_info = {
                        "time_of_day": lighting_type,
                        "confidence": lighting_conf
                    }
            except Exception as e:
                print(f"Error in CLIP analysis: {e}")

        # Fuse the YOLO-based and CLIP-based scene scores.
        scene_scores = self._fuse_scene_scores(yolo_scene_scores, clip_scene_scores)

        # Determine the best matching scene type.
        best_scene, scene_confidence = self._determine_scene_type(scene_scores)

        # Generate possible activities based on the scene.
        activities = self.descriptor._infer_possible_activities(best_scene, detected_objects)

        # Identify potential safety concerns.
        safety_concerns = self.descriptor._identify_safety_concerns(detected_objects, best_scene)

        # Calculate functional zones.
        functional_zones = self.spatial_analyzer._identify_functional_zones(detected_objects, best_scene)

        # Generate the scene description.
        scene_description = self.generate_scene_description(
            best_scene,
            detected_objects,
            scene_confidence,
            lighting_info=lighting_info,
            functional_zones=functional_zones
        )

        # Assemble the comprehensive analysis result.
        result = {
            "scene_type": best_scene if scene_confidence >= scene_confidence_threshold else "unknown",
            "scene_name": self.SCENE_TYPES.get(best_scene, {}).get("name", "Unknown")
                        if scene_confidence >= scene_confidence_threshold else "Unknown Scene",
            "confidence": scene_confidence,
            "description": scene_description,
            "objects_present": [
                {"class_id": obj["class_id"],
                 "class_name": obj["class_name"],
                 "confidence": obj["confidence"]}
                for obj in detected_objects
            ],
            "object_count": len(detected_objects),
            "regions": region_analysis,
            "possible_activities": activities,
            "safety_concerns": safety_concerns,
            "functional_zones": functional_zones,
            "alternative_scenes": self.descriptor._get_alternative_scenes(scene_scores, scene_confidence_threshold, top_k=2),
            "lighting_conditions": lighting_info or {"time_of_day": "unknown", "confidence": 0}
        }

        # Attach CLIP-specific results when CLIP analysis succeeded.
        if clip_analysis and "error" not in clip_analysis:
            result["clip_analysis"] = {
                "top_scene": clip_analysis.get("top_scene", ("unknown", 0)),
                "cultural_analysis": clip_analysis.get("cultural_analysis", {})
            }

        return result

    def _compute_scene_scores(self, detected_objects: List[Dict]) -> Dict[str, float]:
        """
        Compute confidence scores for each scene type based on detected objects.

        Scoring: 70% from the ratio of required objects present, 30% from the
        ratio of optional objects present, plus a 5% bonus per required class
        with multiple instances (capped at 15%), optionally scaled by the
        scene's ``priority``, and clamped to [0, 1].

        Args:
            detected_objects: List of detected objects.

        Returns:
            Dictionary mapping scene types to confidence scores.
        """
        scene_scores = {}
        detected_classes_set = {obj["class_id"] for obj in detected_objects}

        # Count occurrences of each class (used for the multi-instance bonus).
        class_counts = Counter(obj["class_id"] for obj in detected_objects)

        # Evaluate each scene type.
        for scene_type, scene_def in self.SCENE_TYPES.items():
            # Required / optional objects actually present in the detections.
            required_objects = set(scene_def["required_objects"])
            required_present = required_objects.intersection(detected_classes_set)

            optional_objects = set(scene_def["optional_objects"])
            optional_present = optional_objects.intersection(detected_classes_set)

            # Skip if the minimum number of required objects isn't present.
            if len(required_present) < scene_def["minimum_required"]:
                scene_scores[scene_type] = 0
                continue

            # Base score from required objects (70% of the total).
            required_ratio = len(required_present) / max(1, len(required_objects))
            required_score = required_ratio * 0.7

            # Additional score from optional objects (30% of the total).
            optional_ratio = len(optional_present) / max(1, len(optional_objects))
            optional_score = optional_ratio * 0.3

            # Bonus for having multiple instances of key objects:
            # 5% per additional key object type, capped at 15%.
            multiple_bonus = sum(
                0.05 for class_id in required_present if class_counts.get(class_id, 0) > 1
            )
            multiple_bonus = min(0.15, multiple_bonus)

            # Calculate the final score, applying the optional priority factor.
            final_score = required_score + optional_score + multiple_bonus
            if "priority" in scene_def:
                final_score *= scene_def["priority"]

            # Normalize to the 0-1 range.
            scene_scores[scene_type] = min(1.0, final_score)

        return scene_scores

    def _determine_scene_type(self, scene_scores: Dict[str, float]) -> Tuple[str, float]:
        """
        Determine the most likely scene type based on scores.

        Args:
            scene_scores: Dictionary mapping scene types to confidence scores.

        Returns:
            Tuple of (best_scene_type, confidence); ("unknown", 0) when empty.
        """
        if not scene_scores:
            return "unknown", 0

        # Pick the scene with the highest score.
        best_scene = max(scene_scores, key=scene_scores.get)
        best_score = scene_scores[best_scene]

        return best_scene, best_score

    def _fuse_scene_scores(self, yolo_scene_scores: Dict[str, float], clip_scene_scores: Dict[str, float]) -> Dict[str, float]:
        """
        Fuse scene scores from YOLO object detection and CLIP analysis.

        The per-scene weighting favors YOLO for object-heavy indoor scenes and
        CLIP for culturally specific or layout-driven scenes.

        Args:
            yolo_scene_scores: Scene scores derived from YOLO detections.
            clip_scene_scores: Scene scores derived from CLIP analysis.

        Returns:
            Dict: Fused scene scores.
        """
        # With only one source available, return it unchanged.
        if not clip_scene_scores:
            return yolo_scene_scores
        if not yolo_scene_scores:
            return clip_scene_scores

        fused_scores = {}

        # Union of all scene types seen by either model.
        all_scene_types = set(yolo_scene_scores) | set(clip_scene_scores)

        for scene_type in all_scene_types:
            yolo_score = yolo_scene_scores.get(scene_type, 0)
            clip_score = clip_scene_scores.get(scene_type, 0)

            # Default weights: YOLO provides better object evidence,
            # CLIP is stronger at holistic scene understanding.
            yolo_weight = 0.7
            clip_weight = 0.3

            # Culture-specific or aerial scenes: CLIP understands them better.
            if any(keyword in scene_type for keyword in ["asian", "cultural", "aerial"]):
                yolo_weight = 0.3
                clip_weight = 0.7
            # Indoor home scenes: object detection is usually more accurate.
            elif any(keyword in scene_type for keyword in ["room", "kitchen", "office", "bedroom"]):
                yolo_weight = 0.8
                clip_weight = 0.2
            elif scene_type == "beach_water_recreation":
                yolo_weight = 0.8  # Detection of specific items (e.g. surfboards) matters.
                clip_weight = 0.2
            elif scene_type == "sports_venue":
                yolo_weight = 0.7
                clip_weight = 0.3
            elif scene_type == "professional_kitchen":
                yolo_weight = 0.8  # Kitchen utensil detection is decisive here.
                clip_weight = 0.2

            # Weighted combination of both scores.
            fused_scores[scene_type] = (yolo_score * yolo_weight) + (clip_score * clip_weight)

        return fused_scores