Spaces:
Running
on
Zero
Running
on
Zero
| import json | |
| import sqlite3 | |
| import numpy as np | |
| from typing import Dict, List, Tuple, Any, Optional, Union | |
| from dataclasses import dataclass, field, asdict | |
| from enum import Enum | |
| import os | |
| import traceback | |
| from dog_database import get_dog_description | |
| from breed_health_info import breed_health_info | |
| from breed_noise_info import breed_noise_info | |
class DataQuality(Enum):
    """Grade describing how complete and trustworthy a piece of breed data is."""

    HIGH = 'high'        # complete and reliable data
    MEDIUM = 'medium'    # partial or inferred data
    LOW = 'low'          # incomplete or uncertain data
    UNKNOWN = 'unknown'  # unknown or missing data
@dataclass
class BreedStandardization:
    """
    Standardized, numerically encoded record for a single breed.

    The class must be a dataclass: its fields rely on ``field(default_factory=...)``
    and instances are created with keyword arguments, neither of which works on a
    plain class.
    """

    # Identifier of the breed as stored in the data source (underscore separated).
    canonical_name: str
    # Human-readable name (underscores replaced by spaces).
    display_name: str
    # Alternative lower-case names that should resolve to this breed.
    aliases: List[str] = field(default_factory=list)
    size_category: int = 1  # 1=tiny, 2=small, 3=medium, 4=large, 5=giant
    exercise_level: int = 2  # 1=low, 2=moderate, 3=high, 4=very_high
    noise_level: int = 2  # 1=low, 2=moderate, 3=high
    care_complexity: int = 2  # 1=low, 2=moderate, 3=high
    child_compatibility: float = 0.5  # 0=no, 0.5=unknown, 1=yes
    # Per-category data quality grade. The enum is referenced as a forward
    # reference so the annotation does not need the name at class-creation time.
    data_quality_scores: Dict[str, "DataQuality"] = field(default_factory=dict)
    # Per-category confidence values.
    confidence_flags: Dict[str, float] = field(default_factory=dict)
@dataclass
class ConfigurationSettings:
    """
    Container for all tunable settings of the system.

    The class must be a dataclass: it is constructed with keyword arguments and
    serialized with ``dataclasses.asdict``, and its dict fields use
    ``field(default_factory=dict)`` so that instances never share state.
    """

    # Weight of each compatibility dimension in the score.
    scoring_weights: Dict[str, float] = field(default_factory=dict)
    # Score calibration options.
    calibration_settings: Dict[str, Any] = field(default_factory=dict)
    # Numeric limits used by constraints.
    constraint_thresholds: Dict[str, float] = field(default_factory=dict)
    # Options for the semantic model.
    semantic_model_config: Dict[str, Any] = field(default_factory=dict)
    # Default values used when breed data is missing.
    data_imputation_rules: Dict[str, Any] = field(default_factory=dict)
    debug_mode: bool = False
    version: str = "1.0.0"
class ConfigManager:
    """
    Central configuration and breed-data standardization manager.

    Handles breed data standardization, configuration management and data
    quality assessment. Errors are reported with ``print`` and replaced by
    safe fallbacks rather than raised, so callers always get a usable result.
    """

    # Keyword tables used by the ``_standardize_*`` methods. Entries are checked
    # in order with a substring test, so multi-word keywords that contain a
    # shorter keyword ('extra large' / 'large', 'very high' / 'high') must come
    # first or they can never match.
    _SIZE_KEYWORDS = (
        ('extra large', 5),
        ('tiny', 1), ('toy', 1),
        ('small', 2), ('little', 2), ('compact', 2),
        ('medium', 3), ('moderate', 3), ('average', 3),
        ('large', 4), ('big', 4),
        ('giant', 5), ('huge', 5),
    )
    _EXERCISE_KEYWORDS = (
        ('very high', 4),
        ('low', 1), ('minimal', 1), ('light', 1),
        ('moderate', 2), ('average', 2), ('medium', 2), ('regular', 2),
        ('high', 3), ('active', 3), ('vigorous', 3),
        ('extreme', 4), ('intense', 4),
    )
    _NOISE_KEYWORDS = (
        ('low', 1), ('quiet', 1), ('silent', 1), ('minimal', 1),
        ('moderate', 2), ('average', 2), ('medium', 2), ('occasional', 2),
        ('high', 3), ('loud', 3), ('vocal', 3), ('frequent', 3),
    )
    _CARE_KEYWORDS = (
        ('low', 1), ('easy', 1), ('simple', 1), ('minimal', 1),
        ('moderate', 2), ('average', 2), ('medium', 2), ('regular', 2),
        ('high', 3), ('complex', 3), ('intensive', 3), ('demanding', 3),
    )

    # Common abbreviations and variants, keyed by canonical breed name.
    _KNOWN_ALIASES = {
        'German_Shepherd': ['gsd', 'german shepherd dog', 'alsatian'],
        'Labrador_Retriever': ['lab', 'labrador', 'retriever'],
        'Golden_Retriever': ['golden', 'goldie'],
        'Border_Collie': ['border', 'collie'],
        'Yorkshire_Terrier': ['yorkie', 'york', 'yorkshire'],
        'French_Bulldog': ['frenchie', 'french bull', 'bouledogue français'],
        'Boston_Terrier': ['boston bull', 'american gentleman'],
        'Cavalier_King_Charles_Spaniel': ['cavalier', 'ckcs', 'king charles'],
        'American_Staffordshire_Terrier': ['amstaff', 'american staff'],
        'Jack_Russell_Terrier': ['jrt', 'jack russell', 'parson russell'],
        'Shih_Tzu': ['shih tzu', 'lion dog'],
        'Bichon_Frise': ['bichon', 'powder puff'],
        'Cocker_Spaniel': ['cocker', 'english cocker', 'american cocker'],
    }

    # Dict-valued configuration sections that a config file may override.
    _DICT_SECTIONS = (
        'scoring_weights',
        'calibration_settings',
        'constraint_thresholds',
        'semantic_model_config',
        'data_imputation_rules',
    )

    def __init__(self, config_file: Optional[str] = None):
        """
        Initialize the manager.

        Loads the built-in defaults, standardizes every breed found in the
        database, then applies overrides from ``config_file`` when that file
        exists.

        Args:
            config_file: Path of the JSON configuration file. Defaults to
                "config.json" when omitted or empty.
        """
        self.config_file = config_file or "config.json"
        self.breed_standardization = {}  # canonical breed name -> BreedStandardization
        self.configuration = ConfigurationSettings()
        self.breed_aliases = {}  # lower-case alias -> canonical breed name

        self._load_default_configuration()
        self._initialize_breed_standardization()

        # Try to load a custom configuration on top of the defaults.
        if os.path.exists(self.config_file):
            self._load_configuration()

    def _load_default_configuration(self):
        """Replace ``self.configuration`` with the built-in default settings."""
        self.configuration = ConfigurationSettings(
            scoring_weights={
                'activity_compatibility': 0.35,
                'noise_compatibility': 0.25,
                'spatial_compatibility': 0.15,
                'family_compatibility': 0.10,
                'maintenance_compatibility': 0.10,
                'size_compatibility': 0.05
            },
            calibration_settings={
                'target_range_min': 0.45,
                'target_range_max': 0.95,
                'min_effective_range': 0.3,
                'auto_calibration': True,
                'tie_breaking_enabled': True
            },
            constraint_thresholds={
                'apartment_size_limit': 3,     # largest allowed size (medium)
                'high_exercise_threshold': 3,  # threshold for high exercise needs
                'quiet_noise_limit': 2,        # noise limit for quiet environments
                'child_safety_threshold': 0.8  # minimum child-safety score
            },
            semantic_model_config={
                'model_name': 'all-MiniLM-L6-v2',
                'fallback_models': ['all-mpnet-base-v2', 'all-MiniLM-L12-v2'],
                'similarity_threshold': 0.5,
                'cache_embeddings': True
            },
            data_imputation_rules={
                'noise_level_defaults': {
                    'terrier': 'high',
                    'hound': 'high',
                    'herding': 'moderate',
                    'toy': 'moderate',
                    'working': 'moderate',
                    'sporting': 'moderate',
                    'non_sporting': 'low',
                    'unknown': 'moderate'
                },
                'exercise_level_defaults': {
                    'working': 'high',
                    'sporting': 'high',
                    'herding': 'high',
                    'terrier': 'moderate',
                    'hound': 'moderate',
                    'toy': 'low',
                    'non_sporting': 'moderate',
                    'unknown': 'moderate'
                }
            },
            debug_mode=False,
            version="1.0.0"
        )

    def _initialize_breed_standardization(self):
        """
        Standardize every known breed and build the alias lookup table.

        Any failure is printed together with its traceback; breeds processed
        before the failure stay registered.
        """
        try:
            # Fetch all breeds.
            breeds = self._get_all_breeds()

            for breed in breeds:
                standardized = self._standardize_breed_data(breed)
                self.breed_standardization[breed] = standardized

                # Build the alias mapping; a later breed overwrites an alias
                # already claimed by an earlier one.
                for alias in standardized.aliases:
                    self.breed_aliases[alias.lower()] = breed

            print(f"Initialized standardization for {len(self.breed_standardization)} breeds")

        except Exception as e:
            print(f"Error initializing breed standardization: {str(e)}")
            print(traceback.format_exc())

    def _get_all_breeds(self) -> List[str]:
        """
        Return the distinct breed names stored in the ``AnimalCatalog`` table.

        Returns:
            The list of breed names, or an empty list when the database cannot
            be read. NULL/empty names are skipped because later processing
            calls string methods on every breed name.
        """
        conn = None
        try:
            conn = sqlite3.connect('animal_detector.db')
            cursor = conn.cursor()
            cursor.execute("SELECT DISTINCT Breed FROM AnimalCatalog")
            return [row[0] for row in cursor.fetchall() if row[0]]
        except Exception as e:
            print(f"Error getting breed list: {str(e)}")
            return []
        finally:
            # Always release the connection, including when the query fails.
            if conn is not None:
                conn.close()

    def _standardize_breed_data(self, breed: str) -> "BreedStandardization":
        """
        Build the standardized record for one breed.

        Args:
            breed: Canonical (underscore separated) breed name.

        Returns:
            A fully populated record, or a record holding only names and
            aliases (all other fields at their defaults) when standardization
            fails.
        """
        try:
            # Raw information sources.
            breed_info = get_dog_description(breed) or {}
            health_info = breed_health_info.get(breed, {})
            noise_info = breed_noise_info.get(breed, {})

            # Names and aliases.
            canonical_name = breed
            display_name = breed.replace('_', ' ')
            aliases = self._generate_breed_aliases(breed)

            # Categorical data mapped onto numeric scales.
            size_category = self._standardize_size(breed_info.get('Size', ''))
            exercise_level = self._standardize_exercise_needs(breed_info.get('Exercise Needs', ''))
            noise_level = self._standardize_noise_level(noise_info.get('noise_level', ''))
            care_complexity = self._standardize_care_level(breed_info.get('Care Level', ''))
            child_compatibility = self._standardize_child_compatibility(
                breed_info.get('Good with Children', '')
            )

            # Data quality assessment.
            data_quality_scores = self._assess_data_quality(breed_info, health_info, noise_info)
            confidence_flags = self._calculate_confidence_flags(breed_info, health_info, noise_info)

            return BreedStandardization(
                canonical_name=canonical_name,
                display_name=display_name,
                aliases=aliases,
                size_category=size_category,
                exercise_level=exercise_level,
                noise_level=noise_level,
                care_complexity=care_complexity,
                child_compatibility=child_compatibility,
                data_quality_scores=data_quality_scores,
                confidence_flags=confidence_flags
            )

        except Exception as e:
            print(f"Error standardizing breed {breed}: {str(e)}")
            return BreedStandardization(
                canonical_name=breed,
                display_name=breed.replace('_', ' '),
                aliases=self._generate_breed_aliases(breed)
            )

    def _generate_breed_aliases(self, breed: str) -> List[str]:
        """
        Return the lower-case aliases of a breed without duplicates.

        The list starts with the display form and the canonical form, followed
        by any well-known abbreviations. Order is deterministic (first
        occurrence wins).
        """
        display_name = breed.replace('_', ' ')
        aliases = [display_name.lower(), breed.lower()]
        aliases.extend(self._KNOWN_ALIASES.get(breed, []))
        # dict.fromkeys removes duplicates while keeping insertion order.
        return list(dict.fromkeys(aliases))

    @staticmethod
    def _match_keyword(text, keywords, default):
        """
        Return the value of the first keyword contained in ``text``.

        Args:
            text: Free-form description; ``None`` and other falsy values are
                treated as an empty string.
            keywords: Ordered ``(keyword, value)`` pairs.
            default: Value returned when no keyword matches.
        """
        normalized = str(text or '').lower()
        for keyword, value in keywords:
            if keyword in normalized:
                return value
        return default

    def _standardize_size(self, size_str: str) -> int:
        """Map a size description to 1 (tiny) .. 5 (giant); defaults to 3 (medium)."""
        return self._match_keyword(size_str, self._SIZE_KEYWORDS, 3)

    def _standardize_exercise_needs(self, exercise_str: str) -> int:
        """Map an exercise description to 1 (low) .. 4 (very high); defaults to 2 (moderate)."""
        return self._match_keyword(exercise_str, self._EXERCISE_KEYWORDS, 2)

    def _standardize_noise_level(self, noise_str: str) -> int:
        """Map a noise description to 1 (low) .. 3 (high); defaults to 2 (moderate)."""
        return self._match_keyword(noise_str, self._NOISE_KEYWORDS, 2)

    def _standardize_care_level(self, care_str: str) -> int:
        """Map a care description to 1 (low) .. 3 (high); defaults to 2 (moderate)."""
        return self._match_keyword(care_str, self._CARE_KEYWORDS, 2)

    def _standardize_child_compatibility(self, child_str: str) -> float:
        """
        Map a yes/no answer to a score.

        Returns:
            1.0 for "yes", 0.0 for "no" (case and surrounding whitespace are
            ignored), 0.5 for anything else, including ``None``.
        """
        answer = str(child_str or '').strip().lower()
        if answer == 'yes':
            return 1.0
        if answer == 'no':
            return 0.0
        return 0.5  # unknown or uncertain

    def _assess_data_quality(self, breed_info: Dict, health_info: Dict,
                             noise_info: Dict) -> Dict[str, "DataQuality"]:
        """
        Grade the completeness of each information source.

        Returns:
            A dict with the keys 'basic_info', 'health_info' and 'noise_info'.
        """
        quality_scores = {}

        # Basic information: graded by how many required fields are filled in.
        if breed_info:
            required_fields = ['Size', 'Exercise Needs', 'Temperament', 'Good with Children']
            complete_fields = sum(1 for name in required_fields if breed_info.get(name))

            if complete_fields >= 4:
                quality_scores['basic_info'] = DataQuality.HIGH
            elif complete_fields >= 2:
                quality_scores['basic_info'] = DataQuality.MEDIUM
            else:
                quality_scores['basic_info'] = DataQuality.LOW
        else:
            quality_scores['basic_info'] = DataQuality.UNKNOWN

        # Health information.
        if health_info and health_info.get('health_notes'):
            quality_scores['health_info'] = DataQuality.HIGH
        elif health_info:
            quality_scores['health_info'] = DataQuality.MEDIUM
        else:
            quality_scores['health_info'] = DataQuality.UNKNOWN

        # Noise information.
        if noise_info and noise_info.get('noise_level'):
            quality_scores['noise_info'] = DataQuality.HIGH
        else:
            quality_scores['noise_info'] = DataQuality.LOW

        return quality_scores

    def _calculate_confidence_flags(self, breed_info: Dict, health_info: Dict,
                                    noise_info: Dict) -> Dict[str, float]:
        """
        Compute a confidence value per information source.

        Returns:
            A dict with the keys 'basic_info', 'health_info', 'noise_info' and
            'overall' (the mean of the other three), all plain floats.
        """
        confidence_flags = {}

        # Basic information; a description adds a small bonus.
        basic_confidence = 0.8 if breed_info else 0.2
        if breed_info and breed_info.get('Description'):
            basic_confidence += 0.1
        confidence_flags['basic_info'] = min(1.0, basic_confidence)

        # Health information.
        confidence_flags['health_info'] = 0.7 if health_info else 0.3

        # Noise information.
        confidence_flags['noise_info'] = 0.8 if noise_info else 0.4

        # Overall confidence, converted from a NumPy scalar to a plain float.
        confidence_flags['overall'] = float(np.mean(list(confidence_flags.values())))

        return confidence_flags

    def get_standardized_breed_data(self, breed: str) -> Optional["BreedStandardization"]:
        """
        Look up the standardized record of a breed.

        Resolution order: exact canonical name, exact alias (case-insensitive),
        then the first alias that contains the query or is contained in it.

        Args:
            breed: Canonical name, alias or free-form breed name.

        Returns:
            The matching record, or ``None`` when nothing matches. An empty or
            blank query never matches: an empty string is a substring of every
            alias and would otherwise resolve to an arbitrary breed.
        """
        # Direct match.
        if breed in self.breed_standardization:
            return self.breed_standardization[breed]

        breed_lower = str(breed or '').strip().lower()
        if not breed_lower:
            return None

        # Alias match.
        if breed_lower in self.breed_aliases:
            canonical_breed = self.breed_aliases[breed_lower]
            return self.breed_standardization.get(canonical_breed)

        # Fuzzy (substring) match.
        for alias, canonical_breed in self.breed_aliases.items():
            if breed_lower in alias or alias in breed_lower:
                return self.breed_standardization.get(canonical_breed)

        return None

    def apply_data_imputation(self, breed: str) -> "BreedStandardization":
        """
        Return the breed record with default-looking values imputed.

        A noise or exercise level of 2 (moderate) may be a placeholder, so it
        is replaced by the configured default of the breed's group and the
        corresponding confidence is lowered.

        The cached record is never modified: imputation is applied to a copy,
        so repeated calls return the same values instead of lowering the
        confidence a little more each time.

        Args:
            breed: Canonical name, alias or free-form breed name.

        Returns:
            The imputed copy, or a default record when the breed is unknown.
        """
        import copy  # local import: the module does not import ``copy`` at file level

        try:
            cached = self.get_standardized_breed_data(breed)
            if not cached:
                return BreedStandardization(canonical_name=breed, display_name=breed.replace('_', ' '))

            standardized = copy.deepcopy(cached)
            imputation_rules = self.configuration.data_imputation_rules
            # Use the canonical name: the query may be an alias such as 'gsd'
            # that carries no group information.
            breed_group = self._determine_breed_group(standardized.canonical_name)
            confidence = standardized.confidence_flags

            # Noise level imputation.
            if standardized.noise_level == 2:  # moderate (possibly a default)
                noise_defaults = imputation_rules.get('noise_level_defaults', {})
                if breed_group in noise_defaults:
                    standardized.noise_level = self._standardize_noise_level(noise_defaults[breed_group])
                    # Records built by the error fallback carry no flags.
                    if 'noise_info' in confidence:
                        confidence['noise_info'] *= 0.7  # lower the confidence

            # Exercise level imputation.
            if standardized.exercise_level == 2:  # moderate (possibly a default)
                exercise_defaults = imputation_rules.get('exercise_level_defaults', {})
                if breed_group in exercise_defaults:
                    standardized.exercise_level = self._standardize_exercise_needs(
                        exercise_defaults[breed_group]
                    )
                    if 'basic_info' in confidence:
                        confidence['basic_info'] *= 0.8  # lower the confidence

            return standardized

        except Exception as e:
            print(f"Error applying data imputation for {breed}: {str(e)}")
            return self.get_standardized_breed_data(breed) or BreedStandardization(
                canonical_name=breed, display_name=breed.replace('_', ' ')
            )

    def _determine_breed_group(self, breed: str) -> str:
        """
        Guess the breed group from keywords in the breed name.

        Groups are tested in a fixed order and the first match wins.

        Returns:
            One of 'terrier', 'hound', 'herding', 'sporting', 'working', 'toy'
            or 'unknown'.
        """
        breed_lower = breed.lower()

        if 'terrier' in breed_lower:
            return 'terrier'
        elif 'hound' in breed_lower:
            return 'hound'
        elif any(word in breed_lower for word in ['shepherd', 'collie', 'cattle', 'sheepdog']):
            return 'herding'
        elif any(word in breed_lower for word in ['retriever', 'pointer', 'setter', 'spaniel']):
            return 'sporting'
        elif any(word in breed_lower for word in ['mastiff', 'great', 'rottweiler', 'akita']):
            return 'working'
        elif any(word in breed_lower for word in ['toy', 'pug', 'chihuahua', 'papillon']):
            return 'toy'
        else:
            return 'unknown'

    def _load_configuration(self):
        """
        Merge the settings stored in ``self.config_file`` into the configuration.

        Dict sections are merged key by key at the top level; 'debug_mode' and
        'version' are replaced. Any error is printed and leaves the settings
        applied so far in place.
        """
        try:
            with open(self.config_file, 'r', encoding='utf-8') as f:
                config_data = json.load(f)

            if not isinstance(config_data, dict):
                raise ValueError("top-level JSON value must be an object")

            # Update the dict-valued sections.
            for section in self._DICT_SECTIONS:
                if section in config_data:
                    getattr(self.configuration, section).update(config_data[section])

            if 'debug_mode' in config_data:
                self.configuration.debug_mode = config_data['debug_mode']
            # save_configuration() writes the version, so read it back too.
            if 'version' in config_data:
                self.configuration.version = config_data['version']

            print(f"Configuration loaded from {self.config_file}")

        except Exception as e:
            print(f"Error loading configuration: {str(e)}")

    def save_configuration(self):
        """Write the current configuration to ``self.config_file`` as JSON."""
        try:
            config_data = asdict(self.configuration)

            with open(self.config_file, 'w', encoding='utf-8') as f:
                json.dump(config_data, f, indent=2, ensure_ascii=False)

            print(f"Configuration saved to {self.config_file}")

        except Exception as e:
            print(f"Error saving configuration: {str(e)}")

    def get_configuration(self) -> "ConfigurationSettings":
        """Return the current configuration object (not a copy)."""
        return self.configuration

    def update_configuration(self, updates: Dict[str, Any]):
        """
        Replace configuration attributes with the given values.

        Args:
            updates: Attribute name -> new value. Names that are not attributes
                of the configuration are ignored and reported separately, so
                the "updated" message lists only what really changed.
        """
        try:
            applied = []
            ignored = []
            for key, value in updates.items():
                if hasattr(self.configuration, key):
                    setattr(self.configuration, key, value)
                    applied.append(key)
                else:
                    ignored.append(key)

            print(f"Configuration updated: {applied}")
            if ignored:
                print(f"Ignored unknown configuration keys: {ignored}")

        except Exception as e:
            print(f"Error updating configuration: {str(e)}")

    def get_breed_mapping_summary(self) -> Dict[str, Any]:
        """
        Summarize the standardized breed data.

        Returns:
            A dict with the number of breeds and aliases, the distribution of
            quality grades per category, the average confidence per category
            and the configuration version; or ``{'error': message}`` on failure.
        """
        try:
            total_breeds = len(self.breed_standardization)
            total_aliases = len(self.breed_aliases)

            # Data quality distribution: category -> grade -> count.
            quality_distribution = {}
            for breed_data in self.breed_standardization.values():
                for category, quality in breed_data.data_quality_scores.items():
                    counts = quality_distribution.setdefault(category, {})
                    counts[quality.value] = counts.get(quality.value, 0) + 1

            # Confidence statistics: category -> list of values.
            confidence_stats = {}
            for breed_data in self.breed_standardization.values():
                for category, confidence in breed_data.confidence_flags.items():
                    confidence_stats.setdefault(category, []).append(confidence)

            confidence_averages = {
                category: float(np.mean(values)) for category, values in confidence_stats.items()
            }

            return {
                'total_breeds': total_breeds,
                'total_aliases': total_aliases,
                'quality_distribution': quality_distribution,
                'confidence_averages': confidence_averages,
                'configuration_version': self.configuration.version
            }

        except Exception as e:
            print(f"Error generating breed mapping summary: {str(e)}")
            return {'error': str(e)}
# Module-wide ConfigManager instance, created on first use.
_config_manager = None


def get_config_manager() -> ConfigManager:
    """Return the shared ConfigManager, constructing it on the first call."""
    global _config_manager
    if _config_manager is not None:
        return _config_manager
    _config_manager = ConfigManager()
    return _config_manager
def get_standardized_breed_data(breed: str) -> Optional[BreedStandardization]:
    """Look up the standardized record of ``breed`` through the shared manager."""
    return get_config_manager().get_standardized_breed_data(breed)
def get_breed_with_imputation(breed: str) -> BreedStandardization:
    """Return the record of ``breed`` after the shared manager applied data imputation."""
    return get_config_manager().apply_data_imputation(breed)
def get_system_configuration() -> ConfigurationSettings:
    """Return the configuration held by the shared manager."""
    return get_config_manager().get_configuration()