import logging

import numpy as np
from scipy import stats

logger = logging.getLogger(__name__)


def calculate_final_score(
    quality_score: float,
    aesthetics_score: float,
    prompt_score: float,
    ai_detection_score: float,
    has_prompt: bool = True
) -> float:
    """
    Calculate weighted composite score for image evaluation

    Args:
        quality_score: Technical image quality (0-10)
        aesthetics_score: Visual appeal score (0-10)
        prompt_score: Prompt adherence score (0-10)
        ai_detection_score: AI generation probability (0-1)
        has_prompt: Whether prompt metadata is available

    Returns:
        Final composite score (0-10)
    """
    try:
        # Validate input scores by clamping to their expected ranges
        quality_score = max(0.0, min(10.0, quality_score))
        aesthetics_score = max(0.0, min(10.0, aesthetics_score))
        prompt_score = max(0.0, min(10.0, prompt_score))
        ai_detection_score = max(0.0, min(1.0, ai_detection_score))

        if has_prompt:
            # Standard weights when a prompt is available
            weights = {
                'quality': 0.25,      # 25% - technical quality
                'aesthetics': 0.35,   # 35% - visual appeal (highest weight)
                'prompt': 0.25,       # 25% - prompt following
                'ai_detection': 0.15  # 15% - AI detection (inverted)
            }

            # Calculate weighted score
            score = (
                quality_score * weights['quality'] +
                aesthetics_score * weights['aesthetics'] +
                prompt_score * weights['prompt'] +
                (1 - ai_detection_score) * weights['ai_detection']
            )
        else:
            # Redistribute the prompt weight when no prompt is available
            weights = {
                'quality': 0.375,     # 25% + 12.5% from prompt
                'aesthetics': 0.475,  # 35% + 12.5% from prompt
                'ai_detection': 0.15  # 15% - AI detection (inverted)
            }

            # Calculate weighted score without the prompt term
            score = (
                quality_score * weights['quality'] +
                aesthetics_score * weights['aesthetics'] +
                (1 - ai_detection_score) * weights['ai_detection']
            )

        # Ensure the result stays in the valid range
        final_score = max(0.0, min(10.0, score))

        logger.debug(f"Score calculation - Quality: {quality_score:.2f}, "
                     f"Aesthetics: {aesthetics_score:.2f}, Prompt: {prompt_score:.2f}, "
                     f"AI Detection: {ai_detection_score:.3f}, Has Prompt: {has_prompt}, "
                     f"Final: {final_score:.2f}")

        return final_score

    except Exception as e:
        logger.error(f"Error calculating final score: {str(e)}")
        return 5.0  # Default neutral score
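

# Worked example (illustrative values only, not real evaluation output):
# with a prompt available, quality=8.0, aesthetics=7.0, prompt=9.0 and an
# AI-detection probability of 0.2 combine as
#     8.0*0.25 + 7.0*0.35 + 9.0*0.25 + (1 - 0.2)*0.15
#   = 2.00 + 2.45 + 2.25 + 0.12 = 6.82
# so calculate_final_score(8.0, 7.0, 9.0, 0.2) returns approximately 6.82.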


def calculate_category_rankings(scores_list: list, category: str) -> list:
    """
    Calculate rankings for a specific category

    Args:
        scores_list: List of score dictionaries
        category: Category to rank by ('quality_score', 'aesthetics_score', etc.)

    Returns:
        List of rankings (1-based)
    """
    try:
        if not scores_list or category not in scores_list[0]:
            return [1] * len(scores_list)

        # Extract scores for the category
        category_scores = [item[category] for item in scores_list]

        # Calculate rankings (higher score = better rank)
        rankings = []
        for score in category_scores:
            rank = 1
            for other_score in category_scores:
                if other_score > score:
                    rank += 1
            rankings.append(rank)

        return rankings

    except Exception as e:
        logger.error(f"Error calculating category rankings: {str(e)}")
        return list(range(1, len(scores_list) + 1))


def normalize_scores(scores: list, target_range: tuple = (0, 10)) -> list:
    """
    Normalize a list of scores to a target range

    Args:
        scores: List of numerical scores
        target_range: Tuple of (min, max) for target range

    Returns:
        List of normalized scores
    """
    try:
        if not scores:
            return []

        min_score = min(scores)
        max_score = max(scores)

        # Avoid division by zero: all scores identical, map them to the top of the range
        if max_score == min_score:
            return [target_range[1]] * len(scores)

        target_min, target_max = target_range
        target_span = target_max - target_min
        score_span = max_score - min_score

        normalized = []
        for score in scores:
            normalized_score = target_min + (score - min_score) * target_span / score_span
            normalized.append(max(target_min, min(target_max, normalized_score)))

        return normalized

    except Exception as e:
        logger.error(f"Error normalizing scores: {str(e)}")
        return scores


def calculate_confidence_intervals(scores: list, confidence_level: float = 0.95) -> dict:
    """
    Calculate confidence intervals for a list of scores

    Args:
        scores: List of numerical scores
        confidence_level: Confidence level (0-1)

    Returns:
        Dictionary with mean, std, lower_bound, upper_bound
    """
    try:
        if not scores:
            return {'mean': 0, 'std': 0, 'lower_bound': 0, 'upper_bound': 0}

        mean_score = np.mean(scores)

        # A single observation has no spread; return a degenerate interval
        if len(scores) < 2:
            return {'mean': float(mean_score), 'std': 0.0,
                    'lower_bound': float(mean_score), 'upper_bound': float(mean_score)}

        # Sample standard deviation (ddof=1), as required by the t-based interval
        std_score = np.std(scores, ddof=1)

        # Calculate confidence interval using the t-distribution
        n = len(scores)
        t_value = stats.t.ppf((1 + confidence_level) / 2, n - 1)
        margin_error = t_value * std_score / np.sqrt(n)

        return {
            'mean': float(mean_score),
            'std': float(std_score),
            'lower_bound': float(mean_score - margin_error),
            'upper_bound': float(mean_score + margin_error)
        }

    except Exception as e:
        logger.error(f"Error calculating confidence intervals: {str(e)}")
        return {'mean': 0, 'std': 0, 'lower_bound': 0, 'upper_bound': 0}


def detect_outliers(scores: list, method: str = 'iqr') -> list:
    """
    Detect outliers in a list of scores

    Args:
        scores: List of numerical scores
        method: Method to use ('iqr', 'zscore', 'modified_zscore')

    Returns:
        List of boolean values indicating outliers
    """
    try:
        if not scores or len(scores) < 3:
            return [False] * len(scores)

        scores_array = np.array(scores)

        if method == 'iqr':
            # Interquartile Range method
            q1 = np.percentile(scores_array, 25)
            q3 = np.percentile(scores_array, 75)
            iqr = q3 - q1
            lower_bound = q1 - 1.5 * iqr
            upper_bound = q3 + 1.5 * iqr
            outliers = (scores_array < lower_bound) | (scores_array > upper_bound)

        elif method == 'zscore':
            # Z-score method
            z_scores = np.abs(stats.zscore(scores_array))
            outliers = z_scores > 2.5

        elif method == 'modified_zscore':
            # Modified Z-score method (more robust to existing outliers)
            median = np.median(scores_array)
            mad = np.median(np.abs(scores_array - median))
            if mad == 0:
                # All values essentially identical; nothing to flag
                return [False] * len(scores)
            modified_z_scores = 0.6745 * (scores_array - median) / mad
            outliers = np.abs(modified_z_scores) > 3.5

        else:
            outliers = np.zeros(len(scores), dtype=bool)

        return outliers.tolist()

    except Exception as e:
        logger.error(f"Error detecting outliers: {str(e)}")
        return [False] * len(scores)
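

# Illustrative example (made-up scores): for [7.1, 7.3, 6.9, 7.2, 2.0] the IQR
# method gives Q1=6.9, Q3=7.2, IQR=0.3, so anything outside [6.45, 7.65] is
# flagged; detect_outliers([7.1, 7.3, 6.9, 7.2, 2.0]) should return
# [False, False, False, False, True].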


def calculate_score_distribution(scores: list) -> dict:
    """
    Calculate distribution statistics for scores

    Args:
        scores: List of numerical scores

    Returns:
        Dictionary with distribution statistics
    """
    try:
        if not scores:
            return {}

        scores_array = np.array(scores)

        distribution = {
            'count': len(scores),
            'mean': float(np.mean(scores_array)),
            'median': float(np.median(scores_array)),
            'std': float(np.std(scores_array)),
            'min': float(np.min(scores_array)),
            'max': float(np.max(scores_array)),
            'q1': float(np.percentile(scores_array, 25)),
            'q3': float(np.percentile(scores_array, 75)),
            'skewness': float(stats.skew(scores_array)),
            'kurtosis': float(stats.kurtosis(scores_array))
        }

        return distribution

    except Exception as e:
        logger.error(f"Error calculating score distribution: {str(e)}")
        return {}


def apply_score_adjustments(
    scores: dict,
    adjustments: dict = None
) -> dict:
    """
    Apply custom score adjustments based on specific criteria

    Args:
        scores: Dictionary of scores
        adjustments: Dictionary of adjustment parameters

    Returns:
        Dictionary of adjusted scores
    """
    try:
        if adjustments is None:
            adjustments = {}

        adjusted_scores = scores.copy()

        # Apply anime mode adjustments
        if adjustments.get('anime_mode', False):
            # Boost aesthetics score for anime images
            if 'aesthetics_score' in adjusted_scores:
                adjusted_scores['aesthetics_score'] *= 1.1
                adjusted_scores['aesthetics_score'] = min(10.0, adjusted_scores['aesthetics_score'])

        # Apply quality penalties for low resolution
        if adjustments.get('penalize_low_resolution', True):
            width = adjustments.get('width', 1024)
            height = adjustments.get('height', 1024)
            total_pixels = width * height

            if total_pixels < 262144:  # Less than 512x512
                penalty = 0.8
                if 'quality_score' in adjusted_scores:
                    adjusted_scores['quality_score'] *= penalty

        # Apply prompt complexity adjustments
        prompt_length = adjustments.get('prompt_length', 0)
        if prompt_length > 0 and 'prompt_score' in adjusted_scores:
            if prompt_length > 100:
                # Very long prompts are harder to follow
                adjusted_scores['prompt_score'] *= 0.95
            elif prompt_length < 10:
                # Very short prompts are easier
                adjusted_scores['prompt_score'] *= 1.05
            adjusted_scores['prompt_score'] = min(10.0, adjusted_scores['prompt_score'])

        return adjusted_scores

    except Exception as e:
        logger.error(f"Error applying score adjustments: {str(e)}")
        return scores


def generate_score_summary(results_list: list) -> dict:
    """
    Generate summary statistics for a batch of evaluation results

    Args:
        results_list: List of result dictionaries

    Returns:
        Dictionary with summary statistics
    """
    try:
        if not results_list:
            return {}

        # Extract scores by category
        categories = ['quality_score', 'aesthetics_score', 'prompt_score',
                      'ai_detection_score', 'final_score']

        summary = {}
        for category in categories:
            if category in results_list[0]:
                scores = [result[category] for result in results_list if category in result]
                if scores:
                    summary[category] = calculate_score_distribution(scores)

        # Calculate overall statistics
        final_scores = [result['final_score'] for result in results_list if 'final_score' in result]
        if final_scores:
            summary['overall'] = {
                'total_images': len(results_list),
                'average_score': float(np.mean(final_scores)),
                'best_score': max(final_scores),
                'worst_score': min(final_scores),
                'score_range': max(final_scores) - min(final_scores),
                'images_with_prompts': sum(1 for r in results_list if r.get('has_prompt', False))
            }

        return summary

    except Exception as e:
        logger.error(f"Error generating score summary: {str(e)}")
        return {}
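

if __name__ == "__main__":
    # Minimal usage sketch with made-up scores; not part of the evaluation
    # pipeline, just a quick smoke test of the helpers above.
    logging.basicConfig(level=logging.DEBUG)

    sample_results = [
        {'quality_score': 8.2, 'aesthetics_score': 7.5, 'prompt_score': 9.0,
         'ai_detection_score': 0.20, 'has_prompt': True},
        {'quality_score': 6.4, 'aesthetics_score': 8.1, 'prompt_score': 0.0,
         'ai_detection_score': 0.65, 'has_prompt': False},
    ]

    for result in sample_results:
        result['final_score'] = calculate_final_score(
            result['quality_score'],
            result['aesthetics_score'],
            result['prompt_score'],
            result['ai_detection_score'],
            has_prompt=result['has_prompt'],
        )

    print("Rankings by final score:",
          calculate_category_rankings(sample_results, 'final_score'))
    print("Summary:", generate_score_summary(sample_results))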