import logging

import numpy as np
from scipy import stats

logger = logging.getLogger(__name__)

def calculate_final_score(
    quality_score: float,
    aesthetics_score: float,
    prompt_score: float,
    ai_detection_score: float,
    has_prompt: bool = True
) -> float:
    """
    Calculate weighted composite score for image evaluation.

    Args:
        quality_score: Technical image quality (0-10)
        aesthetics_score: Visual appeal score (0-10)
        prompt_score: Prompt adherence score (0-10)
        ai_detection_score: AI generation probability (0-1)
        has_prompt: Whether prompt metadata is available

    Returns:
        Final composite score (0-10)
    """
    try:
        # Validate input scores (clamp to expected ranges)
        quality_score = max(0.0, min(10.0, quality_score))
        aesthetics_score = max(0.0, min(10.0, aesthetics_score))
        prompt_score = max(0.0, min(10.0, prompt_score))
        ai_detection_score = max(0.0, min(1.0, ai_detection_score))

        if has_prompt:
            # Standard weights when a prompt is available
            weights = {
                'quality': 0.25,       # 25% - Technical quality
                'aesthetics': 0.35,    # 35% - Visual appeal (highest weight)
                'prompt': 0.25,        # 25% - Prompt following
                'ai_detection': 0.15   # 15% - AI detection (inverted)
            }

            # Calculate weighted score
            score = (
                quality_score * weights['quality'] +
                aesthetics_score * weights['aesthetics'] +
                prompt_score * weights['prompt'] +
                (1 - ai_detection_score) * weights['ai_detection']
            )
        else:
            # Redistribute the prompt weight when no prompt is available
            weights = {
                'quality': 0.375,      # 25% + 12.5% from prompt
                'aesthetics': 0.475,   # 35% + 12.5% from prompt
                'ai_detection': 0.15   # 15% - AI detection (inverted)
            }

            # Calculate weighted score without the prompt term
            score = (
                quality_score * weights['quality'] +
                aesthetics_score * weights['aesthetics'] +
                (1 - ai_detection_score) * weights['ai_detection']
            )

        # Ensure score is in valid range
        final_score = max(0.0, min(10.0, score))

        logger.debug(f"Score calculation - Quality: {quality_score:.2f}, "
                     f"Aesthetics: {aesthetics_score:.2f}, Prompt: {prompt_score:.2f}, "
                     f"AI Detection: {ai_detection_score:.3f}, Has Prompt: {has_prompt}, "
                     f"Final: {final_score:.2f}")

        return final_score
    except Exception as e:
        logger.error(f"Error calculating final score: {str(e)}")
        return 5.0  # Default neutral score
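
# Illustrative usage sketch (not part of the original module): with a prompt, a
# strong image that is unlikely to be AI-generated scores
# 9.0*0.25 + 8.5*0.35 + 8.0*0.25 + (1 - 0.1)*0.15 = 7.36; without prompt metadata
# the same image scores 9.0*0.375 + 8.5*0.475 + 0.9*0.15 ~= 7.55.
def _example_final_score() -> None:
    with_prompt = calculate_final_score(9.0, 8.5, 8.0, 0.1, has_prompt=True)
    without_prompt = calculate_final_score(9.0, 8.5, 0.0, 0.1, has_prompt=False)
    logger.info(f"With prompt: {with_prompt:.2f}, without prompt: {without_prompt:.2f}")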

def calculate_category_rankings(scores_list: list, category: str) -> list:
    """
    Calculate rankings for a specific category.

    Args:
        scores_list: List of score dictionaries
        category: Category to rank by ('quality_score', 'aesthetics_score', etc.)

    Returns:
        List of rankings (1-based)
    """
    try:
        if not scores_list or category not in scores_list[0]:
            return [1] * len(scores_list)

        # Extract scores for the category
        category_scores = [item[category] for item in scores_list]

        # Calculate rankings (higher score = better rank)
        rankings = []
        for score in category_scores:
            rank = 1
            for other_score in category_scores:
                if other_score > score:
                    rank += 1
            rankings.append(rank)

        return rankings
    except Exception as e:
        logger.error(f"Error calculating category rankings: {str(e)}")
        return list(range(1, len(scores_list) + 1))
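
# Illustrative usage sketch (not part of the original module): competition-style
# ranking, so the two tied 7.5 entries both receive rank 2 and 6.0 gets rank 4.
def _example_category_rankings() -> None:
    items = [{'final_score': s} for s in [9.1, 7.5, 7.5, 6.0]]
    ranks = calculate_category_rankings(items, 'final_score')
    logger.info(f"Ranks: {ranks}")  # expected: [1, 2, 2, 4]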

def normalize_scores(scores: list, target_range: tuple = (0, 10)) -> list:
    """
    Normalize a list of scores to a target range.

    Args:
        scores: List of numerical scores
        target_range: Tuple of (min, max) for target range

    Returns:
        List of normalized scores
    """
    try:
        if not scores:
            return []

        min_score = min(scores)
        max_score = max(scores)

        # Avoid division by zero
        if max_score == min_score:
            return [target_range[1]] * len(scores)

        target_min, target_max = target_range
        target_span = target_max - target_min
        score_span = max_score - min_score

        normalized = []
        for score in scores:
            normalized_score = target_min + (score - min_score) * target_span / score_span
            normalized.append(max(target_min, min(target_max, normalized_score)))

        return normalized
    except Exception as e:
        logger.error(f"Error normalizing scores: {str(e)}")
        return scores
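
# Illustrative usage sketch (not part of the original module): min-max scaling
# of [2, 4, 6] onto the default (0, 10) range yields [0.0, 5.0, 10.0].
def _example_normalization() -> None:
    logger.info(f"Normalized: {normalize_scores([2, 4, 6], target_range=(0, 10))}")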

def calculate_confidence_intervals(scores: list, confidence_level: float = 0.95) -> dict:
    """
    Calculate confidence intervals for a list of scores.

    Args:
        scores: List of numerical scores
        confidence_level: Confidence level (0-1)

    Returns:
        Dictionary with mean, std, lower_bound, upper_bound
    """
    try:
        if not scores:
            return {'mean': 0, 'std': 0, 'lower_bound': 0, 'upper_bound': 0}

        mean_score = np.mean(scores)
        n = len(scores)

        # A t-interval needs at least two samples; otherwise return a degenerate interval
        if n < 2:
            return {
                'mean': float(mean_score),
                'std': 0.0,
                'lower_bound': float(mean_score),
                'upper_bound': float(mean_score)
            }

        # Calculate confidence interval using the t-distribution with the
        # sample standard deviation (ddof=1)
        std_score = np.std(scores, ddof=1)
        t_value = stats.t.ppf((1 + confidence_level) / 2, n - 1)
        margin_error = t_value * std_score / np.sqrt(n)

        return {
            'mean': float(mean_score),
            'std': float(std_score),
            'lower_bound': float(mean_score - margin_error),
            'upper_bound': float(mean_score + margin_error)
        }
    except Exception as e:
        logger.error(f"Error calculating confidence intervals: {str(e)}")
        return {'mean': 0, 'std': 0, 'lower_bound': 0, 'upper_bound': 0}
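
# Illustrative usage sketch (not part of the original module): for a small batch
# of scores, the 95% interval is roughly mean +/- t * s / sqrt(n).
def _example_confidence_interval() -> None:
    ci = calculate_confidence_intervals([7.0, 7.5, 8.0, 6.5, 7.2], confidence_level=0.95)
    logger.info(f"Mean {ci['mean']:.2f}, 95% CI [{ci['lower_bound']:.2f}, {ci['upper_bound']:.2f}]")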

def detect_outliers(scores: list, method: str = 'iqr') -> list:
    """
    Detect outliers in a list of scores.

    Args:
        scores: List of numerical scores
        method: Method to use ('iqr', 'zscore', 'modified_zscore')

    Returns:
        List of boolean values indicating outliers
    """
    try:
        if not scores or len(scores) < 3:
            return [False] * len(scores)

        scores_array = np.array(scores)

        if method == 'iqr':
            # Interquartile Range method
            q1 = np.percentile(scores_array, 25)
            q3 = np.percentile(scores_array, 75)
            iqr = q3 - q1
            lower_bound = q1 - 1.5 * iqr
            upper_bound = q3 + 1.5 * iqr
            outliers = (scores_array < lower_bound) | (scores_array > upper_bound)
        elif method == 'zscore':
            # Z-score method
            z_scores = np.abs(stats.zscore(scores_array))
            outliers = z_scores > 2.5
        elif method == 'modified_zscore':
            # Modified Z-score method (more robust)
            median = np.median(scores_array)
            mad = np.median(np.abs(scores_array - median))
            if mad == 0:
                # All values equal the median; nothing can be flagged
                outliers = np.zeros(len(scores_array), dtype=bool)
            else:
                modified_z_scores = 0.6745 * (scores_array - median) / mad
                outliers = np.abs(modified_z_scores) > 3.5
        else:
            # Unknown method: flag nothing (keep a NumPy array so .tolist() works)
            outliers = np.zeros(len(scores_array), dtype=bool)

        return outliers.tolist()
    except Exception as e:
        logger.error(f"Error detecting outliers: {str(e)}")
        return [False] * len(scores)
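
# Illustrative usage sketch (not part of the original module): 30.0 lies far
# above the upper IQR fence (Q3 + 1.5*IQR) of the remaining scores and is flagged.
def _example_outlier_detection() -> None:
    flags = detect_outliers([7.0, 7.2, 6.8, 7.1, 30.0], method='iqr')
    logger.info(f"Outlier flags: {flags}")  # expected: [False, False, False, False, True]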

def calculate_score_distribution(scores: list) -> dict:
    """
    Calculate distribution statistics for scores.

    Args:
        scores: List of numerical scores

    Returns:
        Dictionary with distribution statistics
    """
    try:
        if not scores:
            return {}

        scores_array = np.array(scores)

        distribution = {
            'count': len(scores),
            'mean': float(np.mean(scores_array)),
            'median': float(np.median(scores_array)),
            'std': float(np.std(scores_array)),
            'min': float(np.min(scores_array)),
            'max': float(np.max(scores_array)),
            'q1': float(np.percentile(scores_array, 25)),
            'q3': float(np.percentile(scores_array, 75)),
            'skewness': float(stats.skew(scores_array)),
            'kurtosis': float(stats.kurtosis(scores_array))
        }

        return distribution
    except Exception as e:
        logger.error(f"Error calculating score distribution: {str(e)}")
        return {}

def apply_score_adjustments(
    scores: dict,
    adjustments: dict = None
) -> dict:
    """
    Apply custom score adjustments based on specific criteria.

    Args:
        scores: Dictionary of scores
        adjustments: Dictionary of adjustment parameters

    Returns:
        Dictionary of adjusted scores
    """
    try:
        if adjustments is None:
            adjustments = {}

        adjusted_scores = scores.copy()

        # Apply anime mode adjustments
        if adjustments.get('anime_mode', False):
            # Boost aesthetics score for anime images
            if 'aesthetics_score' in adjusted_scores:
                adjusted_scores['aesthetics_score'] *= 1.1
                adjusted_scores['aesthetics_score'] = min(10.0, adjusted_scores['aesthetics_score'])

        # Apply quality penalties for low resolution
        if adjustments.get('penalize_low_resolution', True):
            width = adjustments.get('width', 1024)
            height = adjustments.get('height', 1024)
            total_pixels = width * height

            if total_pixels < 262144:  # Less than 512x512
                penalty = 0.8
                if 'quality_score' in adjusted_scores:
                    adjusted_scores['quality_score'] *= penalty

        # Apply prompt complexity adjustments
        prompt_length = adjustments.get('prompt_length', 0)
        if prompt_length > 0 and 'prompt_score' in adjusted_scores:
            if prompt_length > 100:  # Very long prompts are harder to follow
                adjusted_scores['prompt_score'] *= 0.95
            elif prompt_length < 10:  # Very short prompts are easier
                adjusted_scores['prompt_score'] *= 1.05
            adjusted_scores['prompt_score'] = min(10.0, adjusted_scores['prompt_score'])

        return adjusted_scores
    except Exception as e:
        logger.error(f"Error applying score adjustments: {str(e)}")
        return scores
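
# Illustrative usage sketch (not part of the original module): a 512x384 image
# (196,608 pixels, below the 512x512 threshold) has its quality score scaled by
# 0.8, and a very short prompt nudges the prompt score up by 5%.
def _example_score_adjustments() -> None:
    adjusted = apply_score_adjustments(
        {'quality_score': 8.0, 'aesthetics_score': 7.0, 'prompt_score': 6.0},
        adjustments={'width': 512, 'height': 384, 'prompt_length': 8}
    )
    logger.info(f"Adjusted scores: {adjusted}")  # quality -> 6.4, prompt -> 6.3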

def generate_score_summary(results_list: list) -> dict:
    """
    Generate summary statistics for a batch of evaluation results.

    Args:
        results_list: List of result dictionaries

    Returns:
        Dictionary with summary statistics
    """
    try:
        if not results_list:
            return {}

        # Extract scores by category
        categories = ['quality_score', 'aesthetics_score', 'prompt_score', 'ai_detection_score', 'final_score']
        summary = {}

        for category in categories:
            if category in results_list[0]:
                scores = [result[category] for result in results_list if category in result]
                if scores:
                    summary[category] = calculate_score_distribution(scores)

        # Calculate overall statistics
        final_scores = [result['final_score'] for result in results_list if 'final_score' in result]
        if final_scores:
            summary['overall'] = {
                'total_images': len(results_list),
                'average_score': float(np.mean(final_scores)),
                'best_score': max(final_scores),
                'worst_score': min(final_scores),
                'score_range': max(final_scores) - min(final_scores),
                'images_with_prompts': sum(1 for r in results_list if r.get('has_prompt', False))
            }

        return summary
    except Exception as e:
        logger.error(f"Error generating score summary: {str(e)}")
        return {}
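
# Minimal end-to-end sketch (illustrative, not part of the original module):
# score a small hypothetical batch, then summarize and rank it by final score.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    batch = [
        {'quality_score': 8.5, 'aesthetics_score': 7.8, 'prompt_score': 8.0,
         'ai_detection_score': 0.2, 'has_prompt': True},
        {'quality_score': 6.0, 'aesthetics_score': 6.5, 'prompt_score': 0.0,
         'ai_detection_score': 0.7, 'has_prompt': False},
    ]
    for result in batch:
        result['final_score'] = calculate_final_score(
            result['quality_score'],
            result['aesthetics_score'],
            result['prompt_score'],
            result['ai_detection_score'],
            has_prompt=result['has_prompt'],
        )

    logger.info(f"Summary: {generate_score_summary(batch)}")
    logger.info(f"Final-score ranks: {calculate_category_rankings(batch, 'final_score')}")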