import logging

import numpy as np
from scipy import stats  # used by calculate_confidence_intervals, detect_outliers, calculate_score_distribution

logger = logging.getLogger(__name__)
def calculate_final_score(
quality_score: float,
aesthetics_score: float,
prompt_score: float,
ai_detection_score: float,
has_prompt: bool = True
) -> float:
"""
Calculate weighted composite score for image evaluation
Args:
quality_score: Technical image quality (0-10)
aesthetics_score: Visual appeal score (0-10)
prompt_score: Prompt adherence score (0-10)
ai_detection_score: AI generation probability (0-1)
has_prompt: Whether prompt metadata is available
Returns:
Final composite score (0-10)
"""
try:
# Validate input scores
quality_score = max(0.0, min(10.0, quality_score))
aesthetics_score = max(0.0, min(10.0, aesthetics_score))
prompt_score = max(0.0, min(10.0, prompt_score))
ai_detection_score = max(0.0, min(1.0, ai_detection_score))
if has_prompt:
# Standard weights when prompt is available
weights = {
'quality': 0.25, # 25% - Technical quality
'aesthetics': 0.35, # 35% - Visual appeal (highest weight)
'prompt': 0.25, # 25% - Prompt following
'ai_detection': 0.15 # 15% - AI detection (inverted)
}
# Calculate weighted score
score = (
quality_score * weights['quality'] +
aesthetics_score * weights['aesthetics'] +
prompt_score * weights['prompt'] +
(1 - ai_detection_score) * weights['ai_detection']
)
else:
# Redistribute prompt weight when no prompt available
weights = {
'quality': 0.375, # 25% + 12.5% from prompt
'aesthetics': 0.475, # 35% + 12.5% from prompt
'ai_detection': 0.15 # 15% - AI detection (inverted)
}
# Calculate weighted score without prompt
score = (
quality_score * weights['quality'] +
aesthetics_score * weights['aesthetics'] +
(1 - ai_detection_score) * weights['ai_detection']
)
# Ensure score is in valid range
final_score = max(0.0, min(10.0, score))
logger.debug(f"Score calculation - Quality: {quality_score:.2f}, "
f"Aesthetics: {aesthetics_score:.2f}, Prompt: {prompt_score:.2f}, "
f"AI Detection: {ai_detection_score:.3f}, Has Prompt: {has_prompt}, "
f"Final: {final_score:.2f}")
return final_score
except Exception as e:
logger.error(f"Error calculating final score: {str(e)}")
return 5.0 # Default neutral score
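
# Worked example (hand-computed with the standard weights above):
#
#   calculate_final_score(quality_score=7.5, aesthetics_score=8.0,
#                         prompt_score=6.5, ai_detection_score=0.2)
#   = 7.5 * 0.25 + 8.0 * 0.35 + 6.5 * 0.25 + (1 - 0.2) * 0.15
#   = 1.875 + 2.8 + 1.625 + 0.12
#   = 6.42
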
def calculate_category_rankings(scores_list: list, category: str) -> list:
"""
Calculate rankings for a specific category
Args:
scores_list: List of score dictionaries
category: Category to rank by ('quality_score', 'aesthetics_score', etc.)
Returns:
List of rankings (1-based)
"""
try:
if not scores_list or category not in scores_list[0]:
return [1] * len(scores_list)
# Extract scores for the category
category_scores = [item[category] for item in scores_list]
        # Competition-style rankings: higher score = better (lower) rank; ties share a rank
        rankings = []
        for score in category_scores:
            # Rank = 1 + number of strictly higher scores
            rank = 1 + sum(1 for other_score in category_scores if other_score > score)
            rankings.append(rank)
return rankings
except Exception as e:
logger.error(f"Error calculating category rankings: {str(e)}")
return list(range(1, len(scores_list) + 1))
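
# Worked example (hand-computed; ties share a rank):
#
#   calculate_category_rankings(
#       [{'quality_score': 7.0}, {'quality_score': 9.0}, {'quality_score': 7.0}],
#       'quality_score')
#   -> [2, 1, 2]   # the 9.0 ranks first; the two 7.0 entries tie at rank 2
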
def normalize_scores(scores: list, target_range: tuple = (0, 10)) -> list:
"""
Normalize a list of scores to a target range
Args:
scores: List of numerical scores
target_range: Tuple of (min, max) for target range
Returns:
List of normalized scores
"""
try:
if not scores:
return []
min_score = min(scores)
max_score = max(scores)
        # All scores identical: avoid division by zero and map everything to the top of the range
        if max_score == min_score:
            return [target_range[1]] * len(scores)
target_min, target_max = target_range
target_span = target_max - target_min
score_span = max_score - min_score
normalized = []
for score in scores:
normalized_score = target_min + (score - min_score) * target_span / score_span
normalized.append(max(target_min, min(target_max, normalized_score)))
return normalized
except Exception as e:
logger.error(f"Error normalizing scores: {str(e)}")
return scores
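
# Worked example (hand-computed linear rescaling):
#
#   normalize_scores([2.0, 5.0, 8.0], target_range=(0, 10))
#   -> [0.0, 5.0, 10.0]   # min maps to 0, max maps to 10, values in between scale linearly
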
def calculate_confidence_intervals(scores: list, confidence_level: float = 0.95) -> dict:
"""
Calculate confidence intervals for a list of scores
Args:
scores: List of numerical scores
confidence_level: Confidence level (0-1)
Returns:
Dictionary with mean, std, lower_bound, upper_bound
"""
try:
if not scores:
return {'mean': 0, 'std': 0, 'lower_bound': 0, 'upper_bound': 0}
        n = len(scores)
        mean_score = np.mean(scores)
        # Use the sample standard deviation (ddof=1), as is standard for a t-interval
        std_score = np.std(scores, ddof=1) if n > 1 else 0.0
        # Calculate the confidence interval using the t-distribution (scipy.stats imported at module level)
        if n > 1:
            t_value = stats.t.ppf((1 + confidence_level) / 2, n - 1)
            margin_error = t_value * std_score / np.sqrt(n)
        else:
            # A single sample gives no spread estimate; collapse the interval to the mean
            margin_error = 0.0
return {
'mean': float(mean_score),
'std': float(std_score),
'lower_bound': float(mean_score - margin_error),
'upper_bound': float(mean_score + margin_error)
}
except Exception as e:
logger.error(f"Error calculating confidence intervals: {str(e)}")
return {'mean': 0, 'std': 0, 'lower_bound': 0, 'upper_bound': 0}
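
# Worked example (approximate, hand-computed with the sample-std t-interval above):
#
#   calculate_confidence_intervals([6.0, 7.0, 8.0], confidence_level=0.95)
#   -> mean = 7.0, std = 1.0 (sample std), t(0.975, df=2) ~ 4.30,
#      margin ~ 4.30 * 1.0 / sqrt(3) ~ 2.48, so bounds ~ (4.52, 9.48)
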
def detect_outliers(scores: list, method: str = 'iqr') -> list:
"""
Detect outliers in a list of scores
Args:
scores: List of numerical scores
method: Method to use ('iqr', 'zscore', 'modified_zscore')
Returns:
List of boolean values indicating outliers
"""
try:
if not scores or len(scores) < 3:
return [False] * len(scores)
scores_array = np.array(scores)
if method == 'iqr':
# Interquartile Range method
q1 = np.percentile(scores_array, 25)
q3 = np.percentile(scores_array, 75)
iqr = q3 - q1
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr
outliers = (scores_array < lower_bound) | (scores_array > upper_bound)
elif method == 'zscore':
# Z-score method
z_scores = np.abs(stats.zscore(scores_array))
outliers = z_scores > 2.5
        elif method == 'modified_zscore':
            # Modified Z-score method (median/MAD based, more robust to extreme values)
            median = np.median(scores_array)
            mad = np.median(np.abs(scores_array - median))
            if mad == 0:
                # Degenerate spread (e.g. most values identical): report no outliers
                return [False] * len(scores)
            modified_z_scores = 0.6745 * (scores_array - median) / mad
            outliers = np.abs(modified_z_scores) > 3.5
        else:
            # Unknown method: return a plain list directly (it has no .tolist())
            return [False] * len(scores)
        return outliers.tolist()
except Exception as e:
logger.error(f"Error detecting outliers: {str(e)}")
return [False] * len(scores)
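
# Worked example (hand-computed, IQR method with numpy's default percentile interpolation):
#
#   detect_outliers([1.0, 2.0, 2.0, 3.0, 2.0, 50.0], method='iqr')
#   -> [False, False, False, False, False, True]
#   # Q1 = 2.0, Q3 = 2.75, IQR = 0.75, bounds = (0.875, 3.875); only 50.0 falls outside
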
def calculate_score_distribution(scores: list) -> dict:
"""
Calculate distribution statistics for scores
Args:
scores: List of numerical scores
Returns:
Dictionary with distribution statistics
"""
try:
if not scores:
return {}
scores_array = np.array(scores)
distribution = {
'count': len(scores),
'mean': float(np.mean(scores_array)),
'median': float(np.median(scores_array)),
'std': float(np.std(scores_array)),
'min': float(np.min(scores_array)),
'max': float(np.max(scores_array)),
'q1': float(np.percentile(scores_array, 25)),
'q3': float(np.percentile(scores_array, 75)),
'skewness': float(stats.skew(scores_array)),
'kurtosis': float(stats.kurtosis(scores_array))
}
return distribution
except Exception as e:
logger.error(f"Error calculating score distribution: {str(e)}")
return {}
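
# Usage sketch (approximate, hand-computed): the result is a dict of plain floats, e.g.
#
#   calculate_score_distribution([5.0, 6.0, 7.0, 8.0])
#   -> {'count': 4, 'mean': 6.5, 'median': 6.5, 'std': ~1.118 (population std),
#       'min': 5.0, 'max': 8.0, 'q1': 5.75, 'q3': 7.25, 'skewness': 0.0, 'kurtosis': ~-1.36}
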
def apply_score_adjustments(
scores: dict,
adjustments: dict = None
) -> dict:
"""
Apply custom score adjustments based on specific criteria
Args:
scores: Dictionary of scores
adjustments: Dictionary of adjustment parameters
Returns:
Dictionary of adjusted scores
"""
try:
if adjustments is None:
adjustments = {}
adjusted_scores = scores.copy()
# Apply anime mode adjustments
if adjustments.get('anime_mode', False):
# Boost aesthetics score for anime images
if 'aesthetics_score' in adjusted_scores:
adjusted_scores['aesthetics_score'] *= 1.1
adjusted_scores['aesthetics_score'] = min(10.0, adjusted_scores['aesthetics_score'])
# Apply quality penalties for low resolution
if adjustments.get('penalize_low_resolution', True):
width = adjustments.get('width', 1024)
height = adjustments.get('height', 1024)
total_pixels = width * height
if total_pixels < 262144: # Less than 512x512
penalty = 0.8
if 'quality_score' in adjusted_scores:
adjusted_scores['quality_score'] *= penalty
# Apply prompt complexity adjustments
prompt_length = adjustments.get('prompt_length', 0)
if prompt_length > 0 and 'prompt_score' in adjusted_scores:
if prompt_length > 100: # Very long prompts are harder to follow
adjusted_scores['prompt_score'] *= 0.95
elif prompt_length < 10: # Very short prompts are easier
adjusted_scores['prompt_score'] *= 1.05
adjusted_scores['prompt_score'] = min(10.0, adjusted_scores['prompt_score'])
return adjusted_scores
except Exception as e:
logger.error(f"Error applying score adjustments: {str(e)}")
return scores
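
# Worked example (hand-computed, up to float rounding; low-resolution penalty enabled by default):
#
#   apply_score_adjustments(
#       {'quality_score': 8.0, 'aesthetics_score': 7.0},
#       {'anime_mode': True, 'width': 400, 'height': 400})
#   -> {'quality_score': 6.4, 'aesthetics_score': 7.7}
#   # anime_mode boosts aesthetics by 10% (7.0 -> 7.7, capped at 10.0);
#   # 400x400 = 160000 px < 262144, so quality takes the 0.8 penalty (8.0 -> 6.4)
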
def generate_score_summary(results_list: list) -> dict:
"""
Generate summary statistics for a batch of evaluation results
Args:
results_list: List of result dictionaries
Returns:
Dictionary with summary statistics
"""
try:
if not results_list:
return {}
# Extract scores by category
categories = ['quality_score', 'aesthetics_score', 'prompt_score', 'ai_detection_score', 'final_score']
summary = {}
for category in categories:
if category in results_list[0]:
scores = [result[category] for result in results_list if category in result]
if scores:
summary[category] = calculate_score_distribution(scores)
# Calculate overall statistics
final_scores = [result['final_score'] for result in results_list if 'final_score' in result]
if final_scores:
summary['overall'] = {
'total_images': len(results_list),
                'average_score': float(np.mean(final_scores)),
'best_score': max(final_scores),
'worst_score': min(final_scores),
'score_range': max(final_scores) - min(final_scores),
'images_with_prompts': sum(1 for r in results_list if r.get('has_prompt', False))
}
return summary
except Exception as e:
logger.error(f"Error generating score summary: {str(e)}")
return {}
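
# Minimal usage sketch (hypothetical result dictionaries, illustrative only):
if __name__ == "__main__":
    logging.basicConfig(level=logging.DEBUG)
    demo_results = [
        {
            'quality_score': 7.5, 'aesthetics_score': 8.0, 'prompt_score': 6.5,
            'ai_detection_score': 0.2, 'has_prompt': True,
        },
        {
            'quality_score': 5.0, 'aesthetics_score': 6.0, 'prompt_score': 0.0,
            'ai_detection_score': 0.7, 'has_prompt': False,
        },
    ]
    # Attach composite scores, then summarize the (tiny) batch
    for result in demo_results:
        result['final_score'] = calculate_final_score(
            result['quality_score'], result['aesthetics_score'],
            result['prompt_score'], result['ai_detection_score'],
            has_prompt=result['has_prompt'],
        )
    print(generate_score_summary(demo_results))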