""" Scoring Module Handles normalization and composite scoring for SQL evaluation results. """ import math import numpy as np from typing import Dict, Any, List from dataclasses import dataclass @dataclass class Metrics: """Evaluation metrics for a SQL query.""" correctness_exact: float # 0.0 or 1.0 result_match_f1: float # 0.0 to 1.0 exec_success: float # 0.0 or 1.0 latency_ms: float # milliseconds readability: float # 0.0 to 1.0 (based on SQL structure) dialect_ok: float # 0.0 or 1.0 class ScoringEngine: """Engine for computing composite scores from evaluation metrics.""" def __init__(self): # Weights for composite scoring (sum should be 1.0) self.weights = { 'correctness_exact': 0.4, # Most important 'exec_success': 0.25, # Very important 'result_match_f1': 0.15, # Important for partial credit 'dialect_ok': 0.1, # Important for dialect compliance 'readability': 0.05, # Minor factor 'latency': 0.05 # Minor factor (normalized) } # Latency normalization parameters self.latency_min_ms = 10.0 # Minimum expected latency self.latency_max_ms = 10000.0 # Maximum expected latency def normalize_latency(self, latency_ms: float) -> float: """Normalize latency using log scale.""" if latency_ms <= 0: return 0.0 # Clamp to reasonable bounds latency_ms = max(self.latency_min_ms, min(latency_ms, self.latency_max_ms)) # Log normalization: log(latency) / log(max_latency) normalized = math.log(latency_ms) / math.log(self.latency_max_ms) # Invert so lower latency = higher score return 1.0 - normalized def compute_readability_score(self, sql: str) -> float: """Compute readability score based on SQL structure.""" if not sql or not sql.strip(): return 0.0 sql = sql.strip().upper() score = 0.0 # Basic structure checks if 'SELECT' in sql: score += 0.2 if 'FROM' in sql: score += 0.2 if sql.count('(') == sql.count(')'): # Balanced parentheses score += 0.1 # Formatting checks if '\n' in sql: # Multi-line formatting score += 0.1 if sql.count(' ') > 5: # Proper spacing score += 0.1 # Complexity checks (more complex = slightly lower readability) complexity_penalty = 0.0 if sql.count('JOIN') > 2: complexity_penalty += 0.1 if sql.count('CASE') > 0: complexity_penalty += 0.05 if sql.count('(') > 3: complexity_penalty += 0.05 score = max(0.0, score - complexity_penalty) return min(1.0, score) def compute_composite_score(self, metrics: Metrics) -> float: """Compute composite score from individual metrics.""" # Normalize latency normalized_latency = self.normalize_latency(metrics.latency_ms) # Compute readability if not provided if metrics.readability == 0.0: # This would need the actual SQL, but for now we'll use a default metrics.readability = 0.8 # Default reasonable readability # Weighted sum composite_score = ( self.weights['correctness_exact'] * metrics.correctness_exact + self.weights['exec_success'] * metrics.exec_success + self.weights['result_match_f1'] * metrics.result_match_f1 + self.weights['dialect_ok'] * metrics.dialect_ok + self.weights['readability'] * metrics.readability + self.weights['latency'] * normalized_latency ) return round(composite_score, 4) def compute_composite_score_from_dict(self, metrics_dict: Dict[str, Any]) -> float: """Compute composite score from metrics dictionary.""" metrics = Metrics( correctness_exact=metrics_dict.get('correctness_exact', 0.0), result_match_f1=metrics_dict.get('result_match_f1', 0.0), exec_success=metrics_dict.get('exec_success', 0.0), latency_ms=metrics_dict.get('latency_ms', 0.0), readability=metrics_dict.get('readability', 0.0), 
dialect_ok=metrics_dict.get('dialect_ok', 0.0) ) return self.compute_composite_score(metrics) def get_score_breakdown(self, metrics: Metrics) -> Dict[str, float]: """Get detailed breakdown of how the composite score was computed.""" normalized_latency = self.normalize_latency(metrics.latency_ms) breakdown = { 'correctness_exact': self.weights['correctness_exact'] * metrics.correctness_exact, 'exec_success': self.weights['exec_success'] * metrics.exec_success, 'result_match_f1': self.weights['result_match_f1'] * metrics.result_match_f1, 'dialect_ok': self.weights['dialect_ok'] * metrics.dialect_ok, 'readability': self.weights['readability'] * metrics.readability, 'latency': self.weights['latency'] * normalized_latency, 'composite_score': self.compute_composite_score(metrics) } return breakdown # Global scoring engine instance scoring_engine = ScoringEngine()
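

# A minimal usage sketch. The metric values below are made-up illustrations,
# not outputs from a real evaluation run.
if __name__ == "__main__":
    example = Metrics(
        correctness_exact=1.0,
        result_match_f1=1.0,
        exec_success=1.0,
        latency_ms=120.0,
        readability=0.0,  # unset; scoring falls back to default_readability
        dialect_ok=1.0,
    )
    print("composite:", scoring_engine.compute_composite_score(example))
    print("breakdown:", scoring_engine.get_score_breakdown(example))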