nexar-driving-leaderboard / comparison.py
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, balanced_accuracy_score
import warnings
warnings.filterwarnings("ignore", category=UserWarning, message="y_pred contains classes not in y_true")
sns.set_style("whitegrid")
class ModelEvaluator:
    def __init__(self, df_labels, df_predictions, model_name,
                 categories=['main-event', 'location', 'zone', 'light-conditions', 'weather-conditions', 'vehicles-density']):
        """
        Initialize the evaluator with ground truth labels and model predictions.

        :param df_labels: DataFrame of ground-truth labels, one row per 'id'.
        :param df_predictions: DataFrame of model predictions, one row per 'id'.
        :param model_name: Name used to identify this model in the results.
        :param categories: Label categories to evaluate; each is expected to exist as a column in both DataFrames.
        """
self.df_labels = df_labels
self.df_predictions = df_predictions
self.model_name = model_name
self.categories = categories
self.metrics_df = self.compute_metrics()
def merge_data(self):
"""Merge ground truth labels with predictions based on 'id'."""
merged_df = pd.merge(self.df_labels, self.df_predictions, on='id', suffixes=('_true', '_pred'))
        # Drop rows whose prediction for a category (other than 'main-event') is not
        # part of that category's ground-truth vocabulary.
        for category in set(self.categories) - {'main-event'}:
            valid_values = self.df_labels[category].unique().astype(str)
            merged_df = merged_df[merged_df[f"{category}_pred"].astype(str).isin(valid_values)]
return merged_df
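    # Minimal illustration of the merge behaviour (comments only, not executed); the
    # tiny frames below are hypothetical and just show the '_true'/'_pred' suffixes:
    #
    #   labels = pd.DataFrame({'id': [1], 'zone': ['urban']})
    #   preds  = pd.DataFrame({'id': [1], 'zone': ['rural']})
    #   pd.merge(labels, preds, on='id', suffixes=('_true', '_pred'))
    #   #    id zone_true zone_pred
    #   # 0   1     urban     rural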
def compute_metrics(self):
"""Compute precision, recall, F1-score, accuracy, and balanced accuracy for each class and category."""
merged_df = self.merge_data()
categories = self.categories
results = []
for category in categories:
true_col = f"{category}_true"
pred_col = f"{category}_pred"
if true_col not in merged_df.columns or pred_col not in merged_df.columns:
print(f"Skipping {category} - missing columns")
continue
filtered_df = merged_df[merged_df[true_col] != "unknown"]
if filtered_df.empty:
print(f"Skipping {category} - only 'unknown' values present.")
continue
y_true = filtered_df[true_col].astype(str)
y_pred = filtered_df[pred_col].astype(str)
valid_labels = sorted(set(y_true) | set(y_pred))
valid_labels = [label for label in valid_labels if (y_true == label).sum() > 0 and label != "unknown"]
if not valid_labels:
print(f"Skipping {category} - No valid labels found after filtering.")
continue
class_precisions = precision_score(y_true, y_pred, labels=valid_labels, average=None, zero_division=0)
class_recalls = recall_score(y_true, y_pred, labels=valid_labels, average=None, zero_division=0)
class_f1 = f1_score(y_true, y_pred, labels=valid_labels, average=None, zero_division=0)
overall_precision = precision_score(y_true, y_pred, labels=valid_labels, average='macro', zero_division=0)
overall_recall = recall_score(y_true, y_pred, labels=valid_labels, average='macro', zero_division=0)
overall_f1 = f1_score(y_true, y_pred, labels=valid_labels, average='macro', zero_division=0)
overall_accuracy = accuracy_score(y_true, y_pred)
overall_balanced_acc = balanced_accuracy_score(y_true, y_pred)
for i, label in enumerate(valid_labels):
results.append({
"Model": self.model_name,
"Category": category,
"Class": label,
"Precision": class_precisions[i],
"Recall": class_recalls[i],
"F1 Score": class_f1[i],
"Accuracy": np.nan,
"Balanced Acc.": np.nan,
"Support": (y_true == label).sum()
})
results.append({
"Model": self.model_name,
"Category": category,
"Class": f"Overall ({category})",
"Precision": overall_precision,
"Recall": overall_recall,
"F1 Score": overall_f1,
"Accuracy": overall_accuracy,
"Balanced Acc.": overall_balanced_acc,
"Support": len(y_true)
})
df_res = pd.DataFrame(results)
return df_res.loc[df_res['Support'] > 0].reset_index(drop=True)
def get_metrics_df(self):
"""Return the computed metrics DataFrame."""
return self.metrics_df
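# Usage sketch for a single model (comments only; the CSV names are hypothetical placeholders):
#
#   df_labels = pd.read_csv("labels.csv")
#   df_preds = pd.read_csv("model_a_predictions.csv")
#   evaluator = ModelEvaluator(df_labels, df_preds, model_name="Model A")
#   print(evaluator.get_metrics_df().head())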
class ModelComparison:
def __init__(self, evaluators):
"""
Compare multiple models based on their evaluation results.
:param evaluators: List of ModelEvaluator instances.
"""
self.evaluators = evaluators
self.combined_df = self.aggregate_metrics()
def aggregate_metrics(self):
"""Merge evaluation metrics from multiple models into a single DataFrame."""
dfs = [evaluator.get_metrics_df() for evaluator in self.evaluators]
return pd.concat(dfs, ignore_index=True)
def plot_category_comparison(self, metric="F1 Score"):
"""Compare models at the category level using a grouped bar chart with consistent styling."""
df = self.combined_df[self.combined_df['Class'].str.contains("Overall")]
plt.figure(figsize=(12, 6))
colors = sns.color_palette("Set2", len(df["Model"].unique())) # Consistent palette
ax = sns.barplot(
data=df, x="Category", y=metric, hue="Model", palette=colors, edgecolor="black", alpha=0.85
)
plt.title(f"{metric} Comparison Across Categories", fontsize=14, fontweight="bold")
plt.ylim(0, 1)
plt.xticks(rotation=45, fontsize=12)
plt.yticks(fontsize=12)
plt.xlabel("Category", fontsize=12)
plt.ylabel(metric, fontsize=12)
plt.legend(title="Model", fontsize=11, loc="upper left")
plt.grid(axis="y", linestyle="--", alpha=0.6)
plt.tight_layout()
plt.show()
def plot_per_class_comparison(self, category, metric="F1 Score"):
"""Compare models for a specific category across individual classes with a standardized design."""
df = self.combined_df[(self.combined_df["Category"] == category) & (~self.combined_df["Class"].str.contains("Overall"))]
plt.figure(figsize=(12, 6))
colors = sns.color_palette("Set2", len(df["Model"].unique())) # Consistent palette
ax = sns.barplot(
data=df, x="Class", y=metric, hue="Model", palette=colors, edgecolor="black", alpha=0.85
)
plt.title(f"{metric} for {category} by Model", fontsize=14, fontweight="bold")
plt.ylim(0, 1)
plt.xticks(rotation=45, fontsize=12)
plt.yticks(fontsize=12)
plt.xlabel("Class", fontsize=12)
plt.ylabel(metric, fontsize=12)
plt.legend(title="Model", fontsize=11, loc="upper left")
plt.grid(axis="y", linestyle="--", alpha=0.6)
plt.tight_layout()
plt.show()
def plot_precision_recall_per_class(self, class_name=None):
"""
Creates a grouped bar chart per class, displaying precision & recall side by side for all models.
Ensures a consistent design with plot_per_class_comparison and plot_category_comparison.
:param class_name: (str) If provided, only this class will be plotted. If None, all classes will be plotted.
"""
sns.set_style("whitegrid")
# Determine which classes to plot
if class_name:
unique_classes = [class_name]
else:
unique_classes = self.combined_df["Class"].unique()
models = self.combined_df["Model"].unique()
num_models = len(models)
        bar_width = 0.35  # Standardized width for better readability
colors = sns.color_palette("Set2", num_models) # Consistent color palette
for class_name in unique_classes:
df_class = self.combined_df[self.combined_df["Class"] == class_name]
if df_class.empty:
print(f"No data available for class: {class_name}")
continue
plt.figure(figsize=(12, 6))
metrics = ["Precision", "Recall"]
x_indices = np.arange(len(metrics)) # X positions for metrics
for i, model in enumerate(models):
df_model = df_class[df_class["Model"] == model]
if df_model.empty:
continue
precision = df_model["Precision"].values[0]
recall = df_model["Recall"].values[0]
# Plot bars for Precision and Recall with consistent style
plt.bar(
x_indices + (i * bar_width), # No spacing, perfectly aligned
[precision, recall],
width=bar_width,
label=model,
color=colors[i],
alpha=0.85,
edgecolor="black" # Matches the other plots
)
plt.xlabel("Metric", fontsize=12)
plt.ylabel("Score", fontsize=12)
plt.title(f"Precision & Recall for Class: {class_name}", fontsize=14, fontweight="bold")
# Adjust x-tick positions to align properly
plt.xticks(x_indices + ((bar_width * (num_models - 1)) / 2), metrics, fontsize=12)
plt.ylim(0, 1)
plt.legend(title="Model", fontsize=11, loc="upper left")
plt.grid(axis="y", linestyle="--", alpha=0.6)
plt.tight_layout()
plt.show()
def plot_recall_trends(self, selected_models=None):
"""
Plot recall trends per class across different models, sorted by recall values in descending order.
:param selected_models: List of model names to include in the plot. If None, all models in the dataset will be used.
"""
sns.set_style("whitegrid")
# If no specific models are provided, use all available models in the dataset
if selected_models is None:
selected_models = self.combined_df["Model"].unique().tolist()
# Filter dataset to include only selected models
df_filtered = self.combined_df[self.combined_df["Model"].isin(selected_models)]
df_filtered_no_overall = df_filtered[~df_filtered["Class"].str.contains("Overall")]
# Sort by recall values in descending order
df_sorted = df_filtered_no_overall.sort_values(by="Recall", ascending=False)
plt.figure(figsize=(12, 6))
unique_classes = df_sorted["Class"].unique()
# Define colors for models
colors = dict(zip(selected_models, sns.color_palette("Set2", len(selected_models))))
# Connect corresponding classes across models with thin lines (drawn first)
for class_name in unique_classes:
class_data = df_sorted[df_sorted["Class"] == class_name]
if len(class_data) > 1:
plt.plot(
class_data["Class"], class_data["Recall"],
linestyle="-", alpha=0.5, color="gray", linewidth=1.5, zorder=1
)
        # Plot scatter points after the lines so they render on top
for model in selected_models:
model_data = df_sorted[df_sorted["Model"] == model]
plt.scatter(
model_data["Class"], model_data["Recall"],
label=model, color=colors[model], edgecolor="black", s=120, alpha=1.0, zorder=2
)
plt.xlabel("Class", fontsize=12)
plt.ylabel("Recall", fontsize=12)
plt.xticks(rotation=45, ha="right", fontsize=12)
plt.yticks(fontsize=12)
plt.title("Recall per Class for Selected Models (Sorted by Recall)", fontsize=14, fontweight="bold")
# Move legend to the right
plt.legend(title="Model", fontsize=11, loc="upper right", bbox_to_anchor=(1.15, 1))
plt.grid(axis="y", linestyle="--", alpha=0.6)
plt.tight_layout()
plt.show()
    def plot_metric(self, metric_name, figsize=(10, None), bar_height=0.8, palette="Set2", bar_spacing=0):
        """
        Create a hierarchical bar chart of a metric with category headers, sorted by
        category-average value in descending order. Note: bar_spacing is currently
        unused; model bars within a group are drawn flush against each other.
        """
colors = sns.color_palette(palette, len(self.evaluators))
models = list(self.combined_df["Model"].unique())
df = self.combined_df.copy()
df = df.drop_duplicates(subset=['Category', 'Class', 'Model', metric_name])
# Calculate average support per class
avg_support = df.groupby(['Category', 'Class'])['Support'].mean().round().astype(int)
# Function to safely retrieve metric values
def safe_get_value(model, category, class_name):
mask = (
(df['Model'] == model) &
(df['Category'] == category) &
(df['Class'] == class_name)
)
values = df.loc[mask, metric_name]
return values.iloc[0] if not values.empty else np.nan
# Calculate category averages, excluding 'Global', and sort descending
df_no_global = df[df['Category'] != 'Global']
cat_avgs = df_no_global.groupby('Category', observed=False)[metric_name].mean()
cat_avgs = cat_avgs.sort_values(ascending=False)
categories_ordered = list(cat_avgs.index)
if 'Global' in df['Category'].unique():
categories_ordered.append('Global')
plot_data = []
yticks = []
ylabels = []
y_pos = 0
category_positions = {}
# Process each category and its classes
for category in categories_ordered:
if category == 'Global':
continue
category_data = df[df['Category'] == category]
overall_class_name = f"Overall ({category})"
mask_overall = category_data['Class'] == overall_class_name
category_data_overall = category_data[mask_overall]
category_data_regular = category_data[~mask_overall]
if not category_data_regular.empty:
class_means = category_data_regular.groupby('Class')[metric_name].mean()
class_means = class_means.sort_values(ascending=False)
sorted_regular_classes = list(class_means.index)
else:
sorted_regular_classes = []
# Add category header
category_start = y_pos
yticks.append(y_pos)
ylabels.append(category.upper())
y_pos += 1
# Add regular classes
for class_name in sorted_regular_classes:
values = {model: safe_get_value(model, category, class_name) for model in models}
if any(not np.isnan(v) for v in values.values()):
plot_data.append({
'category': category,
'label': class_name,
'y_pos': y_pos,
'values': values,
'is_category': False
})
support = avg_support.get((category, class_name), 0)
yticks.append(y_pos)
ylabels.append(f" {class_name} (n={support:,})")
y_pos += 1
# Add overall class if exists
if not category_data_overall.empty:
values = {model: safe_get_value(model, category, overall_class_name) for model in models}
if any(not np.isnan(v) for v in values.values()):
plot_data.append({
'category': category,
'label': overall_class_name,
'y_pos': y_pos,
'values': values,
'is_category': False
})
support = avg_support.get((category, overall_class_name), 0)
yticks.append(y_pos)
ylabels.append(f" {overall_class_name} (n={support:,})")
y_pos += 1
category_positions[category] = {
'start': category_start,
'end': y_pos - 1
}
y_pos += 0.5 # Spacing between categories
# Calculate dynamic figure height based on number of items
total_items = len(plot_data) + len(categories_ordered)
dynamic_height = max(6, total_items * 0.4)
if figsize[1] is None:
figsize = (figsize[0], dynamic_height)
# Plot the bars
bar_width = bar_height / len(models) # No extra spacing
fig, ax = plt.subplots(figsize=figsize)
for category in categories_ordered:
if category == 'Global':
continue
cat_start = category_positions[category]['start'] - 0.4
cat_end = category_positions[category]['end'] + 0.4
ax.axhspan(cat_start, cat_end, color='lightgray', alpha=0.2, zorder=0)
for i, (model, color) in enumerate(zip(models, colors)):
positions = []
values = []
for item in plot_data:
if not item.get('is_category', False):
                    positions.append(item['y_pos'] + (i - (len(models) - 1) / 2) * bar_width)
values.append(item['values'].get(model, np.nan))
ax.barh(
positions, values, height=bar_width,
label=model, color=color, alpha=0.85, edgecolor="black"
)
# Main title
ax.set_title(f'{metric_name} Comparison Across Models', fontsize=16, fontweight='bold', pad=20)
# Adjust axis labels and formatting
ax.set_yticks(yticks)
ax.set_yticklabels(ylabels, fontsize=10)
ax.set_xlabel(metric_name, fontsize=12)
ax.grid(True, axis='x', linestyle="--", alpha=0.7)
# Invert y-axis to align properly
ax.invert_yaxis()
plt.legend(title="Model", bbox_to_anchor=(1.05, 1), loc='upper left')
# Adjust layout with tighter margins
plt.subplots_adjust(left=0.25, right=0.8, top=0.95, bottom=0.1)
plt.tight_layout()
return fig
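    # Example call (hypothetical instance name and output path):
    #   fig = comparison.plot_metric("F1 Score")
    #   fig.savefig("f1_by_category.png", bbox_inches="tight")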
def plot_precision_recall_for_category(self, category, palette="Set2"):
"""
Creates a modernized Precision-Recall scatter plot for each class within a given category.
"""
        import math  # only import not already available at module level
# Set modern style
plt.rcParams['font.size'] = 12
# Filter data for the selected category
df = self.combined_df[self.combined_df["Category"] == category].copy()
if df.empty:
print(f"No data available for category: {category}")
return None
# Remove overall category-level rows
class_data = df[~df["Class"].str.contains("Overall")]
# Get unique models and classes
models = df["Model"].unique()
colors = dict(zip(models, sns.color_palette(palette, len(models))))
classes = sorted(class_data["Class"].unique())
# Determine grid size
cols = 2
rows = math.ceil(len(classes) / cols)
# Create figure with adjusted size
fig, axes = plt.subplots(rows, cols, figsize=(16, rows * 6))
# Set global title with better spacing
fig.suptitle(f'Precision-Recall Analysis for {category}',
fontsize=20, fontweight='bold', y=1.02)
# Iterate over classes and create scatter plots
for i, class_name in enumerate(classes):
row, col = divmod(i, cols)
ax = axes[row, col] if rows > 1 else axes[col] # Ensure indexing works for 1-row cases
# Create scatter plot
class_subset = class_data[class_data["Class"] == class_name]
sns.scatterplot(
data=class_subset,
x="Precision",
y="Recall",
hue="Model",
palette=colors,
ax=ax,
s=200,
alpha=0.85,
edgecolor="black"
)
# Add labels with lines for each point
            for _, point in class_subset.iterrows():
                ax.annotate(
                    point["Model"],
                    (point["Precision"], point["Recall"]),
xytext=(8, 8), textcoords='offset points', # Adjusted to reduce overlap
bbox=dict(facecolor='white', alpha=0.7),
arrowprops=dict(
arrowstyle='->',
connectionstyle='arc3,rad=0.2',
color='black'
)
)
ax.set_title(f"Class: {class_name}", fontsize=16, fontweight="bold", pad=20)
ax.set_xlim(-0.05, 1.05)
ax.set_ylim(-0.05, 1.05)
ax.grid(True, linestyle="--", alpha=0.5)
ax.set_aspect("equal", adjustable="box")
# Remove legend as we now have direct labels
            legend = ax.get_legend()
            if legend is not None:  # guard in case seaborn did not attach a legend
                legend.remove()
# Add labels
ax.set_xlabel("Precision", fontsize=14)
ax.set_ylabel("Recall", fontsize=14)
# Remove empty subplots if classes < grid size
for j in range(i + 1, rows * cols):
fig.delaxes(axes.flatten()[j])
# Adjust layout with better spacing
fig.subplots_adjust(top=0.92, bottom=0.08, left=0.08, right=0.92, hspace=0.35, wspace=0.3)
return fig
def plot_normalized_radar_chart(self, metric_name="F1 Score", exclude_categories=None, figsize=(12, 10), palette="Set2"):
"""
Create a normalized radar chart comparing performance across different categories.
Each vertex is normalized independently based on its maximum value.
"""
        from matplotlib.patches import Circle  # only import not already available at module level
sns.set_style("whitegrid")
# Copy data and filter exclusions
df = self.combined_df.copy()
if exclude_categories:
df = df[~df["Category"].isin(exclude_categories)]
# Get unique categories and models
categories = sorted(df["Category"].unique())
models = sorted(df["Model"].unique())
# Define colors for models
colors = dict(zip(models, sns.color_palette(palette, len(models))))
# Create figure
fig = plt.figure(figsize=figsize)
ax = plt.subplot(111, polar=True)
        # Add subtle background circles (uses the polar axes' private transData._b
        # transform to place circles in data coordinates)
for radius in np.linspace(0, 1, 5):
circle = Circle((0, 0), radius, transform=ax.transData._b,
fill=True, color='gray', alpha=0.03)
ax.add_artist(circle)
# Number of categories and angles
N = len(categories)
angles = np.linspace(0, 2 * np.pi, N, endpoint=False).tolist()
angles += angles[:1] # Close the circle
# Get max values for each category
df_overall = df[df["Class"].str.contains("Overall")]
max_values = df_overall.groupby("Category")[metric_name].max().to_dict()
# Store normalized values for all models
normalized_values = {}
# Normalize values for each model
for model in models:
values = []
for cat in categories:
val = df_overall[(df_overall["Model"] == model) &
(df_overall["Category"] == cat)][metric_name].values
val = val[0] if len(val) > 0 else 0
norm_val = val / max_values[cat] if max_values[cat] > 0 else 0
values.append(norm_val)
normalized_values[model] = values + [values[0]]
# Plot each model with improved styling
for model, values in normalized_values.items():
color = colors[model]
# Add filled area with gradient
ax.fill(angles, values, color=color, alpha=0.15,
edgecolor=color, linewidth=0.5)
# Add main line
ax.plot(angles, values,
linewidth=2.5, linestyle='solid',
label=model, color=color, alpha=0.85,
zorder=5)
# Adjust axis settings
ax.set_theta_offset(np.pi / 2)
ax.set_theta_direction(-1)
ax.set_yticklabels([])
# Draw category labels
ax.set_thetagrids(np.degrees(angles[:-1]), categories,
fontsize=12, fontweight="bold")
# Add scale labels with improved positioning
for i, (category, angle) in enumerate(zip(categories, angles[:-1])):
max_val = max_values[category]
scales = np.linspace(0, max_val, 5)
for j, scale in enumerate(scales):
radius = j/4
# Skip zero as we'll add it separately in the center
if radius > 0:
ha = 'center'
va = 'center'
ax.text(angle, radius, f'{scale:.2f}',
ha=ha, va=va,
color='gray', fontsize=9, fontweight='bold')
# Add centered zero
ax.text(0, 0, '0.00',
ha='center', va='center',
color='gray', fontsize=9, fontweight='bold')
# Customize grid with softer lines
ax.grid(True, color='gray', alpha=0.3, linewidth=0.5)
ax.yaxis.grid(True, color='gray', alpha=0.3, linewidth=0.5)
ax.set_rticks(np.linspace(0, 1, 5))
# Add a subtle background color to the entire plot
ax.set_facecolor('#f8f9fa')
# Set title and legend
plt.title(f'Model Performance Radar Chart - {metric_name}',
pad=20, fontsize=14, fontweight="bold")
plt.legend(title="Model", fontsize=11,
loc="upper right", bbox_to_anchor=(1.3, 1))
# Adjust aspect ratio
ax.set_aspect('equal')
plt.tight_layout()
return fig
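    # Example call (hypothetical instance name; 'main-event' is one of the default categories):
    #   fig = comparison.plot_normalized_radar_chart("F1 Score", exclude_categories=["main-event"])
    #   fig.savefig("radar_f1.png", bbox_inches="tight")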
    def transform_to_leaderboard(self):
        """Pivot the category-level ('Overall') metrics into a ranked leaderboard table."""
        df = self.combined_df.copy()
        # Keep only the category-level ('Overall') rows, which carry Accuracy in
        # addition to macro Precision/Recall/F1; per-class rows are dropped.
        df = df[df['Class'].str.contains('Overall', na=False)]
# Pivot the table so that each category gets its own set of columns
pivoted_df = df.pivot_table(
index='Model',
columns='Category',
values=['Precision', 'Recall', 'F1 Score', 'Accuracy'],
aggfunc='mean' # Take mean in case of multiple entries
)
# Flatten the multi-level columns
pivoted_df.columns = ['_'.join(col).strip() for col in pivoted_df.columns.values]
# Calculate the average F1 score across all categories for ranking
pivoted_df['Average F1 Score'] = pivoted_df.filter(like='F1 Score').mean(axis=1)
# Move 'Average F1 Score' to be the first column after 'Model'
pivoted_df = pivoted_df.reset_index()
cols = ['Model', 'Average F1 Score'] + [col for col in pivoted_df.columns if col not in ['Model', 'Average F1 Score']]
pivoted_df = pivoted_df[cols]
# Rank models based on their average F1 Score
pivoted_df = pivoted_df.sort_values(by='Average F1 Score', ascending=False).reset_index(drop=True)
pivoted_df.insert(0, 'Rank', range(1, len(pivoted_df) + 1))
return pivoted_df
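# End-to-end usage sketch (guarded so it only runs when executed directly). The CSV
# file names below are hypothetical placeholders for the real label/prediction exports.
if __name__ == "__main__":
    df_labels = pd.read_csv("labels.csv")
    evaluators = [
        ModelEvaluator(df_labels, pd.read_csv("model_a_predictions.csv"), "Model A"),
        ModelEvaluator(df_labels, pd.read_csv("model_b_predictions.csv"), "Model B"),
    ]
    comparison = ModelComparison(evaluators)
    comparison.plot_category_comparison(metric="F1 Score")
    comparison.plot_per_class_comparison(category="weather-conditions")
    print(comparison.transform_to_leaderboard().to_string(index=False))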