nexar-driving-leaderboard / comparison.py
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, balanced_accuracy_score
import warnings
warnings.filterwarnings("ignore", category=UserWarning, message="y_pred contains classes not in y_true")
sns.set_style("whitegrid")
class ModelEvaluator:
    def __init__(self, df_labels, df_predictions, model_name,
                 categories=['main-event', 'location', 'zone', 'light-conditions', 'weather-conditions', 'vehicles-density']):
        """
        Initialize the evaluator with ground truth labels and model predictions.

        :param df_labels: DataFrame of ground-truth labels, one row per 'id'.
        :param df_predictions: DataFrame of model predictions, one row per 'id'.
        :param model_name: Name used to identify this model in the results.
        :param categories: Label categories to evaluate; each is expected to exist as a column in both DataFrames.
        """
self.df_labels = df_labels
self.df_predictions = df_predictions
self.model_name = model_name
self.categories = categories
self.metrics_df = self.compute_metrics()
def merge_data(self):
"""Merge ground truth labels with predictions based on 'id'."""
merged_df = pd.merge(self.df_labels, self.df_predictions, on='id', suffixes=('_true', '_pred'))
        # Drop rows whose prediction for a category (other than 'main-event') is not
        # part of that category's ground-truth vocabulary.
        for category in set(self.categories) - {'main-event'}:
            valid_values = self.df_labels[category].unique().astype(str)
            merged_df = merged_df[merged_df[f"{category}_pred"].astype(str).isin(valid_values)]
return merged_df
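    # Minimal illustration of the merge behaviour (comments only, not executed); the
    # tiny frames below are hypothetical and just show the '_true'/'_pred' suffixes:
    #
    #   labels = pd.DataFrame({'id': [1], 'zone': ['urban']})
    #   preds  = pd.DataFrame({'id': [1], 'zone': ['rural']})
    #   pd.merge(labels, preds, on='id', suffixes=('_true', '_pred'))
    #   #    id zone_true zone_pred
    #   # 0   1     urban     rural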
def compute_metrics(self):
"""Compute precision, recall, F1-score, accuracy, and balanced accuracy for each class and category."""
merged_df = self.merge_data()
categories = self.categories
results = []
for category in categories:
true_col = f"{category}_true"
pred_col = f"{category}_pred"
if true_col not in merged_df.columns or pred_col not in merged_df.columns:
print(f"Skipping {category} - missing columns")
continue
filtered_df = merged_df[merged_df[true_col] != "unknown"]
if filtered_df.empty:
print(f"Skipping {category} - only 'unknown' values present.")
continue
y_true = filtered_df[true_col].astype(str)
y_pred = filtered_df[pred_col].astype(str)
valid_labels = sorted(set(y_true) | set(y_pred))
valid_labels = [label for label in valid_labels if (y_true == label).sum() > 0 and label != "unknown"]
if not valid_labels:
print(f"Skipping {category} - No valid labels found after filtering.")
continue
class_precisions = precision_score(y_true, y_pred, labels=valid_labels, average=None, zero_division=0)
class_recalls = recall_score(y_true, y_pred, labels=valid_labels, average=None, zero_division=0)
class_f1 = f1_score(y_true, y_pred, labels=valid_labels, average=None, zero_division=0)
overall_precision = precision_score(y_true, y_pred, labels=valid_labels, average='macro', zero_division=0)
overall_recall = recall_score(y_true, y_pred, labels=valid_labels, average='macro', zero_division=0)
overall_f1 = f1_score(y_true, y_pred, labels=valid_labels, average='macro', zero_division=0)
overall_accuracy = accuracy_score(y_true, y_pred)
overall_balanced_acc = balanced_accuracy_score(y_true, y_pred)
for i, label in enumerate(valid_labels):
results.append({
"Model": self.model_name,
"Category": category,
"Class": label,
"Precision": class_precisions[i],
"Recall": class_recalls[i],
"F1 Score": class_f1[i],
"Accuracy": np.nan,
"Balanced Acc.": np.nan,
"Support": (y_true == label).sum()
})
results.append({
"Model": self.model_name,
"Category": category,
"Class": f"Overall ({category})",
"Precision": overall_precision,
"Recall": overall_recall,
"F1 Score": overall_f1,
"Accuracy": overall_accuracy,
"Balanced Acc.": overall_balanced_acc,
"Support": len(y_true)
})
df_res = pd.DataFrame(results)
return df_res.loc[df_res['Support'] > 0].reset_index(drop=True)
def get_metrics_df(self):
"""Return the computed metrics DataFrame."""
return self.metrics_df
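# Usage sketch for a single model (comments only; the CSV names are hypothetical placeholders):
#
#   df_labels = pd.read_csv("labels.csv")
#   df_preds = pd.read_csv("model_a_predictions.csv")
#   evaluator = ModelEvaluator(df_labels, df_preds, model_name="Model A")
#   print(evaluator.get_metrics_df().head())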
class ModelComparison:
def __init__(self, evaluators):
"""
Compare multiple models based on their evaluation results.
:param evaluators: List of ModelEvaluator instances.
"""
self.evaluators = evaluators
self.combined_df = self.aggregate_metrics()
def aggregate_metrics(self):
"""Merge evaluation metrics from multiple models into a single DataFrame."""
dfs = [evaluator.get_metrics_df() for evaluator in self.evaluators]
return pd.concat(dfs, ignore_index=True)
def plot_category_comparison(self, metric="F1 Score"):
"""Compare models at the category level using a grouped bar chart with consistent styling."""
df = self.combined_df[self.combined_df['Class'].str.contains("Overall")]
plt.figure(figsize=(12, 6))
colors = sns.color_palette("Set2", len(df["Model"].unique())) # Consistent palette
ax = sns.barplot(
data=df, x="Category", y=metric, hue="Model", palette=colors, edgecolor="black", alpha=0.85
)
plt.title(f"{metric} Comparison Across Categories", fontsize=14, fontweight="bold")
plt.ylim(0, 1)
plt.xticks(rotation=45, fontsize=12)
plt.yticks(fontsize=12)
plt.xlabel("Category", fontsize=12)
plt.ylabel(metric, fontsize=12)
plt.legend(title="Model", fontsize=11, loc="upper left")
plt.grid(axis="y", linestyle="--", alpha=0.6)
plt.tight_layout()
plt.show()
def plot_per_class_comparison(self, category, metric="F1 Score"):
"""Compare models for a specific category across individual classes with a standardized design."""
df = self.combined_df[(self.combined_df["Category"] == category) & (~self.combined_df["Class"].str.contains("Overall"))]
plt.figure(figsize=(12, 6))
colors = sns.color_palette("Set2", len(df["Model"].unique())) # Consistent palette
ax = sns.barplot(
data=df, x="Class", y=metric, hue="Model", palette=colors, edgecolor="black", alpha=0.85
)
plt.title(f"{metric} for {category} by Model", fontsize=14, fontweight="bold")
plt.ylim(0, 1)
plt.xticks(rotation=45, fontsize=12)
plt.yticks(fontsize=12)
plt.xlabel("Class", fontsize=12)
plt.ylabel(metric, fontsize=12)
plt.legend(title="Model", fontsize=11, loc="upper left")
plt.grid(axis="y", linestyle="--", alpha=0.6)
plt.tight_layout()
plt.show()
def plot_precision_recall_per_class(self, class_name=None):
"""
Creates a grouped bar chart per class, displaying precision & recall side by side for all models.
Ensures a consistent design with plot_per_class_comparison and plot_category_comparison.
:param class_name: (str) If provided, only this class will be plotted. If None, all classes will be plotted.
"""
sns.set_style("whitegrid")
# Determine which classes to plot
if class_name:
unique_classes = [class_name]
else:
unique_classes = self.combined_df["Class"].unique()
models = self.combined_df["Model"].unique()
num_models = len(models)
        bar_width = 0.35  # Standardized width for better readability
colors = sns.color_palette("Set2", num_models) # Consistent color palette
for class_name in unique_classes:
df_class = self.combined_df[self.combined_df["Class"] == class_name]
if df_class.empty:
print(f"No data available for class: {class_name}")
continue
plt.figure(figsize=(12, 6))
metrics = ["Precision", "Recall"]
x_indices = np.arange(len(metrics)) # X positions for metrics
for i, model in enumerate(models):
df_model = df_class[df_class["Model"] == model]
if df_model.empty:
continue
precision = df_model["Precision"].values[0]
recall = df_model["Recall"].values[0]
# Plot bars for Precision and Recall with consistent style
plt.bar(
x_indices + (i * bar_width), # No spacing, perfectly aligned
[precision, recall],
width=bar_width,
label=model,
color=colors[i],
alpha=0.85,
edgecolor="black" # Matches the other plots
)
plt.xlabel("Metric", fontsize=12)
plt.ylabel("Score", fontsize=12)
plt.title(f"Precision & Recall for Class: {class_name}", fontsize=14, fontweight="bold")
# Adjust x-tick positions to align properly
plt.xticks(x_indices + ((bar_width * (num_models - 1)) / 2), metrics, fontsize=12)
plt.ylim(0, 1)
plt.legend(title="Model", fontsize=11, loc="upper left")
plt.grid(axis="y", linestyle="--", alpha=0.6)
plt.tight_layout()
plt.show()
def plot_recall_trends(self, selected_models=None):
"""
Plot recall trends per class across different models, sorted by recall values in descending order.
:param selected_models: List of model names to include in the plot. If None, all models in the dataset will be used.
"""
sns.set_style("whitegrid")
# If no specific models are provided, use all available models in the dataset
if selected_models is None:
selected_models = self.combined_df["Model"].unique().tolist()
# Filter dataset to include only selected models
df_filtered = self.combined_df[self.combined_df["Model"].isin(selected_models)]
df_filtered_no_overall = df_filtered[~df_filtered["Class"].str.contains("Overall")]
# Sort by recall values in descending order
df_sorted = df_filtered_no_overall.sort_values(by="Recall", ascending=False)
plt.figure(figsize=(12, 6))
unique_classes = df_sorted["Class"].unique()
# Define colors for models
colors = dict(zip(selected_models, sns.color_palette("Set2", len(selected_models))))
# Connect corresponding classes across models with thin lines (drawn first)
for class_name in unique_classes:
class_data = df_sorted[df_sorted["Class"] == class_name]
if len(class_data) > 1:
plt.plot(
class_data["Class"], class_data["Recall"],
linestyle="-", alpha=0.5, color="gray", linewidth=1.5, zorder=1
)
        # Plot scatter points after the lines so they render on top
for model in selected_models:
model_data = df_sorted[df_sorted["Model"] == model]
plt.scatter(
model_data["Class"], model_data["Recall"],
label=model, color=colors[model], edgecolor="black", s=120, alpha=1.0, zorder=2
)
plt.xlabel("Class", fontsize=12)
plt.ylabel("Recall", fontsize=12)
plt.xticks(rotation=45, ha="right", fontsize=12)
plt.yticks(fontsize=12)
plt.title("Recall per Class for Selected Models (Sorted by Recall)", fontsize=14, fontweight="bold")
# Move legend to the right
plt.legend(title="Model", fontsize=11, loc="upper right", bbox_to_anchor=(1.15, 1))
plt.grid(axis="y", linestyle="--", alpha=0.6)
plt.tight_layout()
plt.show()
    def plot_metric(self, metric_name, figsize=(10, None), bar_height=0.8, palette="Set2", bar_spacing=0):
        """
        Create a hierarchical bar chart of a metric with category headers, sorted by
        category-average value in descending order. Note: bar_spacing is currently
        unused; model bars within a group are drawn flush against each other.
        """
colors = sns.color_palette(palette, len(self.evaluators))
models = list(self.combined_df["Model"].unique())
df = self.combined_df.copy()
df = df.drop_duplicates(subset=['Category', 'Class', 'Model', metric_name])
# Calculate average support per class
avg_support = df.groupby(['Category', 'Class'])['Support'].mean().round().astype(int)
# Function to safely retrieve metric values
def safe_get_value(model, category, class_name):
mask = (
(df['Model'] == model) &
(df['Category'] == category) &
(df['Class'] == class_name)
)
values = df.loc[mask, metric_name]
return values.iloc[0] if not values.empty else np.nan
# Calculate category averages, excluding 'Global', and sort descending
df_no_global = df[df['Category'] != 'Global']
cat_avgs = df_no_global.groupby('Category', observed=False)[metric_name].mean()
cat_avgs = cat_avgs.sort_values(ascending=False)
categories_ordered = list(cat_avgs.index)
if 'Global' in df['Category'].unique():
categories_ordered.append('Global')
plot_data = []
yticks = []
ylabels = []
y_pos = 0
category_positions = {}
# Process each category and its classes
for category in categories_ordered:
if category == 'Global':
continue
category_data = df[df['Category'] == category]
overall_class_name = f"Overall ({category})"
mask_overall = category_data['Class'] == overall_class_name
category_data_overall = category_data[mask_overall]
category_data_regular = category_data[~mask_overall]
if not category_data_regular.empty:
class_means = category_data_regular.groupby('Class')[metric_name].mean()
class_means = class_means.sort_values(ascending=False)
sorted_regular_classes = list(class_means.index)
else:
sorted_regular_classes = []
# Add category header
category_start = y_pos
yticks.append(y_pos)
ylabels.append(category.upper())
y_pos += 1
# Add regular classes
for class_name in sorted_regular_classes:
values = {model: safe_get_value(model, category, class_name) for model in models}
if any(not np.isnan(v) for v in values.values()):
plot_data.append({
'category': category,
'label': class_name,
'y_pos': y_pos,
'values': values,
'is_category': False
})
support = avg_support.get((category, class_name), 0)
yticks.append(y_pos)
ylabels.append(f" {class_name} (n={support:,})")
y_pos += 1
# Add overall class if exists
if not category_data_overall.empty:
values = {model: safe_get_value(model, category, overall_class_name) for model in models}
if any(not np.isnan(v) for v in values.values()):
plot_data.append({
'category': category,
'label': overall_class_name,
'y_pos': y_pos,
'values': values,
'is_category': False
})
support = avg_support.get((category, overall_class_name), 0)
yticks.append(y_pos)
ylabels.append(f" {overall_class_name} (n={support:,})")
y_pos += 1
category_positions[category] = {
'start': category_start,
'end': y_pos - 1
}
y_pos += 0.5 # Spacing between categories
# Calculate dynamic figure height based on number of items
total_items = len(plot_data) + len(categories_ordered)
dynamic_height = max(6, total_items * 0.4)
if figsize[1] is None:
figsize = (figsize[0], dynamic_height)
# Plot the bars
bar_width = bar_height / len(models) # No extra spacing
fig, ax = plt.subplots(figsize=figsize)
for category in categories_ordered:
if category == 'Global':
continue
cat_start = category_positions[category]['start'] - 0.4
cat_end = category_positions[category]['end'] + 0.4
ax.axhspan(cat_start, cat_end, color='lightgray', alpha=0.2, zorder=0)
for i, (model, color) in enumerate(zip(models, colors)):
positions = []
values = []
for item in plot_data:
if not item.get('is_category', False):
                    positions.append(item['y_pos'] + (i - (len(models) - 1) / 2) * bar_width)
values.append(item['values'].get(model, np.nan))
ax.barh(
positions, values, height=bar_width,
label=model, color=color, alpha=0.85, edgecolor="black"
)
# Main title
ax.set_title(f'{metric_name} Comparison Across Models', fontsize=16, fontweight='bold', pad=20)
# Adjust axis labels and formatting
ax.set_yticks(yticks)
ax.set_yticklabels(ylabels, fontsize=10)
ax.set_xlabel(metric_name, fontsize=12)
ax.grid(True, axis='x', linestyle="--", alpha=0.7)
# Invert y-axis to align properly
ax.invert_yaxis()
plt.legend(title="Model", bbox_to_anchor=(1.05, 1), loc='upper left')
# Adjust layout with tighter margins
plt.subplots_adjust(left=0.25, right=0.8, top=0.95, bottom=0.1)
plt.tight_layout()
return fig
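    # Example call (hypothetical instance name and output path):
    #   fig = comparison.plot_metric("F1 Score")
    #   fig.savefig("f1_by_category.png", bbox_inches="tight")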
def plot_precision_recall_for_category(self, category, palette="Set2"):
"""
Creates a modernized Precision-Recall scatter plot for each class within a given category.
"""
        import math  # only import not already available at module level
# Set modern style
plt.rcParams['font.size'] = 12
# Filter data for the selected category
df = self.combined_df[self.combined_df["Category"] == category].copy()
if df.empty:
print(f"No data available for category: {category}")
return None
# Remove overall category-level rows
class_data = df[~df["Class"].str.contains("Overall")]
# Get unique models and classes
models = df["Model"].unique()
colors = dict(zip(models, sns.color_palette(palette, len(models))))
classes = sorted(class_data["Class"].unique())
# Determine grid size
cols = 2
rows = math.ceil(len(classes) / cols)
# Create figure with adjusted size
fig, axes = plt.subplots(rows, cols, figsize=(16, rows * 6))
# Set global title with better spacing
fig.suptitle(f'Precision-Recall Analysis for {category}',
fontsize=20, fontweight='bold', y=1.02)
# Iterate over classes and create scatter plots
for i, class_name in enumerate(classes):
row, col = divmod(i, cols)
ax = axes[row, col] if rows > 1 else axes[col] # Ensure indexing works for 1-row cases
# Create scatter plot
class_subset = class_data[class_data["Class"] == class_name]
sns.scatterplot(
data=class_subset,
x="Precision",
y="Recall",
hue="Model",
palette=colors,
ax=ax,
s=200,
alpha=0.85,
edgecolor="black"
)
# Add labels with lines for each point
            for _, point in class_subset.iterrows():
                ax.annotate(
                    point["Model"],
                    (point["Precision"], point["Recall"]),
xytext=(8, 8), textcoords='offset points', # Adjusted to reduce overlap
bbox=dict(facecolor='white', alpha=0.7),
arrowprops=dict(
arrowstyle='->',
connectionstyle='arc3,rad=0.2',
color='black'
)
)
ax.set_title(f"Class: {class_name}", fontsize=16, fontweight="bold", pad=20)
ax.set_xlim(-0.05, 1.05)
ax.set_ylim(-0.05, 1.05)
ax.grid(True, linestyle="--", alpha=0.5)
ax.set_aspect("equal", adjustable="box")
# Remove legend as we now have direct labels
            legend = ax.get_legend()
            if legend is not None:  # guard in case seaborn did not attach a legend
                legend.remove()
# Add labels
ax.set_xlabel("Precision", fontsize=14)
ax.set_ylabel("Recall", fontsize=14)
# Remove empty subplots if classes < grid size
for j in range(i + 1, rows * cols):
fig.delaxes(axes.flatten()[j])
# Adjust layout with better spacing
fig.subplots_adjust(top=0.92, bottom=0.08, left=0.08, right=0.92, hspace=0.35, wspace=0.3)
return fig
def plot_normalized_radar_chart(self, metric_name="F1 Score", exclude_categories=None, figsize=(12, 10), palette="Set2"):
"""
Create a normalized radar chart comparing performance across different categories.
Each vertex is normalized independently based on its maximum value.
"""
        from matplotlib.patches import Circle  # only import not already available at module level
sns.set_style("whitegrid")
# Copy data and filter exclusions
df = self.combined_df.copy()
if exclude_categories:
df = df[~df["Category"].isin(exclude_categories)]
# Get unique categories and models
categories = sorted(df["Category"].unique())
models = sorted(df["Model"].unique())
# Define colors for models
colors = dict(zip(models, sns.color_palette(palette, len(models))))
# Create figure
fig = plt.figure(figsize=figsize)
ax = plt.subplot(111, polar=True)
        # Add subtle background circles (uses the polar axes' private transData._b
        # transform to place circles in data coordinates)
for radius in np.linspace(0, 1, 5):
circle = Circle((0, 0), radius, transform=ax.transData._b,
fill=True, color='gray', alpha=0.03)
ax.add_artist(circle)
# Number of categories and angles
N = len(categories)
angles = np.linspace(0, 2 * np.pi, N, endpoint=False).tolist()
angles += angles[:1] # Close the circle
# Get max values for each category
df_overall = df[df["Class"].str.contains("Overall")]
max_values = df_overall.groupby("Category")[metric_name].max().to_dict()
# Store normalized values for all models
normalized_values = {}
# Normalize values for each model
for model in models:
values = []
for cat in categories:
val = df_overall[(df_overall["Model"] == model) &
(df_overall["Category"] == cat)][metric_name].values
val = val[0] if len(val) > 0 else 0
norm_val = val / max_values[cat] if max_values[cat] > 0 else 0
values.append(norm_val)
normalized_values[model] = values + [values[0]]
# Plot each model with improved styling
for model, values in normalized_values.items():
color = colors[model]
# Add filled area with gradient
ax.fill(angles, values, color=color, alpha=0.15,
edgecolor=color, linewidth=0.5)
# Add main line
ax.plot(angles, values,
linewidth=2.5, linestyle='solid',
label=model, color=color, alpha=0.85,
zorder=5)
# Adjust axis settings
ax.set_theta_offset(np.pi / 2)
ax.set_theta_direction(-1)
ax.set_yticklabels([])
# Draw category labels
ax.set_thetagrids(np.degrees(angles[:-1]), categories,
fontsize=12, fontweight="bold")
# Add scale labels with improved positioning
for i, (category, angle) in enumerate(zip(categories, angles[:-1])):
max_val = max_values[category]
scales = np.linspace(0, max_val, 5)
for j, scale in enumerate(scales):
radius = j/4
# Skip zero as we'll add it separately in the center
if radius > 0:
ha = 'center'
va = 'center'
ax.text(angle, radius, f'{scale:.2f}',
ha=ha, va=va,
color='gray', fontsize=9, fontweight='bold')
# Add centered zero
ax.text(0, 0, '0.00',
ha='center', va='center',
color='gray', fontsize=9, fontweight='bold')
# Customize grid with softer lines
ax.grid(True, color='gray', alpha=0.3, linewidth=0.5)
ax.yaxis.grid(True, color='gray', alpha=0.3, linewidth=0.5)
ax.set_rticks(np.linspace(0, 1, 5))
# Add a subtle background color to the entire plot
ax.set_facecolor('#f8f9fa')
# Set title and legend
plt.title(f'Model Performance Radar Chart - {metric_name}',
pad=20, fontsize=14, fontweight="bold")
plt.legend(title="Model", fontsize=11,
loc="upper right", bbox_to_anchor=(1.3, 1))
# Adjust aspect ratio
ax.set_aspect('equal')
plt.tight_layout()
return fig
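    # Example call (hypothetical instance name; 'main-event' is one of the default categories):
    #   fig = comparison.plot_normalized_radar_chart("F1 Score", exclude_categories=["main-event"])
    #   fig.savefig("radar_f1.png", bbox_inches="tight")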
    def transform_to_leaderboard(self):
        """Pivot the category-level ('Overall') metrics into a ranked leaderboard table."""
        df = self.combined_df.copy()
        # Keep only the category-level ('Overall') rows, which carry Accuracy in
        # addition to macro Precision/Recall/F1; per-class rows are dropped.
        df = df[df['Class'].str.contains('Overall', na=False)]
# Pivot the table so that each category gets its own set of columns
pivoted_df = df.pivot_table(
index='Model',
columns='Category',
values=['Precision', 'Recall', 'F1 Score', 'Accuracy'],
aggfunc='mean' # Take mean in case of multiple entries
)
# Flatten the multi-level columns
pivoted_df.columns = ['_'.join(col).strip() for col in pivoted_df.columns.values]
# Calculate the average F1 score across all categories for ranking
pivoted_df['Average F1 Score'] = pivoted_df.filter(like='F1 Score').mean(axis=1)
# Move 'Average F1 Score' to be the first column after 'Model'
pivoted_df = pivoted_df.reset_index()
cols = ['Model', 'Average F1 Score'] + [col for col in pivoted_df.columns if col not in ['Model', 'Average F1 Score']]
pivoted_df = pivoted_df[cols]
# Rank models based on their average F1 Score
pivoted_df = pivoted_df.sort_values(by='Average F1 Score', ascending=False).reset_index(drop=True)
pivoted_df.insert(0, 'Rank', range(1, len(pivoted_df) + 1))
return pivoted_df
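# End-to-end usage sketch (guarded so it only runs when executed directly). The CSV
# file names below are hypothetical placeholders for the real label/prediction exports.
if __name__ == "__main__":
    df_labels = pd.read_csv("labels.csv")
    evaluators = [
        ModelEvaluator(df_labels, pd.read_csv("model_a_predictions.csv"), "Model A"),
        ModelEvaluator(df_labels, pd.read_csv("model_b_predictions.csv"), "Model B"),
    ]
    comparison = ModelComparison(evaluators)
    comparison.plot_category_comparison(metric="F1 Score")
    comparison.plot_per_class_comparison(category="weather-conditions")
    print(comparison.transform_to_leaderboard().to_string(index=False))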