Quazim0t0's picture
Upload 16 files
9a46619 verified
raw
history blame
13.3 kB
"""
Leaderboard module for Dynamic Highscores system.
This module implements the unified leaderboard with tag-based filtering
for displaying all evaluated models.
"""
import os
import json
import pandas as pd
import gradio as gr
import plotly.express as px
import plotly.graph_objects as go
class Leaderboard:
    """Manages the unified leaderboard with filtering capabilities.

    Evaluation results are read through ``db_manager.get_leaderboard_df`` and
    turned into a display table and Plotly figures. Every formatting/chart
    method treats ``None`` the same as an empty DataFrame.
    """

    # Columns read from the database frame and the headers they are shown
    # under, in display order.
    _SOURCE_COLUMNS = ['model_name', 'benchmark_name', 'tag', 'score', 'completed_at']
    _DISPLAY_COLUMNS = ['Model', 'Benchmark', 'Tag', 'Score', 'Completed']

    def __init__(self, db_manager):
        """Initialize the leaderboard manager.

        Args:
            db_manager: Database manager instance; must provide
                ``get_leaderboard_df(tag=..., benchmark_id=...)``.
        """
        self.db_manager = db_manager
        # "All" is a pseudo-tag meaning "do not filter by tag".
        self.model_tags = ["All", "Merge", "Agent", "Reasoning", "Coding", "General", "Specialized", "Instruction", "Chat"]
        # Define color scheme for tags
        self.tag_colors = {
            "Merge": "#FF6B6B",
            "Agent": "#4ECDC4",
            "Reasoning": "#FFD166",
            "Coding": "#6B5B95",
            "General": "#88D8B0",
            "Specialized": "#FF8C42",
            "Instruction": "#5D9CEC",
            "Chat": "#AC92EB"
        }

    @staticmethod
    def _has_no_rows(df):
        """Return True when ``df`` is ``None`` or an empty DataFrame."""
        return df is None or df.empty

    @staticmethod
    def _empty_figure(x_title, y_title):
        """Build the placeholder figure shown when there is nothing to plot."""
        fig = go.Figure()
        fig.update_layout(
            title="No data available",
            xaxis_title=x_title,
            yaxis_title=y_title
        )
        return fig

    def get_leaderboard_data(self, tag=None, benchmark_id=None):
        """Get leaderboard data, optionally filtered by tag or benchmark.

        Args:
            tag: Model tag to filter by. A falsy value or ``"All"`` means no
                tag filter is passed to the database manager.
            benchmark_id: Benchmark ID to filter by (None for all).

        Returns:
            Whatever ``db_manager.get_leaderboard_df`` returns (expected to be
            a pd.DataFrame of evaluation results).
        """
        if tag and tag != "All":
            return self.db_manager.get_leaderboard_df(tag=tag, benchmark_id=benchmark_id)
        return self.db_manager.get_leaderboard_df(benchmark_id=benchmark_id)

    def format_leaderboard_for_display(self, df):
        """Format leaderboard data for display.

        Args:
            df: Leaderboard DataFrame with columns ``model_name``,
                ``benchmark_name``, ``tag``, ``score`` and ``completed_at``;
                ``None`` or an empty frame yields an empty table.

        Returns:
            pd.DataFrame: Columns Model, Benchmark, Tag, Score, Completed,
            with Score rounded to 2 decimals and rows sorted by Score
            (descending).
        """
        if self._has_no_rows(df):
            return pd.DataFrame(columns=list(self._DISPLAY_COLUMNS))

        # Select and rename columns for display
        display_df = df[list(self._SOURCE_COLUMNS)].copy()
        display_df.columns = list(self._DISPLAY_COLUMNS)

        # Round score to 2 decimal places
        display_df['Score'] = display_df['Score'].round(2)

        # Sort by score (descending)
        return display_df.sort_values('Score', ascending=False)

    def create_performance_chart(self, df, chart_type="bar"):
        """Create a performance chart from leaderboard data.

        Args:
            df: Leaderboard DataFrame (``None``/empty gives a placeholder).
            chart_type: ``"scatter"`` for a scatter plot; any other value
                falls back to a grouped bar chart.

        Returns:
            plotly.graph_objects.Figure: Performance chart
        """
        if self._has_no_rows(df):
            return self._empty_figure("Model", "Score")

        # Prepare data for visualization
        plot_df = df[['model_name', 'benchmark_name', 'tag', 'score']].copy()
        plot_df.columns = ['Model', 'Benchmark', 'Tag', 'Score']

        # Create chart based on type
        if chart_type == "scatter":
            fig = px.scatter(
                plot_df,
                x="Model",
                y="Score",
                color="Tag",
                symbol="Benchmark",
                size="Score",
                hover_data=["Model", "Benchmark", "Score"],
                color_discrete_map=self.tag_colors
            )
        else:  # Default to bar chart
            fig = px.bar(
                plot_df,
                x="Model",
                y="Score",
                color="Tag",
                barmode="group",
                hover_data=["Model", "Benchmark", "Score"],
                color_discrete_map=self.tag_colors
            )

        # Customize layout
        fig.update_layout(
            title="Model Performance Comparison",
            xaxis_title="Model",
            yaxis_title="Score",
            legend_title="Tag",
            font=dict(size=12)
        )
        return fig

    def create_tag_distribution_chart(self, df):
        """Create a pie chart showing how leaderboard rows split across tags.

        Each row of ``df`` counts once, so a model that appears in several
        rows contributes once per row.

        Args:
            df: Leaderboard DataFrame (``None``/empty gives a placeholder).

        Returns:
            plotly.graph_objects.Figure: Tag distribution chart
        """
        if self._has_no_rows(df):
            return self._empty_figure("Tag", "Count")

        # Count rows by tag
        tag_counts = df['tag'].value_counts().reset_index()
        tag_counts.columns = ['Tag', 'Count']

        # Create pie chart
        fig = px.pie(
            tag_counts,
            names='Tag',
            values='Count',
            title='Model Distribution by Tag',
            color='Tag',
            color_discrete_map=self.tag_colors
        )

        # Customize layout
        fig.update_layout(
            font=dict(size=12)
        )
        return fig

    def create_benchmark_comparison_chart(self, df):
        """Create a bar chart of the mean score per benchmark.

        Args:
            df: Leaderboard DataFrame (``None``/empty gives a placeholder).

        Returns:
            plotly.graph_objects.Figure: Benchmark comparison chart
        """
        if self._has_no_rows(df):
            return self._empty_figure("Benchmark", "Average Score")

        # Calculate average score by benchmark
        benchmark_avg = df.groupby('benchmark_name')['score'].mean().reset_index()
        benchmark_avg.columns = ['Benchmark', 'Average Score']

        # Create bar chart
        fig = px.bar(
            benchmark_avg,
            x='Benchmark',
            y='Average Score',
            title='Average Performance by Benchmark',
            color='Benchmark'
        )

        # Customize layout
        fig.update_layout(
            xaxis_title="Benchmark",
            yaxis_title="Average Score",
            font=dict(size=12)
        )
        return fig
# Leaderboard UI components
def create_leaderboard_ui(leaderboard, db_manager):
    """Create the leaderboard UI components.

    Builds the filter controls, the table / chart / dashboard views and wires
    the refresh, view-switch and on-load events.

    Args:
        leaderboard: Leaderboard instance used to fetch and plot the data.
        db_manager: Database manager instance; ``get_benchmarks()`` must
            return an iterable of mappings with ``"id"`` and ``"name"`` keys.

    Returns:
        gr.Blocks: Gradio Blocks component with leaderboard UI
    """
    # Gradio dropdown tuples are (displayed label, value passed to handlers).
    # "all" is the sentinel value meaning "no benchmark filter".
    all_benchmarks_choice = ("All Benchmarks", "all")

    with gr.Blocks() as leaderboard_ui:
        gr.Markdown("# Dynamic Highscores Leaderboard")

        with gr.Row():
            with gr.Column(scale=1):
                tag_filter = gr.Dropdown(
                    choices=leaderboard.model_tags,
                    value="All",
                    label="Filter by Tag"
                )
                benchmark_filter = gr.Dropdown(
                    choices=[all_benchmarks_choice],
                    value="all",
                    label="Filter by Benchmark"
                )
                refresh_button = gr.Button("Refresh Leaderboard")
            with gr.Column(scale=2):
                chart_type = gr.Radio(
                    choices=["bar", "scatter"],
                    value="bar",
                    label="Chart Type"
                )
                view_type = gr.Radio(
                    choices=["Table", "Chart", "Dashboard"],
                    value="Table",
                    label="View Type"
                )

        # Table view
        leaderboard_table = gr.Dataframe(
            headers=["Model", "Benchmark", "Tag", "Score", "Completed"],
            label="Leaderboard",
            visible=True
        )

        # Chart view
        with gr.Row(visible=False) as chart_view:
            performance_chart = gr.Plot(label="Performance Chart")

        # Dashboard view
        with gr.Row(visible=False) as dashboard_view:
            with gr.Column(scale=2):
                dashboard_performance_chart = gr.Plot(label="Performance Comparison")
            with gr.Column(scale=1):
                with gr.Row():
                    tag_distribution_chart = gr.Plot(label="Model Distribution")
                with gr.Row():
                    benchmark_comparison_chart = gr.Plot(label="Benchmark Comparison")

        # Components written by update_leaderboard, in return order. The table
        # appears once: its data and visibility travel in a single gr.update.
        leaderboard_outputs = [
            leaderboard_table,
            performance_chart,
            dashboard_performance_chart,
            tag_distribution_chart,
            benchmark_comparison_chart,
            chart_view,
            dashboard_view
        ]

        # Event handlers
        def refresh_benchmarks():
            """Reload the benchmark dropdown choices from the database."""
            try:
                benchmarks = db_manager.get_benchmarks()
                # Format for dropdown: show the name, pass the id (as str).
                choices = [all_benchmarks_choice]
                choices.extend((b["name"], str(b["id"])) for b in benchmarks)
                return gr.update(choices=choices)
            except Exception as e:
                # Best effort: keep the UI usable with only the "all" entry.
                print(f"Error refreshing benchmarks: {e}")
                return gr.update(choices=[all_benchmarks_choice])

        def update_leaderboard(tag, benchmark_id, chart_type_val, view_type_val):
            """Recompute the table and all charts for the current filters."""
            try:
                # Get leaderboard data
                if benchmark_id == "all":
                    benchmark_id = None
                df = leaderboard.get_leaderboard_data(tag=tag, benchmark_id=benchmark_id)

                # Format for display
                display_df = leaderboard.format_leaderboard_for_display(df)

                # Create charts
                perf_chart = leaderboard.create_performance_chart(df, chart_type=chart_type_val)
                tag_chart = leaderboard.create_tag_distribution_chart(df)
                benchmark_chart = leaderboard.create_benchmark_comparison_chart(df)

                return (
                    gr.update(value=display_df, visible=view_type_val == "Table"),
                    perf_chart,
                    perf_chart,  # Same chart for both views
                    tag_chart,
                    benchmark_chart,
                    gr.update(visible=view_type_val == "Chart"),
                    gr.update(visible=view_type_val == "Dashboard")
                )
            except Exception as e:
                # Fall back to an empty table view so the page still renders.
                print(f"Error updating leaderboard: {e}")
                empty_df = pd.DataFrame(columns=['Model', 'Benchmark', 'Tag', 'Score', 'Completed'])
                empty_chart = go.Figure()
                empty_chart.update_layout(title="Error loading data")
                return (
                    gr.update(value=empty_df, visible=True),
                    empty_chart,
                    empty_chart,
                    empty_chart,
                    empty_chart,
                    gr.update(visible=False),
                    gr.update(visible=False)
                )

        # Connect event handlers
        refresh_button.click(
            fn=update_leaderboard,
            inputs=[tag_filter, benchmark_filter, chart_type, view_type],
            outputs=leaderboard_outputs
        )

        # Switching views only toggles visibility; data is not refetched.
        view_type.change(
            fn=lambda view_t: (
                gr.update(visible=view_t == "Table"),
                gr.update(visible=view_t == "Chart"),
                gr.update(visible=view_t == "Dashboard")
            ),
            inputs=[view_type],
            outputs=[leaderboard_table, chart_view, dashboard_view]
        )

        # Initialize on load
        leaderboard_ui.load(
            fn=refresh_benchmarks,
            inputs=[],
            outputs=[benchmark_filter]
        )
        leaderboard_ui.load(
            fn=lambda: update_leaderboard("All", "all", "bar", "Table"),
            inputs=[],
            outputs=leaderboard_outputs
        )

    return leaderboard_ui