|
""" |
|
Leaderboard module for Dynamic Highscores system. |
|
|
|
This module implements the unified leaderboard with tag-based filtering |
|
for displaying all evaluated models. |
|
""" |
|
|
|
import os |
|
import json |
|
import pandas as pd |
|
import gradio as gr |
|
import plotly.express as px |
|
import plotly.graph_objects as go |
|
|
|
class Leaderboard:
    """Manages the unified leaderboard with filtering capabilities.

    Wraps a database manager and turns the leaderboard DataFrame it returns
    into a display table and Plotly figures.

    Attributes:
        db_manager: Object queried through ``get_leaderboard_df``.
        model_tags: Tag names offered as filter options; "All" means no
            tag filtering.
        tag_colors: Mapping of tag name to the hex colour used in charts.
    """

    # Column headers of the display table, in the order they are shown.
    _DISPLAY_COLUMNS = ['Model', 'Benchmark', 'Tag', 'Score', 'Completed']

    def __init__(self, db_manager):
        """Initialize the leaderboard manager.

        Args:
            db_manager: Database manager instance. It is called as
                ``get_leaderboard_df(tag=..., benchmark_id=...)``.
        """
        self.db_manager = db_manager
        self.model_tags = [
            "All",
            "Merge",
            "Agent",
            "Reasoning",
            "Coding",
            "General",
            "Specialized",
            "Instruction",
            "Chat",
        ]

        # One fixed colour per tag so every chart uses the same palette.
        self.tag_colors = {
            "Merge": "#FF6B6B",
            "Agent": "#4ECDC4",
            "Reasoning": "#FFD166",
            "Coding": "#6B5B95",
            "General": "#88D8B0",
            "Specialized": "#FF8C42",
            "Instruction": "#5D9CEC",
            "Chat": "#AC92EB",
        }

    def get_leaderboard_data(self, tag=None, benchmark_id=None):
        """Get leaderboard data, optionally filtered by tag or benchmark.

        Args:
            tag: Model tag to filter by. ``None``, an empty value, or
                "All" means no tag filter, in which case the ``tag``
                keyword is not passed to the database manager at all.
            benchmark_id: Benchmark ID to filter by (None for all).

        Returns:
            pd.DataFrame: Whatever ``db_manager.get_leaderboard_df`` returns.
        """
        if tag and tag != "All":
            return self.db_manager.get_leaderboard_df(
                tag=tag, benchmark_id=benchmark_id
            )
        return self.db_manager.get_leaderboard_df(benchmark_id=benchmark_id)

    def format_leaderboard_for_display(self, df):
        """Format leaderboard data for display.

        Selects the ``model_name``, ``benchmark_name``, ``tag``, ``score``
        and ``completed_at`` columns, renames them to the display headers,
        rounds the score to two decimals and sorts by score, best first.

        Args:
            df: Leaderboard DataFrame.

        Returns:
            pd.DataFrame: Formatted leaderboard for display. An empty input
            yields an empty frame that still carries the display headers.
        """
        if df.empty:
            return pd.DataFrame(columns=list(self._DISPLAY_COLUMNS))

        display_df = df[
            ['model_name', 'benchmark_name', 'tag', 'score', 'completed_at']
        ].copy()
        display_df.columns = list(self._DISPLAY_COLUMNS)
        display_df['Score'] = display_df['Score'].round(2)
        return display_df.sort_values('Score', ascending=False)

    @staticmethod
    def _empty_figure(x_title, y_title):
        """Return a blank "No data available" figure with the given axis titles."""
        fig = go.Figure()
        fig.update_layout(
            title="No data available",
            xaxis_title=x_title,
            yaxis_title=y_title,
        )
        return fig

    @staticmethod
    def _count_models_by_tag(df):
        """Count models per tag as a two-column ``Tag`` / ``Count`` frame.

        When a ``model_name`` column is present, each (model, tag) pair is
        counted once, so a model that appears on several benchmark rows is
        not counted several times. Without that column every row counts.
        """
        rows = df
        if 'model_name' in df.columns:
            rows = df.drop_duplicates(subset=['model_name', 'tag'])
        tag_counts = rows['tag'].value_counts().reset_index()
        tag_counts.columns = ['Tag', 'Count']
        return tag_counts

    def create_performance_chart(self, df, chart_type="bar"):
        """Create a performance chart from leaderboard data.

        Args:
            df: Leaderboard DataFrame with ``model_name``,
                ``benchmark_name``, ``tag`` and ``score`` columns.
            chart_type: "scatter" for a scatter plot; any other value
                produces a grouped bar chart.

        Returns:
            plotly.graph_objects.Figure: Performance chart, or a blank
            "No data available" figure when ``df`` is empty.
        """
        if df.empty:
            return self._empty_figure("Model", "Score")

        plot_df = df[['model_name', 'benchmark_name', 'tag', 'score']].copy()
        plot_df.columns = ['Model', 'Benchmark', 'Tag', 'Score']

        # Arguments common to both chart flavours.
        shared = dict(
            x="Model",
            y="Score",
            color="Tag",
            hover_data=["Model", "Benchmark", "Score"],
            color_discrete_map=self.tag_colors,
        )

        if chart_type == "scatter":
            # NOTE(review): marker size is driven by Score; presumably scores
            # are always non-negative and non-null - confirm, since Plotly may
            # reject other values for marker sizes.
            fig = px.scatter(
                plot_df, symbol="Benchmark", size="Score", **shared
            )
        else:
            fig = px.bar(plot_df, barmode="group", **shared)

        fig.update_layout(
            title="Model Performance Comparison",
            xaxis_title="Model",
            yaxis_title="Score",
            legend_title="Tag",
            font=dict(size=12),
        )
        return fig

    def create_tag_distribution_chart(self, df):
        """Create a chart showing distribution of models by tag.

        Args:
            df: Leaderboard DataFrame with a ``tag`` column (and normally a
                ``model_name`` column, used to count each model once per tag).

        Returns:
            plotly.graph_objects.Figure: Pie chart of model counts per tag,
            or a blank "No data available" figure when ``df`` is empty.
        """
        if df.empty:
            return self._empty_figure("Tag", "Count")

        fig = px.pie(
            self._count_models_by_tag(df),
            names='Tag',
            values='Count',
            title='Model Distribution by Tag',
            color='Tag',
            color_discrete_map=self.tag_colors,
        )
        fig.update_layout(font=dict(size=12))
        return fig

    def create_benchmark_comparison_chart(self, df):
        """Create a chart comparing performance across benchmarks.

        Args:
            df: Leaderboard DataFrame with ``benchmark_name`` and ``score``
                columns.

        Returns:
            plotly.graph_objects.Figure: Bar chart of the mean score per
            benchmark, or a blank "No data available" figure when ``df`` is
            empty.
        """
        if df.empty:
            return self._empty_figure("Benchmark", "Average Score")

        benchmark_avg = (
            df.groupby('benchmark_name')['score'].mean().reset_index()
        )
        benchmark_avg.columns = ['Benchmark', 'Average Score']

        fig = px.bar(
            benchmark_avg,
            x='Benchmark',
            y='Average Score',
            title='Average Performance by Benchmark',
            color='Benchmark',
        )
        fig.update_layout(
            xaxis_title="Benchmark",
            yaxis_title="Average Score",
            font=dict(size=12),
        )
        return fig
|
|
|
|
|
def create_leaderboard_ui(leaderboard, db_manager):
    """Create the leaderboard UI components.

    Builds a Gradio Blocks layout with tag/benchmark filters, a chart-type
    and view-type selector, a results table, a single-chart view and a
    three-chart dashboard view. Data is (re)loaded when the page loads and
    when the refresh button is clicked; changing the view type only toggles
    which of the three views is visible.

    Args:
        leaderboard: Leaderboard instance providing ``model_tags``,
            ``get_leaderboard_data``, ``format_leaderboard_for_display`` and
            the three ``create_*_chart`` methods.
        db_manager: Database manager instance. ``get_benchmarks()`` is
            called on it and each returned item is indexed with ``"id"``
            and ``"name"``.

    Returns:
        gr.Blocks: Gradio Blocks component with leaderboard UI
    """
    # Gradio choice tuples are (displayed label, value passed to callbacks).
    # "all" is the sentinel value meaning "do not filter by benchmark".
    all_benchmarks_choice = ("All Benchmarks", "all")
    table_columns = ['Model', 'Benchmark', 'Tag', 'Score', 'Completed']

    with gr.Blocks() as leaderboard_ui:
        gr.Markdown("# Dynamic Highscores Leaderboard")

        with gr.Row():
            with gr.Column(scale=1):
                tag_filter = gr.Dropdown(
                    choices=leaderboard.model_tags,
                    value="All",
                    label="Filter by Tag",
                )
                benchmark_filter = gr.Dropdown(
                    choices=[all_benchmarks_choice],
                    value="all",
                    label="Filter by Benchmark",
                )
                refresh_button = gr.Button("Refresh Leaderboard")

            with gr.Column(scale=2):
                chart_type = gr.Radio(
                    choices=["bar", "scatter"],
                    value="bar",
                    label="Chart Type",
                )
                view_type = gr.Radio(
                    choices=["Table", "Chart", "Dashboard"],
                    value="Table",
                    label="View Type",
                )

        # Table view (shown by default).
        leaderboard_table = gr.Dataframe(
            headers=list(table_columns),
            label="Leaderboard",
            visible=True,
        )

        # Single-chart view.
        with gr.Row(visible=False) as chart_view:
            performance_chart = gr.Plot(label="Performance Chart")

        # Dashboard view: performance chart plus two summary charts.
        with gr.Row(visible=False) as dashboard_view:
            with gr.Column(scale=2):
                dashboard_performance_chart = gr.Plot(
                    label="Performance Comparison"
                )
            with gr.Column(scale=1):
                with gr.Row():
                    tag_distribution_chart = gr.Plot(
                        label="Model Distribution"
                    )
                with gr.Row():
                    benchmark_comparison_chart = gr.Plot(
                        label="Benchmark Comparison"
                    )

        def refresh_benchmarks():
            """Reload the benchmark dropdown choices from the database."""
            try:
                benchmarks = db_manager.get_benchmarks()
                choices = [all_benchmarks_choice]
                # Label is the benchmark name; the value handed to callbacks
                # is the benchmark id as a string.
                choices.extend((b["name"], str(b["id"])) for b in benchmarks)
                return gr.update(choices=choices)
            except Exception as e:
                # Best effort: keep the UI usable with only the "all" option.
                print(f"Error refreshing benchmarks: {e}")
                return gr.update(choices=[all_benchmarks_choice])

        def update_leaderboard(tag, benchmark_id, chart_type_val, view_type_val):
            """Recompute the table and all charts for the current filters.

            Returns one value per component in ``refresh_outputs``.
            """
            try:
                # Translate the "all" sentinel into "no benchmark filter".
                if benchmark_id == "all":
                    benchmark_id = None

                df = leaderboard.get_leaderboard_data(
                    tag=tag, benchmark_id=benchmark_id
                )
                display_df = leaderboard.format_leaderboard_for_display(df)

                perf_chart = leaderboard.create_performance_chart(
                    df, chart_type=chart_type_val
                )
                tag_chart = leaderboard.create_tag_distribution_chart(df)
                benchmark_chart = leaderboard.create_benchmark_comparison_chart(df)

                return (
                    # Table data and visibility travel in a single update so
                    # the table component appears only once in the outputs.
                    gr.update(
                        value=display_df,
                        visible=view_type_val == "Table",
                    ),
                    perf_chart,
                    perf_chart,
                    tag_chart,
                    benchmark_chart,
                    gr.update(visible=view_type_val == "Chart"),
                    gr.update(visible=view_type_val == "Dashboard"),
                )
            except Exception as e:
                print(f"Error updating leaderboard: {e}")
                empty_df = pd.DataFrame(columns=list(table_columns))
                empty_chart = go.Figure()
                empty_chart.update_layout(title="Error loading data")

                # On failure fall back to showing the (empty) table view.
                return (
                    gr.update(value=empty_df, visible=True),
                    empty_chart,
                    empty_chart,
                    empty_chart,
                    empty_chart,
                    gr.update(visible=False),
                    gr.update(visible=False),
                )

        # Components written by update_leaderboard, in return-value order.
        refresh_outputs = [
            leaderboard_table,
            performance_chart,
            dashboard_performance_chart,
            tag_distribution_chart,
            benchmark_comparison_chart,
            chart_view,
            dashboard_view,
        ]

        refresh_button.click(
            fn=update_leaderboard,
            inputs=[tag_filter, benchmark_filter, chart_type, view_type],
            outputs=refresh_outputs,
        )

        # Switching the view only toggles visibility; data is not reloaded.
        view_type.change(
            fn=lambda view_t: (
                gr.update(visible=view_t == "Table"),
                gr.update(visible=view_t == "Chart"),
                gr.update(visible=view_t == "Dashboard"),
            ),
            inputs=[view_type],
            outputs=[leaderboard_table, chart_view, dashboard_view],
        )

        # Populate the benchmark choices and the initial data on page load.
        leaderboard_ui.load(
            fn=refresh_benchmarks,
            inputs=[],
            outputs=[benchmark_filter],
        )
        leaderboard_ui.load(
            # Same values as the components' initial state above.
            fn=lambda: update_leaderboard("All", "all", "bar", "Table"),
            inputs=[],
            outputs=refresh_outputs,
        )

    return leaderboard_ui