""" Leaderboard module for Dynamic Highscores system. This module implements the unified leaderboard with tag-based filtering for displaying all evaluated models. """ import os import json import pandas as pd import gradio as gr import plotly.express as px import plotly.graph_objects as go class Leaderboard: """Manages the unified leaderboard with filtering capabilities.""" def __init__(self, db_manager): """Initialize the leaderboard manager. Args: db_manager: Database manager instance """ self.db_manager = db_manager self.model_tags = ["All", "Merge", "Agent", "Reasoning", "Coding", "General", "Specialized", "Instruction", "Chat"] # Define color scheme for tags self.tag_colors = { "Merge": "#FF6B6B", "Agent": "#4ECDC4", "Reasoning": "#FFD166", "Coding": "#6B5B95", "General": "#88D8B0", "Specialized": "#FF8C42", "Instruction": "#5D9CEC", "Chat": "#AC92EB" } def get_leaderboard_data(self, tag=None, benchmark_id=None): """Get leaderboard data, optionally filtered by tag or benchmark. Args: tag: Model tag to filter by (None for all) benchmark_id: Benchmark ID to filter by (None for all) Returns: pd.DataFrame: Leaderboard data """ # Get evaluation results from database if tag and tag != "All": df = self.db_manager.get_leaderboard_df(tag=tag, benchmark_id=benchmark_id) else: df = self.db_manager.get_leaderboard_df(benchmark_id=benchmark_id) return df def format_leaderboard_for_display(self, df): """Format leaderboard data for display. Args: df: Leaderboard DataFrame Returns: pd.DataFrame: Formatted leaderboard for display """ if df.empty: return pd.DataFrame(columns=['Model', 'Benchmark', 'Tag', 'Score', 'Completed']) # Select and rename columns for display display_df = df[['model_name', 'benchmark_name', 'tag', 'score', 'completed_at']].copy() display_df.columns = ['Model', 'Benchmark', 'Tag', 'Score', 'Completed'] # Round score to 2 decimal places display_df['Score'] = display_df['Score'].round(2) # Sort by score (descending) display_df = display_df.sort_values('Score', ascending=False) return display_df def create_performance_chart(self, df, chart_type="bar"): """Create a performance chart from leaderboard data. Args: df: Leaderboard DataFrame chart_type: Type of chart to create ("bar" or "scatter") Returns: plotly.graph_objects.Figure: Performance chart """ if df.empty: # Return empty figure fig = go.Figure() fig.update_layout( title="No data available", xaxis_title="Model", yaxis_title="Score" ) return fig # Prepare data for visualization plot_df = df[['model_name', 'benchmark_name', 'tag', 'score']].copy() plot_df.columns = ['Model', 'Benchmark', 'Tag', 'Score'] # Create chart based on type if chart_type == "scatter": fig = px.scatter( plot_df, x="Model", y="Score", color="Tag", symbol="Benchmark", size="Score", hover_data=["Model", "Benchmark", "Score"], color_discrete_map=self.tag_colors ) else: # Default to bar chart fig = px.bar( plot_df, x="Model", y="Score", color="Tag", barmode="group", hover_data=["Model", "Benchmark", "Score"], color_discrete_map=self.tag_colors ) # Customize layout fig.update_layout( title="Model Performance Comparison", xaxis_title="Model", yaxis_title="Score", legend_title="Tag", font=dict(size=12) ) return fig def create_tag_distribution_chart(self, df): """Create a chart showing distribution of models by tag. Args: df: Leaderboard DataFrame Returns: plotly.graph_objects.Figure: Tag distribution chart """ if df.empty: # Return empty figure fig = go.Figure() fig.update_layout( title="No data available", xaxis_title="Tag", yaxis_title="Count" ) return fig # Count models by tag tag_counts = df['tag'].value_counts().reset_index() tag_counts.columns = ['Tag', 'Count'] # Create pie chart fig = px.pie( tag_counts, names='Tag', values='Count', title='Model Distribution by Tag', color='Tag', color_discrete_map=self.tag_colors ) # Customize layout fig.update_layout( font=dict(size=12) ) return fig def create_benchmark_comparison_chart(self, df): """Create a chart comparing performance across benchmarks. Args: df: Leaderboard DataFrame Returns: plotly.graph_objects.Figure: Benchmark comparison chart """ if df.empty: # Return empty figure fig = go.Figure() fig.update_layout( title="No data available", xaxis_title="Benchmark", yaxis_title="Average Score" ) return fig # Calculate average score by benchmark benchmark_avg = df.groupby('benchmark_name')['score'].mean().reset_index() benchmark_avg.columns = ['Benchmark', 'Average Score'] # Create bar chart fig = px.bar( benchmark_avg, x='Benchmark', y='Average Score', title='Average Performance by Benchmark', color='Benchmark' ) # Customize layout fig.update_layout( xaxis_title="Benchmark", yaxis_title="Average Score", font=dict(size=12) ) return fig # Leaderboard UI components def create_leaderboard_ui(leaderboard, db_manager): """Create the leaderboard UI components. Args: leaderboard: Leaderboard instance db_manager: Database manager instance Returns: gr.Blocks: Gradio Blocks component with leaderboard UI """ with gr.Blocks() as leaderboard_ui: gr.Markdown("# Dynamic Highscores Leaderboard") with gr.Row(): with gr.Column(scale=1): tag_filter = gr.Dropdown( choices=leaderboard.model_tags, value="All", label="Filter by Tag" ) benchmark_filter = gr.Dropdown( choices=[("all", "All Benchmarks")], value="all", label="Filter by Benchmark" ) refresh_button = gr.Button("Refresh Leaderboard") with gr.Column(scale=2): chart_type = gr.Radio( choices=["bar", "scatter"], value="bar", label="Chart Type" ) view_type = gr.Radio( choices=["Table", "Chart", "Dashboard"], value="Table", label="View Type" ) # Table view leaderboard_table = gr.Dataframe( headers=["Model", "Benchmark", "Tag", "Score", "Completed"], label="Leaderboard", visible=True ) # Chart view with gr.Row(visible=False) as chart_view: performance_chart = gr.Plot(label="Performance Chart") # Dashboard view with gr.Row(visible=False) as dashboard_view: with gr.Column(scale=2): dashboard_performance_chart = gr.Plot(label="Performance Comparison") with gr.Column(scale=1): with gr.Row(): tag_distribution_chart = gr.Plot(label="Model Distribution") with gr.Row(): benchmark_comparison_chart = gr.Plot(label="Benchmark Comparison") # Event handlers def refresh_benchmarks(): try: benchmarks = db_manager.get_benchmarks() # Format for dropdown choices = [("all", "All Benchmarks")] choices.extend([(str(b["id"]), b["name"]) for b in benchmarks]) return gr.update(choices=choices) except Exception as e: print(f"Error refreshing benchmarks: {e}") return gr.update(choices=[("all", "All Benchmarks")]) def update_leaderboard(tag, benchmark_id, chart_type_val, view_type_val): try: # Get leaderboard data if benchmark_id == "all": benchmark_id = None df = leaderboard.get_leaderboard_data(tag=tag, benchmark_id=benchmark_id) # Format for display display_df = leaderboard.format_leaderboard_for_display(df) # Create charts perf_chart = leaderboard.create_performance_chart(df, chart_type=chart_type_val) tag_chart = leaderboard.create_tag_distribution_chart(df) benchmark_chart = leaderboard.create_benchmark_comparison_chart(df) # Update visibility based on view type table_visible = view_type_val == "Table" chart_visible = view_type_val == "Chart" dashboard_visible = view_type_val == "Dashboard" return ( display_df, perf_chart, perf_chart, # Same chart for both views tag_chart, benchmark_chart, gr.update(visible=table_visible), gr.update(visible=chart_visible), gr.update(visible=dashboard_visible) ) except Exception as e: print(f"Error updating leaderboard: {e}") empty_df = pd.DataFrame(columns=['Model', 'Benchmark', 'Tag', 'Score', 'Completed']) empty_chart = go.Figure() empty_chart.update_layout(title="Error loading data") return ( empty_df, empty_chart, empty_chart, empty_chart, empty_chart, gr.update(visible=True), gr.update(visible=False), gr.update(visible=False) ) # Connect event handlers refresh_button.click( fn=lambda tag, benchmark, chart_t, view_t: update_leaderboard(tag, benchmark, chart_t, view_t), inputs=[tag_filter, benchmark_filter, chart_type, view_type], outputs=[ leaderboard_table, performance_chart, dashboard_performance_chart, tag_distribution_chart, benchmark_comparison_chart, leaderboard_table, chart_view, dashboard_view ] ) view_type.change( fn=lambda view_t: ( gr.update(visible=view_t == "Table"), gr.update(visible=view_t == "Chart"), gr.update(visible=view_t == "Dashboard") ), inputs=[view_type], outputs=[leaderboard_table, chart_view, dashboard_view] ) # Initialize on load leaderboard_ui.load( fn=refresh_benchmarks, inputs=[], outputs=[benchmark_filter] ) leaderboard_ui.load( fn=lambda: update_leaderboard("All", "all", "bar", "Table"), inputs=[], outputs=[ leaderboard_table, performance_chart, dashboard_performance_chart, tag_distribution_chart, benchmark_comparison_chart, leaderboard_table, chart_view, dashboard_view ] ) return leaderboard_ui