File size: 13,349 Bytes
9a46619
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
"""
Leaderboard module for Dynamic Highscores system.

This module implements the unified leaderboard with tag-based filtering
for displaying all evaluated models.
"""

import os
import json
import pandas as pd
import gradio as gr
import plotly.express as px
import plotly.graph_objects as go

class Leaderboard:
    """Manages the unified leaderboard with filtering capabilities.

    The class fetches leaderboard rows through the supplied database manager
    and converts them into a display table and Plotly figures. The chart and
    formatting methods read the ``model_name``, ``benchmark_name``, ``tag``,
    ``score`` and ``completed_at`` columns of the DataFrame they receive.
    """

    def __init__(self, db_manager):
        """Initialize the leaderboard manager.

        Args:
            db_manager: Database manager instance; only its
                ``get_leaderboard_df`` method is used by this class.
        """
        self.db_manager = db_manager

        # "All" is the sentinel meaning "no tag filter"; it has no colour entry.
        self.model_tags = ["All", "Merge", "Agent", "Reasoning", "Coding", "General", "Specialized", "Instruction", "Chat"]

        # Define color scheme for tags (shared by every chart coloured by tag)
        self.tag_colors = {
            "Merge": "#FF6B6B",
            "Agent": "#4ECDC4",
            "Reasoning": "#FFD166",
            "Coding": "#6B5B95",
            "General": "#88D8B0",
            "Specialized": "#FF8C42",
            "Instruction": "#5D9CEC",
            "Chat": "#AC92EB"
        }

    @staticmethod
    def _empty_figure(x_title, y_title):
        """Build the placeholder figure shown when there is nothing to plot."""
        fig = go.Figure()
        fig.update_layout(
            title="No data available",
            xaxis_title=x_title,
            yaxis_title=y_title
        )
        return fig

    def get_leaderboard_data(self, tag=None, benchmark_id=None):
        """Get leaderboard data, optionally filtered by tag or benchmark.

        Args:
            tag: Model tag to filter by (None, empty or "All" for all tags)
            benchmark_id: Benchmark ID to filter by (None for all)

        Returns:
            pd.DataFrame: Leaderboard data as returned by the database manager
        """
        # Only forward a tag when it is a real filter; "All" and falsy values
        # mean "do not filter by tag".
        if tag and tag != "All":
            return self.db_manager.get_leaderboard_df(tag=tag, benchmark_id=benchmark_id)
        return self.db_manager.get_leaderboard_df(benchmark_id=benchmark_id)

    def format_leaderboard_for_display(self, df):
        """Format leaderboard data for display.

        Rows are ranked by their unrounded score (highest first) and only then
        rounded to two decimals, so scores that differ beyond the second
        decimal still appear in the correct order.

        Args:
            df: Leaderboard DataFrame

        Returns:
            pd.DataFrame: Formatted leaderboard with the columns
                Model, Benchmark, Tag, Score, Completed
        """
        display_columns = ['Model', 'Benchmark', 'Tag', 'Score', 'Completed']
        if df.empty:
            return pd.DataFrame(columns=display_columns)

        # Select and rename columns for display
        display_df = df[['model_name', 'benchmark_name', 'tag', 'score', 'completed_at']].copy()
        display_df.columns = display_columns

        # Sort on the raw score first: rounding beforehand would turn close
        # scores into ties and could rank a lower score above a higher one.
        # A stable sort keeps genuinely tied rows in their incoming order.
        display_df = display_df.sort_values('Score', ascending=False, kind='mergesort')

        # Round score to 2 decimal places (presentation only)
        display_df['Score'] = display_df['Score'].round(2)

        return display_df

    def create_performance_chart(self, df, chart_type="bar"):
        """Create a performance chart from leaderboard data.

        Args:
            df: Leaderboard DataFrame
            chart_type: Type of chart to create ("scatter" for a scatter plot;
                any other value produces the grouped bar chart)

        Returns:
            plotly.graph_objects.Figure: Performance chart
        """
        if df.empty:
            return self._empty_figure("Model", "Score")

        # Prepare data for visualization
        plot_df = df[['model_name', 'benchmark_name', 'tag', 'score']].copy()
        plot_df.columns = ['Model', 'Benchmark', 'Tag', 'Score']

        # Create chart based on type
        if chart_type == "scatter":
            fig = px.scatter(
                plot_df,
                x="Model",
                y="Score",
                color="Tag",
                symbol="Benchmark",
                size="Score",
                hover_data=["Model", "Benchmark", "Score"],
                color_discrete_map=self.tag_colors
            )
        else:  # Default to bar chart
            fig = px.bar(
                plot_df,
                x="Model",
                y="Score",
                color="Tag",
                barmode="group",
                hover_data=["Model", "Benchmark", "Score"],
                color_discrete_map=self.tag_colors
            )

        # Customize layout
        fig.update_layout(
            title="Model Performance Comparison",
            xaxis_title="Model",
            yaxis_title="Score",
            legend_title="Tag",
            font=dict(size=12)
        )

        return fig

    def create_tag_distribution_chart(self, df):
        """Create a chart showing distribution of models by tag.

        Each model is counted once per tag, even when the DataFrame holds
        several rows for it (for example one row per benchmark).

        Args:
            df: Leaderboard DataFrame

        Returns:
            plotly.graph_objects.Figure: Tag distribution chart
        """
        if df.empty:
            return self._empty_figure("Tag", "Count")

        # Count models (not rows) by tag: collapse repeated (model, tag) pairs
        # so multi-benchmark models do not inflate their tag's share. Without
        # a model column there is nothing to de-duplicate on, so count rows.
        if 'model_name' in df.columns:
            tagged_models = df.drop_duplicates(subset=['model_name', 'tag'])
        else:
            tagged_models = df
        tag_counts = tagged_models['tag'].value_counts().reset_index()
        tag_counts.columns = ['Tag', 'Count']

        # Create pie chart
        fig = px.pie(
            tag_counts,
            names='Tag',
            values='Count',
            title='Model Distribution by Tag',
            color='Tag',
            color_discrete_map=self.tag_colors
        )

        # Customize layout
        fig.update_layout(
            font=dict(size=12)
        )

        return fig

    def create_benchmark_comparison_chart(self, df):
        """Create a chart comparing performance across benchmarks.

        Args:
            df: Leaderboard DataFrame

        Returns:
            plotly.graph_objects.Figure: Bar chart of the mean score per benchmark
        """
        if df.empty:
            return self._empty_figure("Benchmark", "Average Score")

        # Calculate average score by benchmark
        benchmark_avg = df.groupby('benchmark_name')['score'].mean().reset_index()
        benchmark_avg.columns = ['Benchmark', 'Average Score']

        # Create bar chart
        fig = px.bar(
            benchmark_avg,
            x='Benchmark',
            y='Average Score',
            title='Average Performance by Benchmark',
            color='Benchmark'
        )

        # Customize layout
        fig.update_layout(
            xaxis_title="Benchmark",
            yaxis_title="Average Score",
            font=dict(size=12)
        )

        return fig

# Leaderboard UI components
def create_leaderboard_ui(leaderboard, db_manager):
    """Create the leaderboard UI components.

    Builds a Gradio Blocks layout with tag/benchmark filters, a chart-type
    and view-type selector, and three alternative views (table, single
    chart, dashboard). The refresh button and the page load event repopulate
    every view; changing the view type only toggles which view is visible.

    Args:
        leaderboard: Leaderboard instance
        db_manager: Database manager instance; its ``get_benchmarks`` method
            is used to populate the benchmark dropdown

    Returns:
        gr.Blocks: Gradio Blocks component with leaderboard UI
    """
    # Gradio dropdown tuples are (displayed label, value passed to handlers).
    # The value "all" is the sentinel that update_leaderboard maps to "no
    # benchmark filter".
    all_benchmarks_choice = ("All Benchmarks", "all")

    with gr.Blocks() as leaderboard_ui:
        gr.Markdown("# Dynamic Highscores Leaderboard")

        with gr.Row():
            with gr.Column(scale=1):
                tag_filter = gr.Dropdown(
                    choices=leaderboard.model_tags,
                    value="All",
                    label="Filter by Tag"
                )

                benchmark_filter = gr.Dropdown(
                    choices=[all_benchmarks_choice],
                    value="all",
                    label="Filter by Benchmark"
                )

                refresh_button = gr.Button("Refresh Leaderboard")

            with gr.Column(scale=2):
                chart_type = gr.Radio(
                    choices=["bar", "scatter"],
                    value="bar",
                    label="Chart Type"
                )

                view_type = gr.Radio(
                    choices=["Table", "Chart", "Dashboard"],
                    value="Table",
                    label="View Type"
                )

        # Table view
        leaderboard_table = gr.Dataframe(
            headers=["Model", "Benchmark", "Tag", "Score", "Completed"],
            label="Leaderboard",
            visible=True
        )

        # Chart view
        with gr.Row(visible=False) as chart_view:
            performance_chart = gr.Plot(label="Performance Chart")

        # Dashboard view
        with gr.Row(visible=False) as dashboard_view:
            with gr.Column(scale=2):
                dashboard_performance_chart = gr.Plot(label="Performance Comparison")

            with gr.Column(scale=1):
                with gr.Row():
                    tag_distribution_chart = gr.Plot(label="Model Distribution")

                with gr.Row():
                    benchmark_comparison_chart = gr.Plot(label="Benchmark Comparison")

        # Every full refresh writes to the same components, in this order.
        # The table appears once: its data and visibility travel together in
        # a single update instead of listing the component twice.
        leaderboard_outputs = [
            leaderboard_table,
            performance_chart,
            dashboard_performance_chart,
            tag_distribution_chart,
            benchmark_comparison_chart,
            chart_view,
            dashboard_view
        ]

        # Event handlers
        def refresh_benchmarks():
            """Reload the benchmark dropdown choices from the database."""
            try:
                benchmarks = db_manager.get_benchmarks()

                # Format for dropdown: show the benchmark name, pass its id
                choices = [all_benchmarks_choice]
                choices.extend((b["name"], str(b["id"])) for b in benchmarks)

                return gr.update(choices=choices)
            except Exception as e:
                # Best effort: keep the UI usable with only the "all" entry
                print(f"Error refreshing benchmarks: {e}")
                return gr.update(choices=[all_benchmarks_choice])

        def update_leaderboard(tag, benchmark_id, chart_type_val, view_type_val):
            """Recompute the table and all charts for the current filters.

            Returns one value per entry of ``leaderboard_outputs``.
            """
            try:
                # Get leaderboard data
                if benchmark_id == "all":
                    benchmark_id = None

                df = leaderboard.get_leaderboard_data(tag=tag, benchmark_id=benchmark_id)

                # Format for display
                display_df = leaderboard.format_leaderboard_for_display(df)

                # Create charts
                perf_chart = leaderboard.create_performance_chart(df, chart_type=chart_type_val)
                tag_chart = leaderboard.create_tag_distribution_chart(df)
                benchmark_chart = leaderboard.create_benchmark_comparison_chart(df)

                return (
                    gr.update(value=display_df, visible=view_type_val == "Table"),
                    perf_chart,
                    perf_chart,  # Same chart for both views
                    tag_chart,
                    benchmark_chart,
                    gr.update(visible=view_type_val == "Chart"),
                    gr.update(visible=view_type_val == "Dashboard")
                )
            except Exception as e:
                # UI boundary: report the failure and fall back to an empty
                # table view rather than breaking the page.
                print(f"Error updating leaderboard: {e}")
                empty_df = pd.DataFrame(columns=['Model', 'Benchmark', 'Tag', 'Score', 'Completed'])
                empty_chart = go.Figure()
                empty_chart.update_layout(title="Error loading data")

                return (
                    gr.update(value=empty_df, visible=True),
                    empty_chart,
                    empty_chart,
                    empty_chart,
                    empty_chart,
                    gr.update(visible=False),
                    gr.update(visible=False)
                )

        def switch_view(view_t):
            """Show only the view matching the selected view type."""
            return (
                gr.update(visible=view_t == "Table"),
                gr.update(visible=view_t == "Chart"),
                gr.update(visible=view_t == "Dashboard")
            )

        # Connect event handlers
        refresh_button.click(
            fn=update_leaderboard,
            inputs=[tag_filter, benchmark_filter, chart_type, view_type],
            outputs=leaderboard_outputs
        )

        view_type.change(
            fn=switch_view,
            inputs=[view_type],
            outputs=[leaderboard_table, chart_view, dashboard_view]
        )

        # Initialize on load
        leaderboard_ui.load(
            fn=refresh_benchmarks,
            inputs=[],
            outputs=[benchmark_filter]
        )

        # The initial render uses the same defaults the components start with
        leaderboard_ui.load(
            fn=lambda: update_leaderboard("All", "all", "bar", "Table"),
            inputs=[],
            outputs=leaderboard_outputs
        )

    return leaderboard_ui