Quazim0t0 committed on
Commit
31c30aa
·
verified ·
1 Parent(s): 7c42b46

Delete leaderboard.py

Browse files
Files changed (1) hide show
  1. leaderboard.py +0 -381
leaderboard.py DELETED
@@ -1,381 +0,0 @@
1
- """
2
- Leaderboard module for Dynamic Highscores system.
3
-
4
- This module implements the unified leaderboard with tag-based filtering
5
- for displaying all evaluated models.
6
- """
7
-
8
- import os
9
- import json
10
- import pandas as pd
11
- import gradio as gr
12
- import plotly.express as px
13
- import plotly.graph_objects as go
14
-
15
class Leaderboard:
    """Manages the unified leaderboard with filtering capabilities.

    Reads evaluation results from a database manager as a pandas DataFrame
    and turns them into a display table and Plotly figures.
    """

    # Headers of the display table, in the order produced by
    # format_leaderboard_for_display.
    _DISPLAY_COLUMNS = ['Model', 'Benchmark', 'Tag', 'Score', 'Completed']

    def __init__(self, db_manager):
        """Initialize the leaderboard manager.

        Args:
            db_manager: Database manager instance
        """
        self.db_manager = db_manager
        self.model_tags = ["All", "Merge", "Agent", "Reasoning", "Coding", "General", "Specialized", "Instruction", "Chat"]

        # Define color scheme for tags
        self.tag_colors = {
            "Merge": "#FF6B6B",
            "Agent": "#4ECDC4",
            "Reasoning": "#FFD166",
            "Coding": "#6B5B95",
            "General": "#88D8B0",
            "Specialized": "#FF8C42",
            "Instruction": "#5D9CEC",
            "Chat": "#AC92EB"
        }

    @staticmethod
    def _empty_figure(xaxis_title, yaxis_title):
        """Build the placeholder figure shown when there is nothing to plot."""
        fig = go.Figure()
        fig.update_layout(
            title="No data available",
            xaxis_title=xaxis_title,
            yaxis_title=yaxis_title
        )
        return fig

    def get_leaderboard_data(self, tag=None, benchmark_id=None):
        """Get leaderboard data, optionally filtered by tag or benchmark.

        Args:
            tag: Model tag to filter by (None or "All" for all)
            benchmark_id: Benchmark ID to filter by (None for all)

        Returns:
            pd.DataFrame: Leaderboard data
        """
        # Get evaluation results from database
        if tag and tag != "All":
            df = self.db_manager.get_leaderboard_df(tag=tag)
        else:
            df = self.db_manager.get_leaderboard_df()

        # Filter by benchmark if specified. Only None / "" mean "no filter";
        # a plain truthiness test would also skip a legitimate id of 0.
        if benchmark_id not in (None, "") and not df.empty:
            df = df[df['benchmark_id'] == benchmark_id]

        return df

    def format_leaderboard_for_display(self, df):
        """Format leaderboard data for display.

        Args:
            df: Leaderboard DataFrame

        Returns:
            pd.DataFrame: Formatted leaderboard for display, sorted by score
            in descending order. When ``df`` is empty the result is an empty
            frame that still carries the display headers.
        """
        if df.empty:
            # Keep the headers so the table does not lose its columns.
            return pd.DataFrame(columns=self._DISPLAY_COLUMNS)

        # Select and rename columns for display
        display_df = df[['model_name', 'benchmark_name', 'tag', 'score', 'completed_at']].copy()
        display_df.columns = self._DISPLAY_COLUMNS

        # Round score to 2 decimal places
        display_df['Score'] = display_df['Score'].round(2)

        # Sort by score (descending)
        return display_df.sort_values('Score', ascending=False)

    def create_performance_chart(self, df, chart_type="bar"):
        """Create a performance chart from leaderboard data.

        Args:
            df: Leaderboard DataFrame
            chart_type: Type of chart to create ("bar" or "scatter");
                any other value falls back to a bar chart

        Returns:
            plotly.graph_objects.Figure: Performance chart
        """
        if df.empty:
            return self._empty_figure("Model", "Score")

        # Prepare data for visualization
        plot_df = df[['model_name', 'benchmark_name', 'tag', 'score']].copy()
        plot_df.columns = ['Model', 'Benchmark', 'Tag', 'Score']

        # Create chart based on type
        if chart_type == "scatter":
            # Score doubles as the marker size, and Plotly rejects NaN sizes,
            # so rows without a score cannot be drawn as markers.
            plot_df = plot_df.dropna(subset=['Score'])
            if plot_df.empty:
                return self._empty_figure("Model", "Score")

            fig = px.scatter(
                plot_df,
                x="Model",
                y="Score",
                color="Tag",
                symbol="Benchmark",
                size="Score",
                hover_data=["Model", "Benchmark", "Score"],
                color_discrete_map=self.tag_colors
            )
        else:  # Default to bar chart
            fig = px.bar(
                plot_df,
                x="Model",
                y="Score",
                color="Tag",
                barmode="group",
                hover_data=["Model", "Benchmark", "Score"],
                color_discrete_map=self.tag_colors
            )

        # Customize layout
        fig.update_layout(
            title="Model Performance Comparison",
            xaxis_title="Model",
            yaxis_title="Score",
            legend_title="Tag",
            font=dict(size=12)
        )

        return fig

    def create_tag_distribution_chart(self, df):
        """Create a chart showing distribution of models by tag.

        Args:
            df: Leaderboard DataFrame

        Returns:
            plotly.graph_objects.Figure: Tag distribution chart
        """
        if df.empty:
            return self._empty_figure("Tag", "Count")

        # Count each model once per tag. The frame appears to hold one row per
        # evaluation, so counting raw rows would weight a model by the number
        # of benchmarks it was run on.
        # NOTE(review): assumes model_name identifies a model - confirm.
        models = df
        if 'model_name' in df.columns:
            models = df.drop_duplicates(subset=['model_name', 'tag'])

        tag_counts = models['tag'].value_counts().reset_index()
        tag_counts.columns = ['Tag', 'Count']

        # Create pie chart
        fig = px.pie(
            tag_counts,
            names='Tag',
            values='Count',
            title='Model Distribution by Tag',
            color='Tag',
            color_discrete_map=self.tag_colors
        )

        # Customize layout
        fig.update_layout(
            font=dict(size=12)
        )

        return fig

    def create_benchmark_comparison_chart(self, df):
        """Create a chart comparing performance across benchmarks.

        Args:
            df: Leaderboard DataFrame

        Returns:
            plotly.graph_objects.Figure: Benchmark comparison chart
        """
        if df.empty:
            return self._empty_figure("Benchmark", "Average Score")

        # Calculate average score by benchmark
        benchmark_avg = df.groupby('benchmark_name')['score'].mean().reset_index()
        benchmark_avg.columns = ['Benchmark', 'Average Score']

        # Create bar chart
        fig = px.bar(
            benchmark_avg,
            x='Benchmark',
            y='Average Score',
            title='Average Performance by Benchmark',
            color='Benchmark'
        )

        # Customize layout
        fig.update_layout(
            xaxis_title="Benchmark",
            yaxis_title="Average Score",
            font=dict(size=12)
        )

        return fig
223
-
224
- # Leaderboard UI components
225
def create_leaderboard_ui(leaderboard, db_manager):
    """Create the leaderboard UI components.

    Builds the filter controls, the table / chart / dashboard views and the
    event wiring that fills them from ``leaderboard``.

    Args:
        leaderboard: Leaderboard instance
        db_manager: Database manager instance

    Returns:
        gr.Blocks: Gradio Blocks component with leaderboard UI
    """
    # Dropdown value that stands for "do not filter by benchmark".
    all_benchmarks = "all"

    with gr.Blocks() as leaderboard_ui:
        gr.Markdown("# Dynamic Highscores Leaderboard")

        with gr.Row():
            with gr.Column(scale=1):
                tag_filter = gr.Dropdown(
                    choices=leaderboard.model_tags,
                    value="All",
                    label="Filter by Tag"
                )

                # Gradio tuple choices are (displayed name, value): the label
                # comes first, the value handed to callbacks second.
                benchmark_filter = gr.Dropdown(
                    choices=[("All Benchmarks", all_benchmarks)],
                    value=all_benchmarks,
                    label="Filter by Benchmark"
                )

                refresh_button = gr.Button("Refresh Leaderboard")

            with gr.Column(scale=2):
                chart_type = gr.Radio(
                    choices=["bar", "scatter"],
                    value="bar",
                    label="Chart Type"
                )

                view_type = gr.Radio(
                    choices=["Table", "Chart", "Dashboard"],
                    value="Table",
                    label="View Type"
                )

        # Table view
        leaderboard_table = gr.Dataframe(
            headers=["Model", "Benchmark", "Tag", "Score", "Completed"],
            label="Leaderboard",
            visible=True
        )

        # Chart view
        with gr.Row(visible=False) as chart_view:
            performance_chart = gr.Plot(label="Performance Chart")

        # Dashboard view
        with gr.Row(visible=False) as dashboard_view:
            with gr.Column(scale=2):
                dashboard_performance_chart = gr.Plot(label="Performance Comparison")

            with gr.Column(scale=1):
                with gr.Row():
                    tag_distribution_chart = gr.Plot(label="Model Distribution")

                with gr.Row():
                    benchmark_comparison_chart = gr.Plot(label="Benchmark Comparison")

        # Event handlers
        def refresh_benchmarks():
            """Reload the benchmark dropdown choices from the database."""
            benchmarks = db_manager.get_benchmarks()

            # Format for dropdown: show the benchmark name, pass its id
            # (as a string) to the callbacks.
            choices = [("All Benchmarks", all_benchmarks)]
            choices.extend((b["name"], str(b["id"])) for b in benchmarks)

            return gr.update(choices=choices)

        def update_leaderboard(tag, benchmark_id, chart_type_val, view_type_val):
            """Recompute the table and all charts for the current filters."""
            # A cleared dropdown yields None; treat it like "all" instead of
            # failing in int().
            if benchmark_id in (None, "", all_benchmarks):
                benchmark_id = None
            else:
                benchmark_id = int(benchmark_id)

            # Get leaderboard data
            df = leaderboard.get_leaderboard_data(tag=tag, benchmark_id=benchmark_id)

            # Format for display
            display_df = leaderboard.format_leaderboard_for_display(df)

            # Create charts
            perf_chart = leaderboard.create_performance_chart(df, chart_type=chart_type_val)
            tag_chart = leaderboard.create_tag_distribution_chart(df)
            benchmark_chart = leaderboard.create_benchmark_comparison_chart(df)

            return (
                # Data and visibility of the table travel in one update so the
                # component appears only once in the outputs list.
                gr.update(value=display_df, visible=view_type_val == "Table"),
                perf_chart,
                perf_chart,  # Same chart for both views
                tag_chart,
                benchmark_chart,
                gr.update(visible=view_type_val == "Chart"),
                gr.update(visible=view_type_val == "Dashboard")
            )

        # Components written by update_leaderboard, in return order.
        leaderboard_outputs = [
            leaderboard_table,
            performance_chart,
            dashboard_performance_chart,
            tag_distribution_chart,
            benchmark_comparison_chart,
            chart_view,
            dashboard_view
        ]

        # Connect event handlers
        refresh_button.click(
            fn=update_leaderboard,
            inputs=[tag_filter, benchmark_filter, chart_type, view_type],
            outputs=leaderboard_outputs
        )

        # Switching the view only toggles visibility; data is left untouched.
        view_type.change(
            fn=lambda view_t: (
                gr.update(visible=view_t == "Table"),
                gr.update(visible=view_t == "Chart"),
                gr.update(visible=view_t == "Dashboard")
            ),
            inputs=[view_type],
            outputs=[leaderboard_table, chart_view, dashboard_view]
        )

        # Initialize on load
        leaderboard_ui.load(
            fn=refresh_benchmarks,
            inputs=[],
            outputs=[benchmark_filter]
        )

        leaderboard_ui.load(
            fn=lambda: update_leaderboard("All", all_benchmarks, "bar", "Table"),
            inputs=[],
            outputs=leaderboard_outputs
        )

    return leaderboard_ui