Quazim0t0 committed on
Commit d1078a3 · verified · 1 Parent(s): e14f110

Upload 3 files

Files changed (3):
  1. database_schema.py +393 -0
  2. evaluation_queue.py +947 -0
  3. leaderboard.py +381 -0
database_schema.py ADDED
@@ -0,0 +1,393 @@
1
+ """
2
+ Database schema for Dynamic Highscores system.
3
+
4
+ This module defines the SQLite database schema for the Dynamic Highscores system,
5
+ which integrates benchmark selection, model evaluation, and leaderboard functionality.
6
+ """
7
+
8
+ import sqlite3
9
+ import os
10
+ import json
11
+ from datetime import datetime, timedelta
12
+ import pandas as pd
13
+
14
+ class DynamicHighscoresDB:
15
+ """Database manager for the Dynamic Highscores system."""
16
+
17
+ def __init__(self, db_path="dynamic_highscores.db"):
18
+ """Initialize the database connection and create tables if they don't exist."""
19
+ self.db_path = db_path
20
+ self.conn = None
21
+ self.cursor = None
22
+ self.connect()
23
+ self.create_tables()
24
+
25
+ def connect(self):
26
+ """Connect to the SQLite database."""
27
+ self.conn = sqlite3.connect(self.db_path)
28
+ self.conn.row_factory = sqlite3.Row
29
+ self.cursor = self.conn.cursor()
30
+
31
+ def close(self):
32
+ """Close the database connection."""
33
+ if self.conn:
34
+ self.conn.close()
35
+
36
+ def create_tables(self):
37
+ """Create all necessary tables if they don't exist."""
38
+ # Users table - stores user information
39
+ self.cursor.execute('''
40
+ CREATE TABLE IF NOT EXISTS users (
41
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
42
+ username TEXT UNIQUE NOT NULL,
43
+ hf_user_id TEXT UNIQUE NOT NULL,
44
+ is_admin BOOLEAN DEFAULT 0,
45
+ last_submission_date TEXT,
46
+ created_at TEXT DEFAULT CURRENT_TIMESTAMP
47
+ )
48
+ ''')
49
+
50
+ # Benchmarks table - stores information about available benchmarks
51
+ self.cursor.execute('''
52
+ CREATE TABLE IF NOT EXISTS benchmarks (
53
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
54
+ name TEXT NOT NULL,
55
+ dataset_id TEXT NOT NULL,
56
+ description TEXT,
57
+ metrics TEXT, -- JSON string of metrics
58
+ created_at TEXT DEFAULT CURRENT_TIMESTAMP
59
+ )
60
+ ''')
61
+
62
+ # Models table - stores information about submitted models
63
+ self.cursor.execute('''
64
+ CREATE TABLE IF NOT EXISTS models (
65
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
66
+ name TEXT NOT NULL,
67
+ hf_model_id TEXT NOT NULL,
68
+ user_id INTEGER NOT NULL,
69
+ tag TEXT NOT NULL, -- One of: Merge, Agent, Reasoning, Coding, etc.
70
+ parameters TEXT, -- Number of parameters (can be NULL)
71
+ description TEXT,
72
+ created_at TEXT DEFAULT CURRENT_TIMESTAMP,
73
+ FOREIGN KEY (user_id) REFERENCES users (id),
74
+ UNIQUE (hf_model_id, user_id)
75
+ )
76
+ ''')
77
+
78
+ # Evaluations table - stores evaluation results
79
+ self.cursor.execute('''
80
+ CREATE TABLE IF NOT EXISTS evaluations (
81
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
82
+ model_id INTEGER NOT NULL,
83
+ benchmark_id INTEGER NOT NULL,
84
+ status TEXT NOT NULL, -- pending, running, completed, failed
85
+ results TEXT, -- JSON string of results
86
+ score REAL, -- Overall score (can be NULL)
87
+ submitted_at TEXT DEFAULT CURRENT_TIMESTAMP,
88
+ started_at TEXT, -- set by the queue worker when the evaluation begins running
+ completed_at TEXT,
89
+ FOREIGN KEY (model_id) REFERENCES models (id),
90
+ FOREIGN KEY (benchmark_id) REFERENCES benchmarks (id)
91
+ )
92
+ ''')
93
+
94
+ # Queue table - stores evaluation queue
95
+ self.cursor.execute('''
96
+ CREATE TABLE IF NOT EXISTS queue (
97
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
98
+ evaluation_id INTEGER NOT NULL,
99
+ priority INTEGER DEFAULT 0, -- Higher number = higher priority
100
+ added_at TEXT DEFAULT CURRENT_TIMESTAMP,
101
+ FOREIGN KEY (evaluation_id) REFERENCES evaluations (id)
102
+ )
103
+ ''')
104
+
105
+ self.conn.commit()
106
+
107
+ # User management methods
108
+ def add_user(self, username, hf_user_id, is_admin=False):
109
+ """Add a new user to the database."""
110
+ try:
111
+ self.cursor.execute(
112
+ "INSERT INTO users (username, hf_user_id, is_admin) VALUES (?, ?, ?)",
113
+ (username, hf_user_id, is_admin)
114
+ )
115
+ self.conn.commit()
116
+ return self.cursor.lastrowid
117
+ except sqlite3.IntegrityError:
118
+ # User already exists
119
+ self.cursor.execute(
120
+ "SELECT id FROM users WHERE hf_user_id = ?",
121
+ (hf_user_id,)
122
+ )
123
+ return self.cursor.fetchone()[0]
124
+
125
+ def get_user(self, hf_user_id):
126
+ """Get user information by HuggingFace user ID."""
127
+ self.cursor.execute(
128
+ "SELECT * FROM users WHERE hf_user_id = ?",
129
+ (hf_user_id,)
130
+ )
131
+ row = self.cursor.fetchone()
+ return dict(row) if row else None
132
+
133
+ def can_submit_today(self, user_id):
134
+ """Check if a user can submit a benchmark evaluation today."""
135
+ self.cursor.execute(
136
+ "SELECT is_admin, last_submission_date FROM users WHERE id = ?",
137
+ (user_id,)
138
+ )
139
+ result = self.cursor.fetchone()
140
+
141
+ if not result:
142
+ return False
143
+
144
+ user_data = dict(result)
145
+
146
+ # Admin can always submit
147
+ if user_data['is_admin']:
148
+ return True
149
+
150
+ # If no previous submission, user can submit
151
+ if not user_data['last_submission_date']:
152
+ return True
153
+
154
+ # Check if last submission was before today
155
+ last_date = datetime.fromisoformat(user_data['last_submission_date'])
156
+ today = datetime.now().replace(hour=0, minute=0, second=0, microsecond=0)
157
+
158
+ return last_date < today
159
+
160
+ def update_submission_date(self, user_id):
161
+ """Update the last submission date for a user."""
162
+ current_time = datetime.now().isoformat()
163
+ self.cursor.execute(
164
+ "UPDATE users SET last_submission_date = ? WHERE id = ?",
165
+ (current_time, user_id)
166
+ )
167
+ self.conn.commit()
168
+
169
+ # Benchmark management methods
170
+ def add_benchmark(self, name, dataset_id, description="", metrics=None):
171
+ """Add a new benchmark to the database."""
172
+ if metrics is None:
173
+ metrics = {}
174
+
175
+ metrics_json = json.dumps(metrics)
176
+
177
+ try:
178
+ self.cursor.execute(
179
+ "INSERT INTO benchmarks (name, dataset_id, description, metrics) VALUES (?, ?, ?, ?)",
180
+ (name, dataset_id, description, metrics_json)
181
+ )
182
+ self.conn.commit()
183
+ return self.cursor.lastrowid
184
+ except sqlite3.IntegrityError:
185
+ # Benchmark already exists with this dataset_id
186
+ self.cursor.execute(
187
+ "SELECT id FROM benchmarks WHERE dataset_id = ?",
188
+ (dataset_id,)
189
+ )
190
+ return self.cursor.fetchone()[0]
191
+
192
+ def get_benchmarks(self):
193
+ """Get all available benchmarks."""
194
+ self.cursor.execute("SELECT * FROM benchmarks")
195
+ benchmarks = [dict(row) for row in self.cursor.fetchall()]
196
+
197
+ # Parse metrics JSON
198
+ for benchmark in benchmarks:
199
+ benchmark['metrics'] = json.loads(benchmark['metrics'])
200
+
201
+ return benchmarks
202
+
203
+ def get_benchmark(self, benchmark_id):
204
+ """Get benchmark information by ID."""
205
+ self.cursor.execute(
206
+ "SELECT * FROM benchmarks WHERE id = ?",
207
+ (benchmark_id,)
208
+ )
209
+ row = self.cursor.fetchone()
+ benchmark = dict(row) if row else None
210
+
211
+ if benchmark:
212
+ benchmark['metrics'] = json.loads(benchmark['metrics'])
213
+
214
+ return benchmark
215
+
216
+ # Model management methods
217
+ def add_model(self, name, hf_model_id, user_id, tag, parameters=None, description=""):
218
+ """Add a new model to the database."""
219
+ try:
220
+ self.cursor.execute(
221
+ "INSERT INTO models (name, hf_model_id, user_id, tag, parameters, description) VALUES (?, ?, ?, ?, ?, ?)",
222
+ (name, hf_model_id, user_id, tag, parameters, description)
223
+ )
224
+ self.conn.commit()
225
+ return self.cursor.lastrowid
226
+ except sqlite3.IntegrityError:
227
+ # Model already exists for this user
228
+ self.cursor.execute(
229
+ "SELECT id FROM models WHERE hf_model_id = ? AND user_id = ?",
230
+ (hf_model_id, user_id)
231
+ )
232
+ return self.cursor.fetchone()[0]
233
+
234
+ def get_models(self, tag=None):
235
+ """Get all models, optionally filtered by tag."""
236
+ if tag:
237
+ self.cursor.execute(
238
+ "SELECT * FROM models WHERE tag = ?",
239
+ (tag,)
240
+ )
241
+ else:
242
+ self.cursor.execute("SELECT * FROM models")
243
+
244
+ return [dict(row) for row in self.cursor.fetchall()]
245
+
246
+ def get_model(self, model_id):
247
+ """Get model information by ID."""
248
+ self.cursor.execute(
249
+ "SELECT * FROM models WHERE id = ?",
250
+ (model_id,)
251
+ )
252
+ row = self.cursor.fetchone()
+ return dict(row) if row else None
253
+
254
+ # Evaluation management methods
255
+ def add_evaluation(self, model_id, benchmark_id, priority=0):
256
+ """Add a new evaluation to the database and queue."""
257
+ # First, add the evaluation
258
+ self.cursor.execute(
259
+ "INSERT INTO evaluations (model_id, benchmark_id, status) VALUES (?, ?, 'pending')",
260
+ (model_id, benchmark_id)
261
+ )
262
+ evaluation_id = self.cursor.lastrowid
263
+
264
+ # Then, add it to the queue
265
+ self.cursor.execute(
266
+ "INSERT INTO queue (evaluation_id, priority) VALUES (?, ?)",
267
+ (evaluation_id, priority)
268
+ )
269
+
270
+ self.conn.commit()
271
+ return evaluation_id
272
+
273
+ def update_evaluation_status(self, evaluation_id, status, results=None, score=None):
274
+ """Update the status of an evaluation."""
275
+ # Build the SET clause and the parameter list in matching order
+ fields = ["status = ?"]
+ params = [status]
+
+ if results is not None:
+ fields.append("results = ?")
+ params.append(json.dumps(results))
+
+ if score is not None:
+ fields.append("score = ?")
+ params.append(score)
+
+ if status in ['completed', 'failed']:
+ fields.append("completed_at = ?")
+ params.append(datetime.now().isoformat())
+
+ sql = "UPDATE evaluations SET " + ", ".join(fields) + " WHERE id = ?"
+ params.append(evaluation_id)
+
+ self.cursor.execute(sql, params)
294
+ self.conn.commit()
295
+
296
+ # If completed or failed, remove from queue
297
+ if status in ['completed', 'failed']:
298
+ self.cursor.execute(
299
+ "DELETE FROM queue WHERE evaluation_id = ?",
300
+ (evaluation_id,)
301
+ )
302
+ self.conn.commit()
303
+
304
+ def get_next_in_queue(self):
305
+ """Get the next evaluation in the queue."""
306
+ self.cursor.execute("""
307
+ SELECT q.id as queue_id, q.evaluation_id, e.model_id, e.benchmark_id, m.hf_model_id, b.dataset_id
308
+ FROM queue q
309
+ JOIN evaluations e ON q.evaluation_id = e.id
310
+ JOIN models m ON e.model_id = m.id
311
+ JOIN benchmarks b ON e.benchmark_id = b.id
312
+ WHERE e.status = 'pending'
313
+ ORDER BY q.priority DESC, q.added_at ASC
314
+ LIMIT 1
315
+ """)
316
+
317
+ result = self.cursor.fetchone()
318
+ return dict(result) if result else None
319
+
320
+ def get_evaluation_results(self, model_id=None, benchmark_id=None, tag=None, status="completed", limit=None):
+ """Get evaluation results, optionally filtered by model, benchmark, tag, or status."""
+ sql = """
+ SELECT e.id, e.model_id, e.benchmark_id, e.status, e.results, e.score,
+ e.submitted_at, e.completed_at, m.name as model_name, m.tag,
+ b.name as benchmark_name
+ FROM evaluations e
+ JOIN models m ON e.model_id = m.id
+ JOIN benchmarks b ON e.benchmark_id = b.id
+ WHERE 1 = 1
+ """
+
+ params = []
+
+ if status:
+ sql += " AND e.status = ?"
+ params.append(status)
+
+ if model_id:
+ sql += " AND e.model_id = ?"
+ params.append(model_id)
+
+ if benchmark_id:
+ sql += " AND e.benchmark_id = ?"
+ params.append(benchmark_id)
+
+ if tag:
+ sql += " AND m.tag = ?"
+ params.append(tag)
+
+ sql += " ORDER BY e.completed_at DESC"
+
+ if limit:
+ sql += " LIMIT ?"
+ params.append(int(limit))
+
+ self.cursor.execute(sql, params)
+ results = [dict(row) for row in self.cursor.fetchall()]
+
+ # Parse results JSON
+ for result in results:
+ if result['results']:
+ result['results'] = json.loads(result['results'])
+
+ return results
357
+
358
+ def get_leaderboard_df(self, tag=None):
359
+ """Get a pandas DataFrame of the leaderboard, optionally filtered by tag."""
360
+ results = self.get_evaluation_results(tag=tag)
361
+
362
+ if not results:
363
+ return pd.DataFrame()
364
+
365
+ # Create a list of dictionaries for the DataFrame
366
+ leaderboard_data = []
367
+
368
+ for result in results:
369
+ entry = {
370
+ 'model_name': result['model_name'],
371
+ 'model_id': result['model_id'],
372
+ 'benchmark_name': result['benchmark_name'],
373
+ 'benchmark_id': result['benchmark_id'],
374
+ 'tag': result['tag'],
375
+ 'score': result['score'],
376
+ 'completed_at': result['completed_at']
377
+ }
378
+
379
+ # Add individual metrics from results
380
+ if result['results'] and isinstance(result['results'], dict):
381
+ for metric, value in result['results'].items():
382
+ if isinstance(value, (int, float)):
383
+ entry[f'metric_{metric}'] = value
384
+
385
+ leaderboard_data.append(entry)
386
+
387
+ return pd.DataFrame(leaderboard_data)
388
+
389
+ # Initialize the database
390
+ def init_db(db_path="dynamic_highscores.db"):
391
+ """Initialize the database and return the database manager."""
392
+ db = DynamicHighscoresDB(db_path)
393
+ return db
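
Usage note (illustrative, not part of the commit): a minimal sketch of how the DynamicHighscoresDB class above might be driven end to end. The user name, benchmark, model, and scores are made-up values; "squad" stands in for any HuggingFace dataset ID.

from database_schema import init_db

# Open (or create) the SQLite database and its tables
db = init_db("example_highscores.db")

# Register a user and a benchmark, then queue a model for evaluation
user_id = db.add_user("example_user", "hf_example_user")
benchmark_id = db.add_benchmark(
    name="Example QA Benchmark",
    dataset_id="squad",  # any HF dataset id; a config can be appended as "dataset:config"
    metrics={"exact_match": "primary"},
)
model_id = db.add_model(
    name="Example Model",
    hf_model_id="gpt2",
    user_id=user_id,
    tag="General",
)
evaluation_id = db.add_evaluation(model_id, benchmark_id, priority=1)

# The worker would normally mark it completed; the leaderboard then picks it up
db.update_evaluation_status(
    evaluation_id, "completed",
    results={"exact_match": 0.62, "f1": 0.71}, score=66.5,
)
print(db.get_leaderboard_df().head())

db.close()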
evaluation_queue.py ADDED
@@ -0,0 +1,947 @@
1
+ """
2
+ Model evaluation queue system for Dynamic Highscores.
3
+
4
+ This module handles the evaluation queue, CPU-only processing,
5
+ and enforces daily submission limits for users.
6
+ """
7
+
8
+ import os
9
+ import json
10
+ import time
11
+ import threading
12
+ import queue
13
+ from datetime import datetime, timedelta
14
+ import gradio as gr
15
+ from huggingface_hub import HfApi, hf_hub_download, snapshot_download
16
+ from datasets import load_dataset
17
+ import torch
18
+ from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
19
+ import sqlite3
20
+
21
+ class EvaluationQueue:
22
+ """Manages the evaluation queue for model benchmarking."""
23
+
24
+ def __init__(self, db_manager, auth_manager):
25
+ """Initialize the evaluation queue manager.
26
+
27
+ Args:
28
+ db_manager: Database manager instance
29
+ auth_manager: Authentication manager instance
30
+ """
31
+ self.db_manager = db_manager
32
+ self.auth_manager = auth_manager
33
+ self.hf_api = HfApi()
34
+ self.queue = queue.Queue()
35
+ self.is_processing = False
36
+ self.worker_thread = None
37
+ self.model_tags = ["Merge", "Agent", "Reasoning", "Coding", "General", "Specialized", "Instruction", "Chat"]
38
+ self.current_evaluation = None
39
+ self.progress = 0
40
+ self.progress_lock = threading.Lock()
41
+ self.db_path = db_manager.db_path # Store the path to create new connections in worker thread
42
+
43
+ def start_worker(self):
44
+ """Start the worker thread for processing the evaluation queue."""
45
+ if self.worker_thread is None or not self.worker_thread.is_alive():
46
+ self.is_processing = True
47
+ self.worker_thread = threading.Thread(target=self._process_queue)
48
+ self.worker_thread.daemon = True
49
+ self.worker_thread.start()
50
+
51
+ def stop_worker(self):
52
+ """Stop the worker thread."""
53
+ self.is_processing = False
54
+ if self.worker_thread and self.worker_thread.is_alive():
55
+ self.worker_thread.join(timeout=1.0)
56
+
57
+ def _process_queue(self):
58
+ """Process the evaluation queue in a separate thread."""
59
+ # Create a new database connection for this thread
60
+ thread_db = sqlite3.connect(self.db_path)
61
+ thread_db.row_factory = sqlite3.Row
62
+
63
+ while self.is_processing:
64
+ try:
65
+ # Get the next evaluation from the database using thread-local connection
66
+ cursor = thread_db.cursor()
67
+ cursor.execute("""
68
+ SELECT e.id as evaluation_id, e.model_id, e.benchmark_id, m.hf_model_id, b.dataset_id
69
+ FROM queue q
70
+ JOIN evaluations e ON q.evaluation_id = e.id
71
+ JOIN models m ON e.model_id = m.id
72
+ JOIN benchmarks b ON e.benchmark_id = b.id
73
+ WHERE e.status = 'pending'
74
+ ORDER BY q.priority DESC, q.added_at ASC
75
+ LIMIT 1
76
+ """)
77
+ row = cursor.fetchone()
78
+
79
+ if row:
80
+ next_eval = dict(row)
81
+
82
+ # Update status to running
83
+ cursor.execute("""
84
+ UPDATE evaluations
85
+ SET status = 'running', started_at = datetime('now')
86
+ WHERE id = ?
87
+ """, (next_eval['evaluation_id'],))
88
+ thread_db.commit()
89
+
90
+ # Set current evaluation and reset progress
91
+ with self.progress_lock:
92
+ self.current_evaluation = next_eval
93
+ self.progress = 0
94
+
95
+ try:
96
+ # Run the evaluation
97
+ results = self._run_evaluation(
98
+ next_eval['hf_model_id'],
99
+ next_eval['dataset_id']
100
+ )
101
+
102
+ # Calculate overall score
103
+ score = self._calculate_overall_score(results)
104
+
105
+ # Update status to completed with results
106
+ cursor.execute("""
107
+ UPDATE evaluations
108
+ SET status = 'completed',
109
+ completed_at = datetime('now'),
110
+ results = ?,
111
+ score = ?
112
+ WHERE id = ?
113
+ """, (json.dumps(results), score, next_eval['evaluation_id']))
114
+ thread_db.commit()
115
+ except Exception as e:
116
+ print(f"Evaluation error: {e}")
117
+ # Update status to failed
118
+ cursor.execute("""
119
+ UPDATE evaluations
120
+ SET status = 'failed', completed_at = datetime('now')
121
+ WHERE id = ?
122
+ """, (next_eval['evaluation_id'],))
123
+ thread_db.commit()
124
+
125
+ # Clear current evaluation
126
+ with self.progress_lock:
127
+ self.current_evaluation = None
128
+ self.progress = 0
129
+ else:
130
+ # No evaluations in queue, sleep for a bit
131
+ time.sleep(5)
132
+ except Exception as e:
133
+ print(f"Queue processing error: {e}")
134
+ time.sleep(5)
135
+
136
+ # Close the thread-local database connection
137
+ thread_db.close()
138
+
139
+ def _run_evaluation(self, model_id, dataset_id):
140
+ """Run an evaluation for a model on a benchmark.
141
+
142
+ Args:
143
+ model_id: HuggingFace model ID
144
+ dataset_id: HuggingFace dataset ID (with optional config)
145
+
146
+ Returns:
147
+ dict: Evaluation results
148
+ """
149
+ # Update progress
150
+ with self.progress_lock:
151
+ self.progress = 5 # Starting evaluation
152
+
153
+ # Parse dataset ID and config
154
+ if ":" in dataset_id:
155
+ dataset_id, config = dataset_id.split(":", 1)
156
+ else:
157
+ config = None
158
+
159
+ # Update progress
160
+ with self.progress_lock:
161
+ self.progress = 10 # Loading dataset
162
+
163
+ # Load the dataset
164
+ if config:
165
+ dataset = load_dataset(dataset_id, config, split="test")
166
+ else:
167
+ dataset = load_dataset(dataset_id, split="test")
168
+
169
+ # Update progress
170
+ with self.progress_lock:
171
+ self.progress = 20 # Loading model
172
+
173
+ # Load the model (CPU only)
174
+ device = "cpu"
175
+ model = AutoModelForCausalLM.from_pretrained(
176
+ model_id,
177
+ device_map=device,
178
+ torch_dtype=torch.float32, # Use float32 for CPU
179
+ low_cpu_mem_usage=True
180
+ )
181
+ tokenizer = AutoTokenizer.from_pretrained(model_id)
182
+
183
+ # Update progress
184
+ with self.progress_lock:
185
+ self.progress = 30 # Determining task type
186
+
187
+ # Determine task type based on dataset features
188
+ task_type = self._determine_task_type(dataset)
189
+
190
+ # Update progress
191
+ with self.progress_lock:
192
+ self.progress = 40 # Starting evaluation
193
+
194
+ # Run appropriate evaluation based on task type
195
+ if task_type == "text-generation":
196
+ results = self._evaluate_text_generation(model, tokenizer, dataset)
197
+ elif task_type == "question-answering":
198
+ results = self._evaluate_question_answering(model, tokenizer, dataset)
199
+ elif task_type == "classification":
200
+ results = self._evaluate_classification(model, tokenizer, dataset)
201
+ elif task_type == "code-generation":
202
+ results = self._evaluate_code_generation(model, tokenizer, dataset)
203
+ else:
204
+ # Default to general evaluation
205
+ results = self._evaluate_general(model, tokenizer, dataset)
206
+
207
+ # Update progress
208
+ with self.progress_lock:
209
+ self.progress = 95 # Cleaning up
210
+
211
+ # Clean up to free memory
212
+ del model
213
+ del tokenizer
214
+ torch.cuda.empty_cache()
215
+
216
+ # Update progress
217
+ with self.progress_lock:
218
+ self.progress = 100 # Completed
219
+
220
+ return results
221
+
222
+ def get_current_progress(self):
223
+ """Get the current evaluation progress.
224
+
225
+ Returns:
226
+ tuple: (current_evaluation, progress_percentage)
227
+ """
228
+ with self.progress_lock:
229
+ return self.current_evaluation, self.progress
230
+
231
+ def _determine_task_type(self, dataset):
232
+ """Determine the task type based on dataset features.
233
+
234
+ Args:
235
+ dataset: HuggingFace dataset
236
+
237
+ Returns:
238
+ str: Task type
239
+ """
240
+ features = dataset.features
241
+
242
+ # Check for common feature patterns
243
+ if "question" in features and "answer" in features:
244
+ return "question-answering"
245
+ elif "code" in features or "solution" in features:
246
+ return "code-generation"
247
+ elif "label" in features or "class" in features:
248
+ return "classification"
249
+ elif "input" in features and "output" in features:
250
+ return "text-generation"
251
+ else:
252
+ return "general"
253
+
254
+ def _evaluate_text_generation(self, model, tokenizer, dataset):
255
+ """Evaluate a model on text generation tasks.
256
+
257
+ Args:
258
+ model: HuggingFace model
259
+ tokenizer: HuggingFace tokenizer
260
+ dataset: HuggingFace dataset
261
+
262
+ Returns:
263
+ dict: Evaluation results
264
+ """
265
+ # Set up generation pipeline
266
+ generator = pipeline(
267
+ "text-generation",
268
+ model=model,
269
+ tokenizer=tokenizer,
270
+ device="cpu"
271
+ )
272
+
273
+ # Sample a subset for evaluation (to keep runtime reasonable)
274
+ if len(dataset) > 100:
275
+ dataset = dataset.select(range(100))
276
+
277
+ # Track metrics
278
+ correct = 0
279
+ total = 0
280
+ generated_texts = []
281
+
282
+ # Process each example
283
+ for i, example in enumerate(dataset):
284
+ # Update progress based on completion percentage
285
+ with self.progress_lock:
286
+ self.progress = 40 + int((i / len(dataset)) * 50)
287
+
288
+ input_text = example.get("input", example.get("prompt", ""))
289
+ expected_output = example.get("output", example.get("target", ""))
290
+
291
+ if not input_text or not expected_output:
292
+ continue
293
+
294
+ # Generate text
295
+ generated = generator(
296
+ input_text,
297
+ max_length=100,
298
+ num_return_sequences=1
299
+ )
300
+
301
+ generated_text = generated[0]["generated_text"]
302
+ generated_texts.append(generated_text)
303
+
304
+ # Simple exact match check
305
+ if expected_output.strip() in generated_text:
306
+ correct += 1
307
+
308
+ total += 1
309
+
310
+ # Calculate metrics
311
+ accuracy = correct / total if total > 0 else 0
312
+
313
+ return {
314
+ "accuracy": accuracy,
315
+ "samples_evaluated": total,
316
+ "generated_samples": generated_texts[:5] # Include a few samples
317
+ }
318
+
319
+ def _evaluate_question_answering(self, model, tokenizer, dataset):
320
+ """Evaluate a model on question answering tasks.
321
+
322
+ Args:
323
+ model: HuggingFace model
324
+ tokenizer: HuggingFace tokenizer
325
+ dataset: HuggingFace dataset
326
+
327
+ Returns:
328
+ dict: Evaluation results
329
+ """
330
+ # Set up QA pipeline
331
+ qa_pipeline = pipeline(
332
+ "question-answering",
333
+ model=model,
334
+ tokenizer=tokenizer,
335
+ device="cpu"
336
+ )
337
+
338
+ # Sample a subset for evaluation
339
+ if len(dataset) > 100:
340
+ dataset = dataset.select(range(100))
341
+
342
+ # Track metrics
343
+ exact_matches = 0
344
+ f1_scores = []
345
+ total = 0
346
+
347
+ # Process each example
348
+ for i, example in enumerate(dataset):
349
+ # Update progress based on completion percentage
350
+ with self.progress_lock:
351
+ self.progress = 40 + int((i / len(dataset)) * 50)
352
+
353
+ question = example.get("question", "")
354
+ context = example.get("context", "")
355
+ answer = example.get("answer", "")
356
+
357
+ if not question or not answer:
358
+ continue
359
+
360
+ # Get model prediction
361
+ if context:
362
+ result = qa_pipeline(question=question, context=context)
363
+ else:
364
+ # If no context provided, use the question as context
365
+ result = qa_pipeline(question=question, context=question)
366
+
367
+ predicted_answer = result["answer"]
368
+
369
+ # Calculate exact match
370
+ if predicted_answer.strip() == answer.strip():
371
+ exact_matches += 1
372
+
373
+ # Calculate F1 score
374
+ f1 = self._calculate_f1(answer, predicted_answer)
375
+ f1_scores.append(f1)
376
+
377
+ total += 1
378
+
379
+ # Calculate metrics
380
+ exact_match_accuracy = exact_matches / total if total > 0 else 0
381
+ avg_f1 = sum(f1_scores) / len(f1_scores) if f1_scores else 0
382
+
383
+ return {
384
+ "exact_match": exact_match_accuracy,
385
+ "f1": avg_f1,
386
+ "samples_evaluated": total
387
+ }
388
+
389
+ def _evaluate_classification(self, model, tokenizer, dataset):
390
+ """Evaluate a model on classification tasks.
391
+
392
+ Args:
393
+ model: HuggingFace model
394
+ tokenizer: HuggingFace tokenizer
395
+ dataset: HuggingFace dataset
396
+
397
+ Returns:
398
+ dict: Evaluation results
399
+ """
400
+ # Set up classification pipeline
401
+ classifier = pipeline(
402
+ "text-classification",
403
+ model=model,
404
+ tokenizer=tokenizer,
405
+ device="cpu"
406
+ )
407
+
408
+ # Sample a subset for evaluation
409
+ if len(dataset) > 100:
410
+ dataset = dataset.select(range(100))
411
+
412
+ # Track metrics
413
+ correct = 0
414
+ total = 0
415
+
416
+ # Process each example
417
+ for i, example in enumerate(dataset):
418
+ # Update progress based on completion percentage
419
+ with self.progress_lock:
420
+ self.progress = 40 + int((i / len(dataset)) * 50)
421
+
422
+ text = example.get("text", example.get("sentence", ""))
423
+ label = str(example.get("label", example.get("class", "")))
424
+
425
+ if not text or not label:
426
+ continue
427
+
428
+ # Get model prediction
429
+ result = classifier(text)
430
+ predicted_label = result[0]["label"]
431
+
432
+ # Check if correct
433
+ if str(predicted_label) == label:
434
+ correct += 1
435
+
436
+ total += 1
437
+
438
+ # Calculate metrics
439
+ accuracy = correct / total if total > 0 else 0
440
+
441
+ return {
442
+ "accuracy": accuracy,
443
+ "samples_evaluated": total
444
+ }
445
+
446
+ def _evaluate_code_generation(self, model, tokenizer, dataset):
447
+ """Evaluate a model on code generation tasks.
448
+
449
+ Args:
450
+ model: HuggingFace model
451
+ tokenizer: HuggingFace tokenizer
452
+ dataset: HuggingFace dataset
453
+
454
+ Returns:
455
+ dict: Evaluation results
456
+ """
457
+ # Set up generation pipeline
458
+ generator = pipeline(
459
+ "text-generation",
460
+ model=model,
461
+ tokenizer=tokenizer,
462
+ device="cpu"
463
+ )
464
+
465
+ # Sample a subset for evaluation
466
+ if len(dataset) > 50: # Smaller sample for code tasks
467
+ dataset = dataset.select(range(50))
468
+
469
+ # Track metrics
470
+ exact_matches = 0
471
+ functional_matches = 0
472
+ total = 0
473
+
474
+ # Process each example
475
+ for i, example in enumerate(dataset):
476
+ # Update progress based on completion percentage
477
+ with self.progress_lock:
478
+ self.progress = 40 + int((i / len(dataset)) * 50)
479
+
480
+ prompt = example.get("prompt", example.get("input", ""))
481
+ solution = example.get("solution", example.get("output", ""))
482
+
483
+ if not prompt or not solution:
484
+ continue
485
+
486
+ # Generate code
487
+ generated = generator(
488
+ prompt,
489
+ max_length=200,
490
+ num_return_sequences=1
491
+ )
492
+
493
+ generated_code = generated[0]["generated_text"]
494
+
495
+ # Extract code from generated text (remove prompt)
496
+ if prompt in generated_code:
497
+ generated_code = generated_code[len(prompt):].strip()
498
+
499
+ # Check exact match
500
+ if generated_code.strip() == solution.strip():
501
+ exact_matches += 1
502
+ functional_matches += 1
503
+ else:
504
+ # We would ideally check functional correctness here
505
+ # but that requires executing code which is complex and potentially unsafe
506
+ # For now, we'll use a simple heuristic
507
+ if len(generated_code) > 0 and any(keyword in generated_code for keyword in ["def ", "function", "return", "class"]):
508
+ functional_matches += 0.5 # Partial credit
509
+
510
+ total += 1
511
+
512
+ # Calculate metrics
513
+ exact_match_rate = exact_matches / total if total > 0 else 0
514
+ functional_correctness = functional_matches / total if total > 0 else 0
515
+
516
+ return {
517
+ "exact_match": exact_match_rate,
518
+ "functional_correctness": functional_correctness,
519
+ "samples_evaluated": total
520
+ }
521
+
522
+ def _evaluate_general(self, model, tokenizer, dataset):
523
+ """General evaluation for any dataset type.
524
+
525
+ Args:
526
+ model: HuggingFace model
527
+ tokenizer: HuggingFace tokenizer
528
+ dataset: HuggingFace dataset
529
+
530
+ Returns:
531
+ dict: Evaluation results
532
+ """
533
+ # Set up generation pipeline
534
+ generator = pipeline(
535
+ "text-generation",
536
+ model=model,
537
+ tokenizer=tokenizer,
538
+ device="cpu"
539
+ )
540
+
541
+ # Sample a subset for evaluation
542
+ if len(dataset) > 50:
543
+ dataset = dataset.select(range(50))
544
+
545
+ # Find input and output fields
546
+ features = dataset.features
547
+ input_field = None
548
+ output_field = None
549
+
550
+ for field in features:
551
+ if field.lower() in ["input", "prompt", "question", "text"]:
552
+ input_field = field
553
+ elif field.lower() in ["output", "target", "answer", "response"]:
554
+ output_field = field
555
+
556
+ if not input_field:
557
+ # Just use the first string field as input
558
+ for field in features:
559
+ if isinstance(features[field], (str, list)):
560
+ input_field = field
561
+ break
562
+
563
+ # Track metrics
564
+ total = 0
565
+ generated_texts = []
566
+
567
+ # Process each example
568
+ for i, example in enumerate(dataset):
569
+ # Update progress based on completion percentage
570
+ with self.progress_lock:
571
+ self.progress = 40 + int((i / len(dataset)) * 50)
572
+
573
+ if input_field and input_field in example:
574
+ input_text = str(example[input_field])
575
+
576
+ # Generate text
577
+ generated = generator(
578
+ input_text,
579
+ max_length=100,
580
+ num_return_sequences=1
581
+ )
582
+
583
+ generated_text = generated[0]["generated_text"]
584
+ generated_texts.append({
585
+ "input": input_text,
586
+ "output": generated_text,
587
+ "expected": str(example[output_field]) if output_field and output_field in example else "N/A"
588
+ })
589
+
590
+ total += 1
591
+
592
+ return {
593
+ "samples_evaluated": total,
594
+ "generated_samples": generated_texts[:5] # Include a few samples
595
+ }
596
+
597
+ def _calculate_f1(self, answer, prediction):
598
+ """Calculate F1 score between answer and prediction.
599
+
600
+ Args:
601
+ answer: Ground truth answer
602
+ prediction: Model prediction
603
+
604
+ Returns:
605
+ float: F1 score
606
+ """
607
+ # Tokenize
608
+ answer_tokens = answer.lower().split()
609
+ prediction_tokens = prediction.lower().split()
610
+
611
+ # Calculate precision and recall
612
+ common_tokens = set(answer_tokens) & set(prediction_tokens)
613
+
614
+ if not common_tokens:
615
+ return 0.0
616
+
617
+ precision = len(common_tokens) / len(prediction_tokens)
618
+ recall = len(common_tokens) / len(answer_tokens)
619
+
620
+ # Calculate F1
621
+ if precision + recall == 0:
622
+ return 0.0
623
+
624
+ f1 = 2 * precision * recall / (precision + recall)
625
+ return f1
626
+
627
+ def _calculate_overall_score(self, results):
628
+ """Calculate an overall score from evaluation results.
629
+
630
+ Args:
631
+ results: Evaluation results dictionary
632
+
633
+ Returns:
634
+ float: Overall score between 0 and 100
635
+ """
636
+ score = 0.0
637
+
638
+ # Check for common metrics and weight them
639
+ if "accuracy" in results:
640
+ score += results["accuracy"] * 100
641
+
642
+ if "exact_match" in results:
643
+ score += results["exact_match"] * 100
644
+
645
+ if "f1" in results:
646
+ score += results["f1"] * 100
647
+
648
+ if "functional_correctness" in results:
649
+ score += results["functional_correctness"] * 100
650
+
651
+ # If multiple metrics were found, average them
652
+ num_metrics = sum(1 for metric in ["accuracy", "exact_match", "f1", "functional_correctness"] if metric in results)
653
+
654
+ if num_metrics > 0:
655
+ score /= num_metrics
656
+ else:
657
+ # Default score if no metrics available
658
+ score = 50.0
659
+
660
+ return score
661
+
662
+ def submit_evaluation(self, model_id, benchmark_id, user_id, priority=0):
663
+ """Submit a model for evaluation on a benchmark.
664
+
665
+ Args:
666
+ model_id: Model ID in the database
667
+ benchmark_id: Benchmark ID in the database
668
+ user_id: User ID submitting the evaluation
669
+ priority: Queue priority (higher = higher priority)
670
+
671
+ Returns:
672
+ tuple: (evaluation ID or None, status message)
673
+ """
674
+ # Check if user can submit today
675
+ if not self.auth_manager.can_submit_benchmark(user_id):
676
+ return None, "Daily submission limit reached. Try again tomorrow."
677
+
678
+ try:
679
+ # Add evaluation to database and queue
680
+ evaluation_id = self.db_manager.add_evaluation(
681
+ model_id=model_id,
682
+ benchmark_id=benchmark_id,
683
+ priority=priority
684
+ )
685
+
686
+ # Update user's last submission date
687
+ self.auth_manager.update_submission_date(user_id)
688
+
689
+ # Make sure worker is running
690
+ self.start_worker()
691
+
692
+ return evaluation_id, "Evaluation submitted successfully."
693
+ except Exception as e:
694
+ print(f"Submit evaluation error: {e}")
695
+ return None, f"Failed to submit evaluation: {str(e)}"
696
+
697
+ def get_queue_status(self):
698
+ """Get the current status of the evaluation queue.
699
+
700
+ Returns:
701
+ dict: Queue status information
702
+ """
703
+ try:
704
+ # Get evaluations from database
705
+ pending_evals = self.db_manager.get_evaluation_results(status="pending")
706
+ running_evals = self.db_manager.get_evaluation_results(status="running")
707
+ completed_evals = self.db_manager.get_evaluation_results(status="completed")
708
+ failed_evals = self.db_manager.get_evaluation_results(status="failed")
709
+
710
+ # Get current evaluation progress
711
+ current_eval, progress = self.get_current_progress()
712
+
713
+ return {
714
+ "pending": len(pending_evals),
715
+ "running": len(running_evals),
716
+ "completed": len(completed_evals),
717
+ "failed": len(failed_evals),
718
+ "is_processing": self.is_processing,
719
+ "current_evaluation": current_eval,
720
+ "progress": progress
721
+ }
722
+ except Exception as e:
723
+ print(f"Queue status error: {e}")
724
+ return {
725
+ "pending": 0,
726
+ "running": 0,
727
+ "completed": 0,
728
+ "failed": 0,
729
+ "is_processing": self.is_processing,
730
+ "current_evaluation": None,
731
+ "progress": 0,
732
+ "error": str(e)
733
+ }
734
+
735
+ # Model submission UI components
736
+ def create_model_submission_ui(evaluation_queue, auth_manager, db_manager):
737
+ """Create the model submission UI components.
738
+
739
+ Args:
740
+ evaluation_queue: Evaluation queue instance
741
+ auth_manager: Authentication manager instance
742
+ db_manager: Database manager instance
743
+
744
+ Returns:
745
+ gr.Blocks: Gradio Blocks component with model submission UI
746
+ """
747
+ with gr.Blocks() as submission_ui:
748
+ with gr.Tab("Submit Model"):
749
+ with gr.Row():
750
+ with gr.Column(scale=2):
751
+ model_id_input = gr.Textbox(
752
+ placeholder="HuggingFace model ID (e.g., 'gpt2', 'facebook/opt-350m')",
753
+ label="Model ID"
754
+ )
755
+
756
+ model_name_input = gr.Textbox(
757
+ placeholder="Display name for your model",
758
+ label="Model Name"
759
+ )
760
+
761
+ model_description_input = gr.Textbox(
762
+ placeholder="Brief description of your model",
763
+ label="Description",
764
+ lines=3
765
+ )
766
+
767
+ model_parameters_input = gr.Number(
768
+ label="Number of Parameters (billions)",
769
+ precision=2
770
+ )
771
+
772
+ with gr.Column(scale=1):
773
+ model_tag_input = gr.Dropdown(
774
+ choices=evaluation_queue.model_tags,
775
+ label="Model Tag",
776
+ info="Select one category that best describes your model"
777
+ )
778
+
779
+ benchmark_dropdown = gr.Dropdown(
780
+ label="Benchmark",
781
+ info="Select a benchmark to evaluate your model on"
782
+ )
783
+
784
+ refresh_benchmarks_button = gr.Button("Refresh Benchmarks")
785
+
786
+ submit_model_button = gr.Button("Submit for Evaluation")
787
+ submission_status = gr.Markdown("")
788
+
789
+ with gr.Tab("Evaluation Queue"):
790
+ refresh_queue_button = gr.Button("Refresh Queue")
791
+
792
+ with gr.Row():
793
+ with gr.Column(scale=1):
794
+ queue_stats = gr.JSON(
795
+ label="Queue Statistics"
796
+ )
797
+
798
+ with gr.Column(scale=2):
799
+ queue_status = gr.Dataframe(
800
+ headers=["ID", "Model", "Benchmark", "Status", "Submitted"],
801
+ label="Recent Evaluations"
802
+ )
803
+
804
+ with gr.Row(visible=True) as progress_container:
805
+ with gr.Column():
806
+ current_eval_info = gr.Markdown("No evaluation currently running")
807
+ # Use a simple text display for progress instead of Progress component
808
+ progress_display = gr.Markdown("Progress: 0%")
809
+
810
+ # Function to update progress display
811
+ def update_progress_display():
812
+ current_eval, progress = evaluation_queue.get_current_progress()
813
+
814
+ if current_eval:
815
+ model_info = db_manager.get_model(current_eval['model_id'])
816
+ benchmark_info = db_manager.get_benchmark(current_eval['benchmark_id'])
817
+
818
+ if model_info and benchmark_info:
819
+ eval_info = f"**Currently Evaluating:** {model_info['name']} on {benchmark_info['name']}"
820
+ progress_text = f"Progress: {progress}%"
821
+ return eval_info, progress_text
822
+
823
+ return "No evaluation currently running", "Progress: 0%"
824
+
825
+ # Event handlers
826
+ def refresh_benchmarks_handler():
827
+ benchmarks = db_manager.get_benchmarks()
828
+
829
+ # Format for dropdown
830
+ choices = [(b["id"], b["name"]) for b in benchmarks]
831
+
832
+ return gr.update(choices=choices)
833
+
834
+ def submit_model_handler(model_id, model_name, model_description, model_parameters, model_tag, benchmark_id, request: gr.Request):
835
+ # Check if user is logged in
836
+ user = auth_manager.check_login(request)
837
+
838
+ if not user:
839
+ return "Please log in to submit a model."
840
+
841
+ if not model_id or not model_name or not model_tag or not benchmark_id:
842
+ return "Please fill in all required fields."
843
+
844
+ try:
845
+ # Add model to database
846
+ model_db_id = db_manager.add_model(
847
+ name=model_name,
848
+ hf_model_id=model_id,
849
+ user_id=user["id"],
850
+ tag=model_tag,
851
+ parameters=str(model_parameters) if model_parameters else None,
852
+ description=model_description
853
+ )
854
+
855
+ if not model_db_id:
856
+ return "Failed to add model to database."
857
+
858
+ # Submit for evaluation
859
+ eval_id, message = evaluation_queue.submit_evaluation(
860
+ model_id=model_db_id,
861
+ benchmark_id=benchmark_id,
862
+ user_id=user["id"]
863
+ )
864
+
865
+ if eval_id:
866
+ return f"Model submitted successfully. Evaluation ID: {eval_id}"
867
+ else:
868
+ return message
869
+ except Exception as e:
870
+ return f"Error submitting model: {str(e)}"
871
+
872
+ def refresh_queue_handler():
873
+ # Get queue statistics
874
+ stats = evaluation_queue.get_queue_status()
875
+
876
+ # Get recent evaluations
877
+ evals = db_manager.get_evaluation_results(status=None, limit=20)
878
+
879
+ # Format for dataframe
880
+ eval_data = []
881
+ for eval in evals:
882
+ eval_data.append([
883
+ eval["id"],
884
+ eval["model_name"],
885
+ eval["benchmark_name"],
886
+ eval["status"],
887
+ eval["submitted_at"]
888
+ ])
889
+
890
+ # Also update progress display
891
+ current_eval, progress = evaluation_queue.get_current_progress()
892
+ if current_eval:
893
+ model_info = db_manager.get_model(current_eval['model_id'])
894
+ benchmark_info = db_manager.get_benchmark(current_eval['benchmark_id'])
895
+
896
+ if model_info and benchmark_info:
897
+ eval_info = f"**Currently Evaluating:** {model_info['name']} on {benchmark_info['name']}"
898
+ progress_text = f"Progress: {progress}%"
899
+ return stats, eval_data, eval_info, progress_text
900
+
901
+ return stats, eval_data, "No evaluation currently running", "Progress: 0%"
902
+
903
+ # Connect event handlers
904
+ refresh_benchmarks_button.click(
905
+ fn=refresh_benchmarks_handler,
906
+ inputs=[],
907
+ outputs=[benchmark_dropdown]
908
+ )
909
+
910
+ submit_model_button.click(
911
+ fn=submit_model_handler,
912
+ inputs=[
913
+ model_id_input,
914
+ model_name_input,
915
+ model_description_input,
916
+ model_parameters_input,
917
+ model_tag_input,
918
+ benchmark_dropdown
919
+ ],
920
+ outputs=[submission_status]
921
+ )
922
+
923
+ refresh_queue_button.click(
924
+ fn=refresh_queue_handler,
925
+ inputs=[],
926
+ outputs=[queue_stats, queue_status, current_eval_info, progress_display]
927
+ )
928
+
929
+ # Initialize on load
930
+ submission_ui.load(
931
+ fn=refresh_benchmarks_handler,
932
+ inputs=[],
933
+ outputs=[benchmark_dropdown]
934
+ )
935
+
936
+ submission_ui.load(
937
+ fn=refresh_queue_handler,
938
+ inputs=[],
939
+ outputs=[queue_stats, queue_status, current_eval_info, progress_display]
940
+ )
941
+
942
+ # Manual refresh button with instructions
943
+ gr.Markdown("""
944
+ **Note:** Click the 'Refresh Queue' button periodically to update the progress display.
945
+ """)
946
+
947
+ return submission_ui
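
Scoring note (illustrative, not part of the commit): the token-level F1 in _calculate_f1 and the averaging in _calculate_overall_score boil down to the arithmetic below; the strings and numbers are invented for the example.

# Standalone copy of the token-overlap F1 used above
def token_f1(answer: str, prediction: str) -> float:
    answer_tokens = answer.lower().split()
    prediction_tokens = prediction.lower().split()
    common = set(answer_tokens) & set(prediction_tokens)
    if not common:
        return 0.0
    precision = len(common) / len(prediction_tokens)
    recall = len(common) / len(answer_tokens)
    return 2 * precision * recall / (precision + recall)

# "paris is the capital" vs. "the capital is paris france":
# common tokens = {paris, is, the, capital}; precision = 4/5, recall = 4/4
# F1 = 2 * 0.8 * 1.0 / 1.8 ≈ 0.889
print(round(token_f1("paris is the capital", "the capital is paris france"), 3))

# The overall score averages whichever of accuracy / exact_match / f1 /
# functional_correctness are present, each scaled to 0-100:
results = {"exact_match": 0.50, "f1": 0.75}
score = (results["exact_match"] * 100 + results["f1"] * 100) / 2
print(score)  # 62.5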
leaderboard.py ADDED
@@ -0,0 +1,381 @@
1
+ """
2
+ Leaderboard module for Dynamic Highscores system.
3
+
4
+ This module implements the unified leaderboard with tag-based filtering
5
+ for displaying all evaluated models.
6
+ """
7
+
8
+ import os
9
+ import json
10
+ import pandas as pd
11
+ import gradio as gr
12
+ import plotly.express as px
13
+ import plotly.graph_objects as go
14
+
15
+ class Leaderboard:
16
+ """Manages the unified leaderboard with filtering capabilities."""
17
+
18
+ def __init__(self, db_manager):
19
+ """Initialize the leaderboard manager.
20
+
21
+ Args:
22
+ db_manager: Database manager instance
23
+ """
24
+ self.db_manager = db_manager
25
+ self.model_tags = ["All", "Merge", "Agent", "Reasoning", "Coding", "General", "Specialized", "Instruction", "Chat"]
26
+
27
+ # Define color scheme for tags
28
+ self.tag_colors = {
29
+ "Merge": "#FF6B6B",
30
+ "Agent": "#4ECDC4",
31
+ "Reasoning": "#FFD166",
32
+ "Coding": "#6B5B95",
33
+ "General": "#88D8B0",
34
+ "Specialized": "#FF8C42",
35
+ "Instruction": "#5D9CEC",
36
+ "Chat": "#AC92EB"
37
+ }
38
+
39
+ def get_leaderboard_data(self, tag=None, benchmark_id=None):
40
+ """Get leaderboard data, optionally filtered by tag or benchmark.
41
+
42
+ Args:
43
+ tag: Model tag to filter by (None for all)
44
+ benchmark_id: Benchmark ID to filter by (None for all)
45
+
46
+ Returns:
47
+ pd.DataFrame: Leaderboard data
48
+ """
49
+ # Get evaluation results from database
50
+ if tag and tag != "All":
51
+ df = self.db_manager.get_leaderboard_df(tag=tag)
52
+ else:
53
+ df = self.db_manager.get_leaderboard_df()
54
+
55
+ # Filter by benchmark if specified
56
+ if benchmark_id and not df.empty:
57
+ df = df[df['benchmark_id'] == benchmark_id]
58
+
59
+ return df
60
+
61
+ def format_leaderboard_for_display(self, df):
62
+ """Format leaderboard data for display.
63
+
64
+ Args:
65
+ df: Leaderboard DataFrame
66
+
67
+ Returns:
68
+ pd.DataFrame: Formatted leaderboard for display
69
+ """
70
+ if df.empty:
71
+ return pd.DataFrame()
72
+
73
+ # Select and rename columns for display
74
+ display_df = df[['model_name', 'benchmark_name', 'tag', 'score', 'completed_at']].copy()
75
+ display_df.columns = ['Model', 'Benchmark', 'Tag', 'Score', 'Completed']
76
+
77
+ # Round score to 2 decimal places
78
+ display_df['Score'] = display_df['Score'].round(2)
79
+
80
+ # Sort by score (descending)
81
+ display_df = display_df.sort_values('Score', ascending=False)
82
+
83
+ return display_df
84
+
85
+ def create_performance_chart(self, df, chart_type="bar"):
86
+ """Create a performance chart from leaderboard data.
87
+
88
+ Args:
89
+ df: Leaderboard DataFrame
90
+ chart_type: Type of chart to create ("bar" or "scatter")
91
+
92
+ Returns:
93
+ plotly.graph_objects.Figure: Performance chart
94
+ """
95
+ if df.empty:
96
+ # Return empty figure
97
+ fig = go.Figure()
98
+ fig.update_layout(
99
+ title="No data available",
100
+ xaxis_title="Model",
101
+ yaxis_title="Score"
102
+ )
103
+ return fig
104
+
105
+ # Prepare data for visualization
106
+ plot_df = df[['model_name', 'benchmark_name', 'tag', 'score']].copy()
107
+ plot_df.columns = ['Model', 'Benchmark', 'Tag', 'Score']
108
+
109
+ # Create chart based on type
110
+ if chart_type == "scatter":
111
+ fig = px.scatter(
112
+ plot_df,
113
+ x="Model",
114
+ y="Score",
115
+ color="Tag",
116
+ symbol="Benchmark",
117
+ size="Score",
118
+ hover_data=["Model", "Benchmark", "Score"],
119
+ color_discrete_map=self.tag_colors
120
+ )
121
+ else: # Default to bar chart
122
+ fig = px.bar(
123
+ plot_df,
124
+ x="Model",
125
+ y="Score",
126
+ color="Tag",
127
+ barmode="group",
128
+ hover_data=["Model", "Benchmark", "Score"],
129
+ color_discrete_map=self.tag_colors
130
+ )
131
+
132
+ # Customize layout
133
+ fig.update_layout(
134
+ title="Model Performance Comparison",
135
+ xaxis_title="Model",
136
+ yaxis_title="Score",
137
+ legend_title="Tag",
138
+ font=dict(size=12)
139
+ )
140
+
141
+ return fig
142
+
143
+ def create_tag_distribution_chart(self, df):
144
+ """Create a chart showing distribution of models by tag.
145
+
146
+ Args:
147
+ df: Leaderboard DataFrame
148
+
149
+ Returns:
150
+ plotly.graph_objects.Figure: Tag distribution chart
151
+ """
152
+ if df.empty:
153
+ # Return empty figure
154
+ fig = go.Figure()
155
+ fig.update_layout(
156
+ title="No data available",
157
+ xaxis_title="Tag",
158
+ yaxis_title="Count"
159
+ )
160
+ return fig
161
+
162
+ # Count models by tag
163
+ tag_counts = df['tag'].value_counts().reset_index()
164
+ tag_counts.columns = ['Tag', 'Count']
165
+
166
+ # Create pie chart
167
+ fig = px.pie(
168
+ tag_counts,
169
+ names='Tag',
170
+ values='Count',
171
+ title='Model Distribution by Tag',
172
+ color='Tag',
173
+ color_discrete_map=self.tag_colors
174
+ )
175
+
176
+ # Customize layout
177
+ fig.update_layout(
178
+ font=dict(size=12)
179
+ )
180
+
181
+ return fig
182
+
183
+ def create_benchmark_comparison_chart(self, df):
184
+ """Create a chart comparing performance across benchmarks.
185
+
186
+ Args:
187
+ df: Leaderboard DataFrame
188
+
189
+ Returns:
190
+ plotly.graph_objects.Figure: Benchmark comparison chart
191
+ """
192
+ if df.empty:
193
+ # Return empty figure
194
+ fig = go.Figure()
195
+ fig.update_layout(
196
+ title="No data available",
197
+ xaxis_title="Benchmark",
198
+ yaxis_title="Average Score"
199
+ )
200
+ return fig
201
+
202
+ # Calculate average score by benchmark
203
+ benchmark_avg = df.groupby('benchmark_name')['score'].mean().reset_index()
204
+ benchmark_avg.columns = ['Benchmark', 'Average Score']
205
+
206
+ # Create bar chart
207
+ fig = px.bar(
208
+ benchmark_avg,
209
+ x='Benchmark',
210
+ y='Average Score',
211
+ title='Average Performance by Benchmark',
212
+ color='Benchmark'
213
+ )
214
+
215
+ # Customize layout
216
+ fig.update_layout(
217
+ xaxis_title="Benchmark",
218
+ yaxis_title="Average Score",
219
+ font=dict(size=12)
220
+ )
221
+
222
+ return fig
223
+
224
+ # Leaderboard UI components
225
+ def create_leaderboard_ui(leaderboard, db_manager):
226
+ """Create the leaderboard UI components.
227
+
228
+ Args:
229
+ leaderboard: Leaderboard instance
230
+ db_manager: Database manager instance
231
+
232
+ Returns:
233
+ gr.Blocks: Gradio Blocks component with leaderboard UI
234
+ """
235
+ with gr.Blocks() as leaderboard_ui:
236
+ gr.Markdown("# Dynamic Highscores Leaderboard")
237
+
238
+ with gr.Row():
239
+ with gr.Column(scale=1):
240
+ tag_filter = gr.Dropdown(
241
+ choices=leaderboard.model_tags,
242
+ value="All",
243
+ label="Filter by Tag"
244
+ )
245
+
246
+ benchmark_filter = gr.Dropdown(
247
+ choices=[("all", "All Benchmarks")],
248
+ value="all",
249
+ label="Filter by Benchmark"
250
+ )
251
+
252
+ refresh_button = gr.Button("Refresh Leaderboard")
253
+
254
+ with gr.Column(scale=2):
255
+ chart_type = gr.Radio(
256
+ choices=["bar", "scatter"],
257
+ value="bar",
258
+ label="Chart Type"
259
+ )
260
+
261
+ view_type = gr.Radio(
262
+ choices=["Table", "Chart", "Dashboard"],
263
+ value="Table",
264
+ label="View Type"
265
+ )
266
+
267
+ # Table view
268
+ leaderboard_table = gr.Dataframe(
269
+ headers=["Model", "Benchmark", "Tag", "Score", "Completed"],
270
+ label="Leaderboard",
271
+ visible=True
272
+ )
273
+
274
+ # Chart view
275
+ with gr.Row(visible=False) as chart_view:
276
+ performance_chart = gr.Plot(label="Performance Chart")
277
+
278
+ # Dashboard view
279
+ with gr.Row(visible=False) as dashboard_view:
280
+ with gr.Column(scale=2):
281
+ dashboard_performance_chart = gr.Plot(label="Performance Comparison")
282
+
283
+ with gr.Column(scale=1):
284
+ with gr.Row():
285
+ tag_distribution_chart = gr.Plot(label="Model Distribution")
286
+
287
+ with gr.Row():
288
+ benchmark_comparison_chart = gr.Plot(label="Benchmark Comparison")
289
+
290
+ # Event handlers
291
+ def refresh_benchmarks():
292
+ benchmarks = db_manager.get_benchmarks()
293
+
294
+ # Format for dropdown
295
+ choices = [("all", "All Benchmarks")]
296
+ choices.extend([(str(b["id"]), b["name"]) for b in benchmarks])
297
+
298
+ return gr.update(choices=choices)
299
+
300
+ def update_leaderboard(tag, benchmark_id, chart_type_val, view_type_val):
301
+ # Get leaderboard data
302
+ if benchmark_id == "all":
303
+ benchmark_id = None
304
+ else:
305
+ benchmark_id = int(benchmark_id)
306
+
307
+ df = leaderboard.get_leaderboard_data(tag=tag, benchmark_id=benchmark_id)
308
+
309
+ # Format for display
310
+ display_df = leaderboard.format_leaderboard_for_display(df)
311
+
312
+ # Create charts
313
+ perf_chart = leaderboard.create_performance_chart(df, chart_type=chart_type_val)
314
+ tag_chart = leaderboard.create_tag_distribution_chart(df)
315
+ benchmark_chart = leaderboard.create_benchmark_comparison_chart(df)
316
+
317
+ # Update visibility based on view type
318
+ table_visible = view_type_val == "Table"
319
+ chart_visible = view_type_val == "Chart"
320
+ dashboard_visible = view_type_val == "Dashboard"
321
+
322
+ return (
323
+ display_df,
324
+ perf_chart,
325
+ perf_chart, # Same chart for both views
326
+ tag_chart,
327
+ benchmark_chart,
328
+ gr.update(visible=table_visible),
329
+ gr.update(visible=chart_visible),
330
+ gr.update(visible=dashboard_visible)
331
+ )
332
+
333
+ # Connect event handlers
334
+ refresh_button.click(
335
+ fn=lambda tag, benchmark, chart_t, view_t: update_leaderboard(tag, benchmark, chart_t, view_t),
336
+ inputs=[tag_filter, benchmark_filter, chart_type, view_type],
337
+ outputs=[
338
+ leaderboard_table,
339
+ performance_chart,
340
+ dashboard_performance_chart,
341
+ tag_distribution_chart,
342
+ benchmark_comparison_chart,
343
+ leaderboard_table,
344
+ chart_view,
345
+ dashboard_view
346
+ ]
347
+ )
348
+
349
+ view_type.change(
350
+ fn=lambda view_t: (
351
+ gr.update(visible=view_t == "Table"),
352
+ gr.update(visible=view_t == "Chart"),
353
+ gr.update(visible=view_t == "Dashboard")
354
+ ),
355
+ inputs=[view_type],
356
+ outputs=[leaderboard_table, chart_view, dashboard_view]
357
+ )
358
+
359
+ # Initialize on load
360
+ leaderboard_ui.load(
361
+ fn=refresh_benchmarks,
362
+ inputs=[],
363
+ outputs=[benchmark_filter]
364
+ )
365
+
366
+ leaderboard_ui.load(
367
+ fn=lambda: update_leaderboard("All", "all", "bar", "Table"),
368
+ inputs=[],
369
+ outputs=[
370
+ leaderboard_table,
371
+ performance_chart,
372
+ dashboard_performance_chart,
373
+ tag_distribution_chart,
374
+ benchmark_comparison_chart,
375
+ leaderboard_table,
376
+ chart_view,
377
+ dashboard_view
378
+ ]
379
+ )
380
+
381
+ return leaderboard_ui
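
Integration note (illustrative, not part of the commit): one way the three uploaded modules might be wired into a single Gradio app. The auth module (AuthManager) is assumed rather than real — it is referenced by evaluation_queue.py but not included in this upload — so its import and constructor are hypothetical.

import gradio as gr

from database_schema import init_db
from evaluation_queue import EvaluationQueue, create_model_submission_ui
from leaderboard import Leaderboard, create_leaderboard_ui
from auth import AuthManager  # hypothetical module supplying check_login / can_submit_benchmark

db = init_db()
auth_manager = AuthManager(db)  # assumed constructor
eval_queue = EvaluationQueue(db, auth_manager)
board = Leaderboard(db)

# Build each sub-UI from the functions defined in the files above
leaderboard_ui = create_leaderboard_ui(board, db)
submission_ui = create_model_submission_ui(eval_queue, auth_manager, db)

# Combine them into one tabbed app
app = gr.TabbedInterface(
    [leaderboard_ui, submission_ui],
    tab_names=["Leaderboard", "Submit & Queue"],
)

if __name__ == "__main__":
    eval_queue.start_worker()  # begin processing pending evaluations in the background
    app.launch()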