Quazim0t0 committed
Commit 5be33ac · verified · 1 Parent(s): d45815a

Upload 12 files

Files changed (12)
  1. README.md +14 -8
  2. app.py +115 -0
  3. auth.py +309 -0
  4. benchmark_selection.py +511 -0
  5. database_schema.py +393 -0
  6. evaluation_queue.py +964 -0
  7. leaderboard.py +381 -0
  8. requirements.txt +12 -0
  9. sample_benchmarks.py +66 -0
  10. space.yaml +23 -0
  11. test_app.py +237 -0
  12. todo.md +48 -0
README.md CHANGED
@@ -1,12 +1,18 @@
 ---
-title: QL
-emoji: 🌍
+title: qLeaderboard
+emoji: 🥇
 colorFrom: red
-colorTo: green
+colorTo: yellow
 sdk: gradio
-sdk_version: 5.22.0
+sdk_version: 4.36.0
 app_file: app.py
-pinned: false
----
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+pinned: true
+license: apache-2.0
+duplicated_from: logikon/open_cot_leaderboard
+fullWidth: true
+tags:
+- leaderboard
+- CoT
+- chain-of-thought
+short_description: Track, rank and evaluate on qLeaderboard
+---
app.py ADDED
@@ -0,0 +1,115 @@
"""
Main application for Dynamic Highscores system.

This file integrates all components into a unified application.
"""

import os
import gradio as gr
import threading
import queue
from database_schema import init_db
from auth import HuggingFaceAuth, create_login_ui, setup_auth_handlers
from benchmark_selection import BenchmarkSelector, create_benchmark_selection_ui
from evaluation_queue import EvaluationQueue, create_model_submission_ui
from leaderboard import Leaderboard, create_leaderboard_ui
from sample_benchmarks import add_sample_benchmarks

# Initialize database
db = init_db()

# Initialize components
auth_manager = HuggingFaceAuth(db)
benchmark_selector = BenchmarkSelector(db, auth_manager)
evaluation_queue = EvaluationQueue(db, auth_manager)
leaderboard = Leaderboard(db)

# Initialize sample benchmarks if none exist
benchmarks = db.get_benchmarks()
if not benchmarks or len(benchmarks) == 0:
    print("No benchmarks found. Adding sample benchmarks...")
    num_added = add_sample_benchmarks()
    print(f"Added {num_added} sample benchmarks.")

# Custom CSS
css = """
.info-text {
    background-color: #f0f7ff;
    padding: 12px;
    border-radius: 8px;
    border-left: 4px solid #3498db;
    margin: 12px 0;
}

.container {
    max-width: 1200px;
    margin: 0 auto;
}

.header {
    text-align: center;
    margin-bottom: 20px;
}

.footer {
    text-align: center;
    margin-top: 40px;
    padding: 20px;
    border-top: 1px solid #eee;
}
"""

# Create Gradio app
with gr.Blocks(css=css, title="Dynamic Highscores") as app:
    gr.Markdown("# 🏆 Dynamic Highscores", elem_classes=["header"])
    gr.Markdown("""
    Welcome to Dynamic Highscores - a community benchmark platform for evaluating and comparing language models.

    - **Add your own benchmarks** from HuggingFace datasets
    - **Submit your models** for CPU-only evaluation
    - **Compare performance** across different models and benchmarks
    - **Filter results** by model type (Merge, Agent, Reasoning, Coding, etc.)
    """, elem_classes=["info-text"])

    # Authentication UI
    login_button, logout_button, token_input, user_info = create_login_ui()
    setup_auth_handlers(login_button, logout_button, token_input, user_info, auth_manager)

    # Main tabs
    with gr.Tabs() as tabs:
        with gr.TabItem("📊 Leaderboard", id=0):
            # Fix: Pass db_manager parameter to create_leaderboard_ui
            leaderboard_ui = create_leaderboard_ui(leaderboard, db)

        with gr.TabItem("🚀 Submit Model", id=1):
            submission_ui = create_model_submission_ui(evaluation_queue, auth_manager, db)

        with gr.TabItem("🔍 Benchmarks", id=2):
            benchmark_ui = create_benchmark_selection_ui(benchmark_selector, auth_manager)

    gr.Markdown("""
    ### About Dynamic Highscores

    This platform allows users to select benchmarks from HuggingFace datasets and evaluate models against them.
    Each user can submit one benchmark per day (admin users are exempt from this limit).
    All evaluations run on CPU only to ensure fair comparisons.

    Created by Quazim0t0
    """, elem_classes=["footer"])

# Start evaluation queue worker after app is defined
# This prevents the worker from starting before the app is fully initialized
def start_queue_worker():
    # Wait a moment to ensure app is initialized
    import time
    time.sleep(2)
    evaluation_queue.start_worker()

# Launch the app
if __name__ == "__main__":
    # Start queue worker in a separate thread to avoid SQLite thread issues
    queue_thread = threading.Thread(target=start_queue_worker)
    queue_thread.daemon = True
    queue_thread.start()

    app.launch()
auth.py ADDED
@@ -0,0 +1,309 @@
"""
Authentication module for Dynamic Highscores system.

This module handles user authentication with HuggingFace,
user session management, and access control.
"""

import os
import json
import time
import requests
import gradio as gr
from huggingface_hub import HfApi, login
from functools import wraps

class HuggingFaceAuth:
    """Authentication manager for HuggingFace integration."""

    def __init__(self, db_manager):
        """Initialize the authentication manager.

        Args:
            db_manager: Database manager instance for user storage
        """
        self.db_manager = db_manager
        self.hf_api = HfApi()
        self.admin_username = os.environ.get("ADMIN_USERNAME", "Quazim0t0")

    def login_user(self, token):
        """Log in a user with their HuggingFace token.

        Args:
            token: HuggingFace API token

        Returns:
            dict: User information if login successful, None otherwise
        """
        try:
            # Validate token with HuggingFace
            login(token=token, add_to_git_credential=False)

            # Get user info from HuggingFace
            user_info = self.hf_api.whoami(token=token)

            if not user_info:
                return None

            # Check if user exists in our database, create if not
            username = user_info.get("name", user_info.get("fullname", ""))
            hf_user_id = user_info.get("id", "")

            if not hf_user_id:
                return None

            # Check if this is the admin account
            is_admin = (username == self.admin_username)

            # Add or get user from database
            user_id = self.db_manager.add_user(username, hf_user_id, is_admin)

            # Get complete user info from database
            user = self.db_manager.get_user(hf_user_id)

            if user:
                # Add token to user info for session only (not stored in database)
                user['token'] = token
                return user

            return None
        except Exception as e:
            print(f"Login error: {e}")
            return None

    def check_login(self, request: gr.Request):
        """Check if a user is logged in from a Gradio request.

        Args:
            request: Gradio request object

        Returns:
            dict: User information if logged in, None otherwise
        """
        if not request:
            return None

        # Get token from cookies
        token = request.cookies.get("hf_token")

        if not token:
            return None

        try:
            # Validate token with HuggingFace
            user_info = self.hf_api.whoami(token=token)

            if not user_info:
                return None

            # Get user from database
            hf_user_id = user_info.get("id", "")
            user = self.db_manager.get_user(hf_user_id)

            if user:
                # Add token to user info for session only (not stored in database)
                user['token'] = token
                return user

            return None
        except Exception as e:
            print(f"Check login error: {e}")
            return None

    def require_login(self, func):
        """Decorator to require login for a function.

        Args:
            func: Function to decorate

        Returns:
            Function: Decorated function that requires login
        """
        @wraps(func)
        def wrapper(*args, **kwargs):
            # Find the request argument
            request = None
            for arg in args:
                if isinstance(arg, gr.Request):
                    request = arg
                    break

            if not request and 'request' in kwargs:
                request = kwargs['request']

            if not request:
                return "Please log in to access this feature."

            # Check if user is logged in
            user = self.check_login(request)

            if not user:
                return "Please log in to access this feature."

            # Add user to kwargs
            kwargs['user'] = user

            # Call the original function
            return func(*args, **kwargs)

        return wrapper

    def require_admin(self, func):
        """Decorator to require admin privileges for a function.

        Args:
            func: Function to decorate

        Returns:
            Function: Decorated function that requires admin privileges
        """
        @wraps(func)
        def wrapper(*args, **kwargs):
            # Find the request argument
            request = None
            for arg in args:
                if isinstance(arg, gr.Request):
                    request = arg
                    break

            if not request and 'request' in kwargs:
                request = kwargs['request']

            if not request:
                return "Admin access required."

            # Check if user is logged in
            user = self.check_login(request)

            if not user:
                return "Admin access required."

            # Check if user is admin
            if not user.get('is_admin', False):
                return "Admin access required."

            # Add user to kwargs
            kwargs['user'] = user

            # Call the original function
            return func(*args, **kwargs)

        return wrapper

    def can_submit_benchmark(self, user_id):
        """Check if a user can submit a benchmark today.

        Args:
            user_id: User ID to check

        Returns:
            bool: True if user can submit, False otherwise
        """
        return self.db_manager.can_submit_today(user_id)

    def update_submission_date(self, user_id):
        """Update the last submission date for a user.

        Args:
            user_id: User ID to update
        """
        self.db_manager.update_submission_date(user_id)

# Authentication UI components
def create_login_ui():
    """Create the login UI components.

    Returns:
        tuple: (login_button, logout_button, token_input, user_info)
    """
    with gr.Row():
        with gr.Column(scale=3):
            token_input = gr.Textbox(
                placeholder="Enter your HuggingFace token",
                label="HuggingFace Token",
                type="password",
                visible=True,
                info="Your token is only stored temporarily in browser session cookies and is never saved permanently"
            )
            login_button = gr.Button("Login")
            logout_button = gr.Button("Logout", visible=False)

        with gr.Column(scale=2):
            user_info = gr.Markdown("Not logged in")

    return login_button, logout_button, token_input, user_info

def login_handler(token, auth_manager):
    """Handle login button click.

    Args:
        token: HuggingFace token
        auth_manager: Authentication manager instance

    Returns:
        tuple: Updated UI components visibility and user info
    """
    if not token:
        return gr.update(visible=True), gr.update(visible=False), "Please enter your HuggingFace token"

    user = auth_manager.login_user(token)

    if user:
        # Set cookie in JavaScript with session-only flag (no persistent storage)
        # Cookie will expire when browser is closed
        js = f"""
        document.cookie = "hf_token={token}; path=/; SameSite=Strict";
        """

        # Return updated UI components
        return (
            gr.update(visible=False),  # Hide token input
            gr.update(visible=True),   # Show logout button
            f"Logged in as {user['username']}"  # Update user info
        )
    else:
        return (
            gr.update(visible=True),   # Keep token input visible
            gr.update(visible=False),  # Hide logout button
            "Login failed. Please check your token and try again."  # Update user info
        )

def logout_handler():
    """Handle logout button click.

    Returns:
        tuple: Updated UI components visibility and user info
    """
    # Clear cookie in JavaScript
    js = """
    document.cookie = "hf_token=; path=/; max-age=0; SameSite=Strict";
    """

    # Return updated UI components
    return (
        gr.update(visible=True),   # Show token input
        gr.update(visible=False),  # Hide logout button
        "Logged out"  # Update user info
    )

def setup_auth_handlers(login_button, logout_button, token_input, user_info, auth_manager):
    """Set up event handlers for authentication UI components.

    Args:
        login_button: Login button component
        logout_button: Logout button component
        token_input: Token input component
        user_info: User info component
        auth_manager: Authentication manager instance
    """
    login_button.click(
        fn=lambda token: login_handler(token, auth_manager),
        inputs=[token_input],
        outputs=[token_input, logout_button, user_info]
    )

    logout_button.click(
        fn=logout_handler,
        inputs=[],
        outputs=[token_input, logout_button, user_info]
    )
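One gap worth noting: login_handler and logout_handler build a JavaScript cookie snippet in a local `js` variable but never attach it to the Gradio event, so the `hf_token` cookie that `check_login` reads is never actually written by this code. Below is a minimal sketch of one way to wire that up inside `setup_auth_handlers`, assuming the Gradio 4.x `js=` argument on event listeners (3.x used `_js`); this is illustrative and not part of the uploaded files.

    # Hypothetical additions to setup_auth_handlers: purely client-side listeners
    # that write/clear the cookie the server-side check_login() expects.
    # Assumed API: Gradio 4.x event listeners accept a `js` string (fn may be None).
    login_button.click(
        fn=None,
        inputs=[token_input],
        outputs=[],
        js="(token) => { document.cookie = `hf_token=${token}; path=/; SameSite=Strict`; }",
    )
    logout_button.click(
        fn=None,
        inputs=[],
        outputs=[],
        js="() => { document.cookie = 'hf_token=; path=/; max-age=0; SameSite=Strict'; }",
    )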
benchmark_selection.py ADDED
@@ -0,0 +1,511 @@
"""
Benchmark selection module for Dynamic Highscores system.

This module handles browsing, selection, and loading of HuggingFace datasets
to be used as benchmarks for model evaluation.
"""

import os
import json
import gradio as gr
from huggingface_hub import HfApi, list_datasets
from datasets import load_dataset, get_dataset_config_names
from functools import partial

class BenchmarkSelector:
    """Benchmark selection manager for HuggingFace datasets."""

    def __init__(self, db_manager, auth_manager):
        """Initialize the benchmark selector.

        Args:
            db_manager: Database manager instance for benchmark storage
            auth_manager: Authentication manager instance for access control
        """
        self.db_manager = db_manager
        self.auth_manager = auth_manager
        self.hf_api = HfApi()

        # Common benchmark categories for filtering
        self.categories = [
            "All",
            "Text Generation",
            "Question Answering",
            "Summarization",
            "Translation",
            "Classification",
            "Code Generation",
            "Reasoning",
            "Math"
        ]

        # Common metrics for different benchmark types
        self.metric_templates = {
            "Text Generation": ["bleu", "rouge", "meteor"],
            "Question Answering": ["exact_match", "f1"],
            "Summarization": ["rouge1", "rouge2", "rougeL"],
            "Translation": ["bleu", "ter"],
            "Classification": ["accuracy", "f1", "precision", "recall"],
            "Code Generation": ["exact_match", "pass@k", "functional_correctness"],
            "Reasoning": ["accuracy", "consistency"],
            "Math": ["accuracy", "correct_steps"]
        }

    def search_datasets(self, query, category="All", limit=50):
        """Search for datasets on HuggingFace.

        Args:
            query: Search query string
            category: Dataset category to filter by
            limit: Maximum number of results to return

        Returns:
            list: List of dataset information dictionaries
        """
        try:
            # Apply category filter if not "All"
            filter_str = None
            if category != "All":
                filter_str = f"task_categories:{category}"

            # Search for datasets
            datasets = list_datasets(
                search=query,
                filter=filter_str,
                limit=limit
            )

            # Format results
            results = []
            for dataset in datasets:
                results.append({
                    "id": dataset.id,
                    "name": dataset.id.split("/")[-1],
                    "author": dataset.author,
                    "description": dataset.description[:200] + "..." if dataset.description and len(dataset.description) > 200 else dataset.description,
                    "tags": dataset.tags,
                    "downloads": dataset.downloads
                })

            return results
        except Exception as e:
            print(f"Dataset search error: {e}")
            return []

    def get_dataset_info(self, dataset_id):
        """Get detailed information about a dataset.

        Args:
            dataset_id: HuggingFace dataset ID

        Returns:
            dict: Dataset information
        """
        try:
            # Get dataset info from HuggingFace
            dataset_info = self.hf_api.dataset_info(dataset_id)

            # Get available configurations
            configs = get_dataset_config_names(dataset_id)

            # Format result
            result = {
                "id": dataset_info.id,
                "name": dataset_info.id.split("/")[-1],
                "author": dataset_info.author,
                "description": dataset_info.description,
                "citation": dataset_info.citation,
                "configs": configs,
                "tags": dataset_info.tags,
                "downloads": dataset_info.downloads
            }

            return result
        except Exception as e:
            print(f"Dataset info error: {e}")
            return None

    def load_dataset_sample(self, dataset_id, config=None, split="train", sample_size=5):
        """Load a sample from a dataset.

        Args:
            dataset_id: HuggingFace dataset ID
            config: Dataset configuration name
            split: Dataset split to sample from
            sample_size: Number of samples to load

        Returns:
            dict: Dataset sample information
        """
        try:
            # Load dataset
            if config:
                dataset = load_dataset(dataset_id, config, split=split)
            else:
                dataset = load_dataset(dataset_id, split=split)

            # Get sample
            if len(dataset) > sample_size:
                sample = dataset.select(range(sample_size))
            else:
                sample = dataset

            # Get features
            features = list(sample.features.keys())

            # Convert sample to list of dictionaries
            sample_data = []
            for item in sample:
                sample_item = {}
                for key in features:
                    # Convert non-serializable values to strings
                    if isinstance(item[key], (list, dict)):
                        sample_item[key] = str(item[key])
                    else:
                        sample_item[key] = item[key]
                sample_data.append(sample_item)

            # Format result
            result = {
                "id": dataset_id,
                "config": config,
                "split": split,
                "features": features,
                "sample": sample_data,
                "total_size": len(dataset)
            }

            return result
        except Exception as e:
            print(f"Dataset sample error: {e}")
            return None

    def add_benchmark(self, dataset_id, name=None, description=None, metrics=None, config=None):
        """Add a dataset as a benchmark.

        Args:
            dataset_id: HuggingFace dataset ID
            name: Benchmark name (defaults to dataset name)
            description: Benchmark description (defaults to dataset description)
            metrics: Metrics to use for evaluation
            config: Dataset configuration to use

        Returns:
            int: Benchmark ID if successful, None otherwise
        """
        try:
            # Get dataset info if name or description not provided
            if not name or not description:
                dataset_info = self.get_dataset_info(dataset_id)
                if not dataset_info:
                    return None

                if not name:
                    name = dataset_info["name"]

                if not description:
                    description = dataset_info["description"]

            # Format dataset ID with config if provided
            full_dataset_id = dataset_id
            if config:
                full_dataset_id = f"{dataset_id}:{config}"

            # Add benchmark to database
            benchmark_id = self.db_manager.add_benchmark(
                name=name,
                dataset_id=full_dataset_id,
                description=description,
                metrics=metrics
            )

            return benchmark_id
        except Exception as e:
            print(f"Add benchmark error: {e}")
            return None

    def get_benchmarks(self):
        """Get all available benchmarks.

        Returns:
            list: List of benchmark information dictionaries
        """
        return self.db_manager.get_benchmarks()

# Benchmark selection UI components
def create_benchmark_selection_ui(benchmark_selector, auth_manager):
    """Create the benchmark selection UI components.

    Args:
        benchmark_selector: Benchmark selector instance
        auth_manager: Authentication manager instance

    Returns:
        gr.Blocks: Gradio Blocks component with benchmark selection UI
    """
    with gr.Blocks() as benchmark_ui:
        gr.Markdown("## 📊 Dynamic Highscores Benchmark Selection")
        gr.Markdown("""
        ### Add your own datasets from HuggingFace as benchmarks!

        You can add any dataset from HuggingFace to use as a benchmark for evaluating models.
        Simply enter the dataset ID (e.g., 'squad', 'glue', 'hellaswag') and add it as a benchmark.

        Other users will be able to select your added benchmarks for their model evaluations.
        """, elem_classes=["info-text"])

        with gr.Tabs() as tabs:
            with gr.TabItem("➕ Add New Benchmark", id=0):
                with gr.Row():
                    with gr.Column(scale=3):
                        search_input = gr.Textbox(
                            placeholder="Search for datasets on HuggingFace...",
                            label="Search",
                            show_label=False
                        )

                    with gr.Column(scale=1):
                        category_dropdown = gr.Dropdown(
                            choices=benchmark_selector.categories,
                            value="All",
                            label="Category"
                        )

                    with gr.Column(scale=1):
                        search_button = gr.Button("Search")

                dataset_results = gr.Dataframe(
                    headers=["Name", "Author", "Description", "Downloads"],
                    datatype=["str", "str", "str", "number"],
                    label="Search Results",
                    interactive=True
                )

                with gr.Row():
                    with gr.Column(scale=2):
                        dataset_id_input = gr.Textbox(
                            placeholder="Enter HuggingFace dataset ID (e.g., 'squad', 'glue', 'hellaswag')",
                            label="Dataset ID",
                            info="You can enter any dataset ID from HuggingFace"
                        )

                    with gr.Column(scale=1):
                        view_button = gr.Button("View Dataset Details")

                with gr.Accordion("Dataset Details", open=False):
                    dataset_info = gr.JSON(label="Dataset Information")

                with gr.Row():
                    config_dropdown = gr.Dropdown(
                        label="Configuration",
                        choices=[],
                        interactive=True
                    )

                    split_dropdown = gr.Dropdown(
                        label="Split",
                        choices=["train", "validation", "test"],
                        value="train",
                        interactive=True
                    )

                sample_button = gr.Button("Load Sample")

                sample_data = gr.Dataframe(
                    label="Sample Data",
                    interactive=False
                )

                gr.Markdown("### Add this dataset as a benchmark")
                with gr.Row():
                    with gr.Column(scale=2):
                        benchmark_name = gr.Textbox(
                            placeholder="Enter a name for this benchmark",
                            label="Benchmark Name",
                            info="A descriptive name for this benchmark"
                        )

                        benchmark_description = gr.Textbox(
                            placeholder="Enter a description for this benchmark",
                            label="Description",
                            info="Explain what this benchmark evaluates",
                            lines=3
                        )

                    with gr.Column(scale=1):
                        metrics_input = gr.CheckboxGroup(
                            label="Evaluation Metrics",
                            choices=[],
                            interactive=True,
                            info="Select metrics to use for evaluation"
                        )

                with gr.Row():
                    add_benchmark_button = gr.Button("Add as Benchmark", size="lg", variant="primary")

                benchmark_status = gr.Markdown("")

            with gr.TabItem("📋 Available Benchmarks", id=1):
                gr.Markdown("### Benchmarks available for model evaluation")
                gr.Markdown("These benchmarks can be selected when submitting models for evaluation.")

                with gr.Row():
                    refresh_benchmarks_button = gr.Button("Refresh Benchmarks")

                benchmarks_container = gr.Column()
                with benchmarks_container:
                    no_benchmarks_message = gr.Markdown(
                        "### No Datasets Added Yet\n\nBe the first to add a benchmark dataset! Go to the 'Add New Benchmark' tab to add a dataset from HuggingFace.",
                        visible=True
                    )

                    my_benchmarks = gr.Dataframe(
                        headers=["ID", "Name", "Dataset", "Description"],
                        label="Available Benchmarks",
                        interactive=True,
                        visible=False
                    )

        # Event handlers
        def search_datasets_handler(query, category):
            if not query:
                return None

            results = benchmark_selector.search_datasets(query, category)

            # Format for dataframe
            formatted_results = []
            for result in results:
                formatted_results.append([
                    result["name"],
                    result["author"],
                    result["description"],
                    result["downloads"]
                ])

            return formatted_results

        def view_dataset_handler(dataset_id):
            if not dataset_id:
                return None, [], None

            dataset_info = benchmark_selector.get_dataset_info(dataset_id)

            if not dataset_info:
                return None, [], None

            # Update metrics based on dataset tags
            metrics = []
            for category, category_metrics in benchmark_selector.metric_templates.items():
                if any(tag.lower() in [t.lower() for t in dataset_info["tags"]] for tag in category.lower().split()):
                    metrics.extend(category_metrics)

            # Remove duplicates
            metrics = list(set(metrics))

            return dataset_info, dataset_info["configs"], gr.update(choices=metrics)

        def load_sample_handler(dataset_id, config, split):
            if not dataset_id:
                return None

            sample_info = benchmark_selector.load_dataset_sample(
                dataset_id,
                config=config if config else None,
                split=split
            )

            if not sample_info:
                return None

            return sample_info["sample"]

        def add_benchmark_handler(dataset_id, config, name, description, metrics, request: gr.Request):
            if not dataset_id:
                return "Please enter a dataset ID from HuggingFace."

            # Check if user is logged in
            user = auth_manager.check_login(request)

            if not user:
                return "Please log in to add benchmarks."

            # Add benchmark
            benchmark_id = benchmark_selector.add_benchmark(
                dataset_id=dataset_id,
                name=name if name else None,
                description=description if description else None,
                metrics=metrics if metrics else None,
                config=config if config else None
            )

            if benchmark_id:
                return f"✅ Benchmark added successfully with ID: {benchmark_id}\n\nThis dataset is now available for model evaluation. You can view it in the 'Available Benchmarks' tab."
            else:
                return "❌ Failed to add benchmark. Please check the dataset ID and try again."

        def get_benchmarks_handler(request: gr.Request):
            # Check if user is logged in
            user = auth_manager.check_login(request)

            if not user:
                return gr.update(visible=True), gr.update(visible=False), None

            # Get benchmarks
            benchmarks = benchmark_selector.get_benchmarks()

            # If no benchmarks, show message
            if not benchmarks or len(benchmarks) == 0:
                return gr.update(visible=True), gr.update(visible=False), None

            # Format for dataframe
            formatted_benchmarks = []
            for benchmark in benchmarks:
                formatted_benchmarks.append([
                    benchmark["id"],
                    benchmark["name"],
                    benchmark["dataset_id"],
                    benchmark["description"]
                ])

            return gr.update(visible=False), gr.update(visible=True), formatted_benchmarks

        # Connect event handlers
        search_button.click(
            fn=search_datasets_handler,
            inputs=[search_input, category_dropdown],
            outputs=[dataset_results]
        )

        view_button.click(
            fn=view_dataset_handler,
            inputs=[dataset_id_input],
            outputs=[dataset_info, config_dropdown, metrics_input]
        )

        sample_button.click(
            fn=load_sample_handler,
            inputs=[dataset_id_input, config_dropdown, split_dropdown],
            outputs=[sample_data]
        )

        add_benchmark_button.click(
            fn=add_benchmark_handler,
            inputs=[dataset_id_input, config_dropdown, benchmark_name, benchmark_description, metrics_input],
            outputs=[benchmark_status]
        )

        refresh_benchmarks_button.click(
            fn=get_benchmarks_handler,
            inputs=[],
            outputs=[no_benchmarks_message, my_benchmarks, my_benchmarks]
        )

        # Initialize benchmarks on load
        benchmark_ui.load(
            fn=get_benchmarks_handler,
            inputs=[],
            outputs=[no_benchmarks_message, my_benchmarks, my_benchmarks]
        )

    return benchmark_ui
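For orientation, a minimal sketch of driving BenchmarkSelector directly, without the Gradio UI, using only classes and functions from the uploaded modules; the dataset ID and metric below are illustrative and Hub access is assumed.

    # Hypothetical standalone usage of the classes uploaded in this commit.
    from database_schema import init_db
    from auth import HuggingFaceAuth
    from benchmark_selection import BenchmarkSelector

    db = init_db("dynamic_highscores.db")
    selector = BenchmarkSelector(db, HuggingFaceAuth(db))

    # Search the Hub, inspect the first hit, and register it as a benchmark.
    hits = selector.search_datasets("hellaswag", category="All", limit=5)
    if hits:
        dataset_id = hits[0]["id"]
        print(selector.get_dataset_info(dataset_id))
        benchmark_id = selector.add_benchmark(dataset_id, metrics=["accuracy"])
        print(f"Registered benchmark {benchmark_id}")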
database_schema.py ADDED
@@ -0,0 +1,393 @@
"""
Database schema for Dynamic Highscores system.

This module defines the SQLite database schema for the Dynamic Highscores system,
which integrates benchmark selection, model evaluation, and leaderboard functionality.
"""

import sqlite3
import os
import json
from datetime import datetime, timedelta
import pandas as pd

class DynamicHighscoresDB:
    """Database manager for the Dynamic Highscores system."""

    def __init__(self, db_path="dynamic_highscores.db"):
        """Initialize the database connection and create tables if they don't exist."""
        self.db_path = db_path
        self.conn = None
        self.cursor = None
        self.connect()
        self.create_tables()

    def connect(self):
        """Connect to the SQLite database."""
        self.conn = sqlite3.connect(self.db_path)
        self.conn.row_factory = sqlite3.Row
        self.cursor = self.conn.cursor()

    def close(self):
        """Close the database connection."""
        if self.conn:
            self.conn.close()

    def create_tables(self):
        """Create all necessary tables if they don't exist."""
        # Users table - stores user information
        self.cursor.execute('''
        CREATE TABLE IF NOT EXISTS users (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            username TEXT UNIQUE NOT NULL,
            hf_user_id TEXT UNIQUE NOT NULL,
            is_admin BOOLEAN DEFAULT 0,
            last_submission_date TEXT,
            created_at TEXT DEFAULT CURRENT_TIMESTAMP
        )
        ''')

        # Benchmarks table - stores information about available benchmarks
        self.cursor.execute('''
        CREATE TABLE IF NOT EXISTS benchmarks (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            name TEXT NOT NULL,
            dataset_id TEXT NOT NULL,
            description TEXT,
            metrics TEXT,  -- JSON string of metrics
            created_at TEXT DEFAULT CURRENT_TIMESTAMP
        )
        ''')

        # Models table - stores information about submitted models
        self.cursor.execute('''
        CREATE TABLE IF NOT EXISTS models (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            name TEXT NOT NULL,
            hf_model_id TEXT NOT NULL,
            user_id INTEGER NOT NULL,
            tag TEXT NOT NULL,  -- One of: Merge, Agent, Reasoning, Coding, etc.
            parameters TEXT,  -- Number of parameters (can be NULL)
            description TEXT,
            created_at TEXT DEFAULT CURRENT_TIMESTAMP,
            FOREIGN KEY (user_id) REFERENCES users (id),
            UNIQUE (hf_model_id, user_id)
        )
        ''')

        # Evaluations table - stores evaluation results
        self.cursor.execute('''
        CREATE TABLE IF NOT EXISTS evaluations (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            model_id INTEGER NOT NULL,
            benchmark_id INTEGER NOT NULL,
            status TEXT NOT NULL,  -- pending, running, completed, failed
            results TEXT,  -- JSON string of results
            score REAL,  -- Overall score (can be NULL)
            submitted_at TEXT DEFAULT CURRENT_TIMESTAMP,
            completed_at TEXT,
            FOREIGN KEY (model_id) REFERENCES models (id),
            FOREIGN KEY (benchmark_id) REFERENCES benchmarks (id)
        )
        ''')

        # Queue table - stores evaluation queue
        self.cursor.execute('''
        CREATE TABLE IF NOT EXISTS queue (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            evaluation_id INTEGER NOT NULL,
            priority INTEGER DEFAULT 0,  -- Higher number = higher priority
            added_at TEXT DEFAULT CURRENT_TIMESTAMP,
            FOREIGN KEY (evaluation_id) REFERENCES evaluations (id)
        )
        ''')

        self.conn.commit()

    # User management methods
    def add_user(self, username, hf_user_id, is_admin=False):
        """Add a new user to the database."""
        try:
            self.cursor.execute(
                "INSERT INTO users (username, hf_user_id, is_admin) VALUES (?, ?, ?)",
                (username, hf_user_id, is_admin)
            )
            self.conn.commit()
            return self.cursor.lastrowid
        except sqlite3.IntegrityError:
            # User already exists
            self.cursor.execute(
                "SELECT id FROM users WHERE hf_user_id = ?",
                (hf_user_id,)
            )
            return self.cursor.fetchone()[0]

    def get_user(self, hf_user_id):
        """Get user information by HuggingFace user ID."""
        self.cursor.execute(
            "SELECT * FROM users WHERE hf_user_id = ?",
            (hf_user_id,)
        )
        row = self.cursor.fetchone()  # fetch once; calling fetchone() twice would discard the row
        return dict(row) if row else None

    def can_submit_today(self, user_id):
        """Check if a user can submit a benchmark evaluation today."""
        self.cursor.execute(
            "SELECT is_admin, last_submission_date FROM users WHERE id = ?",
            (user_id,)
        )
        result = self.cursor.fetchone()

        if not result:
            return False

        user_data = dict(result)

        # Admin can always submit
        if user_data['is_admin']:
            return True

        # If no previous submission, user can submit
        if not user_data['last_submission_date']:
            return True

        # Check if last submission was before today
        last_date = datetime.fromisoformat(user_data['last_submission_date'])
        today = datetime.now().replace(hour=0, minute=0, second=0, microsecond=0)

        return last_date < today

    def update_submission_date(self, user_id):
        """Update the last submission date for a user."""
        current_time = datetime.now().isoformat()
        self.cursor.execute(
            "UPDATE users SET last_submission_date = ? WHERE id = ?",
            (current_time, user_id)
        )
        self.conn.commit()

    # Benchmark management methods
    def add_benchmark(self, name, dataset_id, description="", metrics=None):
        """Add a new benchmark to the database."""
        if metrics is None:
            metrics = {}

        metrics_json = json.dumps(metrics)

        try:
            self.cursor.execute(
                "INSERT INTO benchmarks (name, dataset_id, description, metrics) VALUES (?, ?, ?, ?)",
                (name, dataset_id, description, metrics_json)
            )
            self.conn.commit()
            return self.cursor.lastrowid
        except sqlite3.IntegrityError:
            # Benchmark already exists with this dataset_id
            self.cursor.execute(
                "SELECT id FROM benchmarks WHERE dataset_id = ?",
                (dataset_id,)
            )
            return self.cursor.fetchone()[0]

    def get_benchmarks(self):
        """Get all available benchmarks."""
        self.cursor.execute("SELECT * FROM benchmarks")
        benchmarks = [dict(row) for row in self.cursor.fetchall()]

        # Parse metrics JSON
        for benchmark in benchmarks:
            benchmark['metrics'] = json.loads(benchmark['metrics'])

        return benchmarks

    def get_benchmark(self, benchmark_id):
        """Get benchmark information by ID."""
        self.cursor.execute(
            "SELECT * FROM benchmarks WHERE id = ?",
            (benchmark_id,)
        )
        row = self.cursor.fetchone()  # fetch once instead of calling fetchone() twice
        benchmark = dict(row) if row else None

        if benchmark:
            benchmark['metrics'] = json.loads(benchmark['metrics'])

        return benchmark

    # Model management methods
    def add_model(self, name, hf_model_id, user_id, tag, parameters=None, description=""):
        """Add a new model to the database."""
        try:
            self.cursor.execute(
                "INSERT INTO models (name, hf_model_id, user_id, tag, parameters, description) VALUES (?, ?, ?, ?, ?, ?)",
                (name, hf_model_id, user_id, tag, parameters, description)
            )
            self.conn.commit()
            return self.cursor.lastrowid
        except sqlite3.IntegrityError:
            # Model already exists for this user
            self.cursor.execute(
                "SELECT id FROM models WHERE hf_model_id = ? AND user_id = ?",
                (hf_model_id, user_id)
            )
            return self.cursor.fetchone()[0]

    def get_models(self, tag=None):
        """Get all models, optionally filtered by tag."""
        if tag:
            self.cursor.execute(
                "SELECT * FROM models WHERE tag = ?",
                (tag,)
            )
        else:
            self.cursor.execute("SELECT * FROM models")

        return [dict(row) for row in self.cursor.fetchall()]

    def get_model(self, model_id):
        """Get model information by ID."""
        self.cursor.execute(
            "SELECT * FROM models WHERE id = ?",
            (model_id,)
        )
        row = self.cursor.fetchone()  # fetch once instead of calling fetchone() twice
        return dict(row) if row else None

    # Evaluation management methods
    def add_evaluation(self, model_id, benchmark_id, priority=0):
        """Add a new evaluation to the database and queue."""
        # First, add the evaluation
        self.cursor.execute(
            "INSERT INTO evaluations (model_id, benchmark_id, status) VALUES (?, ?, 'pending')",
            (model_id, benchmark_id)
        )
        evaluation_id = self.cursor.lastrowid

        # Then, add it to the queue
        self.cursor.execute(
            "INSERT INTO queue (evaluation_id, priority) VALUES (?, ?)",
            (evaluation_id, priority)
        )

        self.conn.commit()
        return evaluation_id

    def update_evaluation_status(self, evaluation_id, status, results=None, score=None):
        """Update the status of an evaluation."""
        params = [status, evaluation_id]
        sql = "UPDATE evaluations SET status = ?"

        if results is not None:
            sql += ", results = ?"
            params.insert(1, json.dumps(results))

        if score is not None:
            sql += ", score = ?"
            params.insert(1 if results is None else 2, score)

        if status in ['completed', 'failed']:
            sql += ", completed_at = ?"
            params.insert(1 if results is None and score is None else (2 if results is None or score is None else 3),
                          datetime.now().isoformat())

        sql += " WHERE id = ?"

        self.cursor.execute(sql, params)
        self.conn.commit()

        # If completed or failed, remove from queue
        if status in ['completed', 'failed']:
            self.cursor.execute(
                "DELETE FROM queue WHERE evaluation_id = ?",
                (evaluation_id,)
            )
            self.conn.commit()

    def get_next_in_queue(self):
        """Get the next evaluation in the queue."""
        self.cursor.execute("""
        SELECT q.id as queue_id, q.evaluation_id, e.model_id, e.benchmark_id, m.hf_model_id, b.dataset_id
        FROM queue q
        JOIN evaluations e ON q.evaluation_id = e.id
        JOIN models m ON e.model_id = m.id
        JOIN benchmarks b ON e.benchmark_id = b.id
        WHERE e.status = 'pending'
        ORDER BY q.priority DESC, q.added_at ASC
        LIMIT 1
        """)

        result = self.cursor.fetchone()
        return dict(result) if result else None

    def get_evaluation_results(self, model_id=None, benchmark_id=None, tag=None):
        """Get evaluation results, optionally filtered by model, benchmark, or tag."""
        sql = """
        SELECT e.id, e.model_id, e.benchmark_id, e.status, e.results, e.score,
               e.submitted_at, e.completed_at, m.name as model_name, m.tag,
               b.name as benchmark_name
        FROM evaluations e
        JOIN models m ON e.model_id = m.id
        JOIN benchmarks b ON e.benchmark_id = b.id
        WHERE e.status = 'completed'
        """

        params = []

        if model_id:
            sql += " AND e.model_id = ?"
            params.append(model_id)

        if benchmark_id:
            sql += " AND e.benchmark_id = ?"
            params.append(benchmark_id)

        if tag:
            sql += " AND m.tag = ?"
            params.append(tag)

        sql += " ORDER BY e.completed_at DESC"

        self.cursor.execute(sql, params)
        results = [dict(row) for row in self.cursor.fetchall()]

        # Parse results JSON
        for result in results:
            if result['results']:
                result['results'] = json.loads(result['results'])

        return results

    def get_leaderboard_df(self, tag=None):
        """Get a pandas DataFrame of the leaderboard, optionally filtered by tag."""
        results = self.get_evaluation_results(tag=tag)

        if not results:
            return pd.DataFrame()

        # Create a list of dictionaries for the DataFrame
        leaderboard_data = []

        for result in results:
            entry = {
                'model_name': result['model_name'],
                'model_id': result['model_id'],
                'benchmark_name': result['benchmark_name'],
                'benchmark_id': result['benchmark_id'],
                'tag': result['tag'],
                'score': result['score'],
                'completed_at': result['completed_at']
            }

            # Add individual metrics from results
            if result['results'] and isinstance(result['results'], dict):
                for metric, value in result['results'].items():
                    if isinstance(value, (int, float)):
                        entry[f'metric_{metric}'] = value

            leaderboard_data.append(entry)

        return pd.DataFrame(leaderboard_data)

# Initialize the database
def init_db(db_path="dynamic_highscores.db"):
    """Initialize the database and return the database manager."""
    db = DynamicHighscoresDB(db_path)
    return db
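As a quick sanity check of the schema above, a minimal sketch that exercises the main methods end to end; the user, model, benchmark names, and the in-memory database path are illustrative.

    # Hypothetical smoke test for database_schema.py (all calls exist in that module).
    from database_schema import init_db

    db = init_db(":memory:")  # assumption: an in-memory SQLite database is fine for a quick check
    user_id = db.add_user("alice", "hf_alice", is_admin=False)
    benchmark_id = db.add_benchmark("HellaSwag", "hellaswag", "Commonsense NLI", {"metrics": ["accuracy"]})
    model_id = db.add_model("My Model", "alice/my-model", user_id, tag="Reasoning")

    # Queue an evaluation, then mark it completed with a score and per-metric results.
    evaluation_id = db.add_evaluation(model_id, benchmark_id, priority=1)
    db.update_evaluation_status(evaluation_id, "completed", results={"accuracy": 0.42}, score=0.42)

    print(db.get_leaderboard_df(tag="Reasoning"))
    db.close()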
evaluation_queue.py ADDED
@@ -0,0 +1,964 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Model evaluation queue system for Dynamic Highscores.
3
+
4
+ This module handles the evaluation queue, CPU-only processing,
5
+ and enforces daily submission limits for users.
6
+ """
7
+
8
+ import os
9
+ import json
10
+ import time
11
+ import threading
12
+ import queue
13
+ from datetime import datetime, timedelta
14
+ import gradio as gr
15
+ from huggingface_hub import HfApi, hf_hub_download, snapshot_download
16
+ from datasets import load_dataset
17
+ import torch
18
+ from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
19
+ import sqlite3
20
+
21
+ class EvaluationQueue:
22
+ """Manages the evaluation queue for model benchmarking."""
23
+
24
+ def __init__(self, db_manager, auth_manager):
25
+ """Initialize the evaluation queue manager.
26
+
27
+ Args:
28
+ db_manager: Database manager instance
29
+ auth_manager: Authentication manager instance
30
+ """
31
+ self.db_manager = db_manager
32
+ self.auth_manager = auth_manager
33
+ self.hf_api = HfApi()
34
+ self.queue = queue.Queue()
35
+ self.is_processing = False
36
+ self.worker_thread = None
37
+ self.model_tags = ["Merge", "Agent", "Reasoning", "Coding", "General", "Specialized", "Instruction", "Chat"]
38
+ self.current_evaluation = None
39
+ self.progress = 0
40
+ self.progress_lock = threading.Lock()
41
+ self.db_path = db_manager.db_path # Store the path to create new connections in worker thread
42
+
43
+ def start_worker(self):
44
+ """Start the worker thread for processing the evaluation queue."""
45
+ if self.worker_thread is None or not self.worker_thread.is_alive():
46
+ self.is_processing = True
47
+ self.worker_thread = threading.Thread(target=self._process_queue)
48
+ self.worker_thread.daemon = True
49
+ self.worker_thread.start()
50
+
51
+ def stop_worker(self):
52
+ """Stop the worker thread."""
53
+ self.is_processing = False
54
+ if self.worker_thread and self.worker_thread.is_alive():
55
+ self.worker_thread.join(timeout=1.0)
56
+
57
+ def _process_queue(self):
58
+ """Process the evaluation queue in a separate thread."""
59
+ # Create a new database connection for this thread
60
+ thread_db = sqlite3.connect(self.db_path)
61
+ thread_db.row_factory = sqlite3.Row
62
+
63
+ while self.is_processing:
64
+ try:
65
+ # Get the next evaluation from the database using thread-local connection
66
+ cursor = thread_db.cursor()
67
+ cursor.execute("""
68
+ SELECT e.id as evaluation_id, e.model_id, e.benchmark_id, m.hf_model_id, b.dataset_id
69
+ FROM queue q
70
+ JOIN evaluations e ON q.evaluation_id = e.id
71
+ JOIN models m ON e.model_id = m.id
72
+ JOIN benchmarks b ON e.benchmark_id = b.id
73
+ WHERE e.status = 'pending'
74
+ ORDER BY q.priority DESC, q.created_at ASC
75
+ LIMIT 1
76
+ """)
77
+ row = cursor.fetchone()
78
+
79
+ if row:
80
+ next_eval = dict(row)
81
+
82
+ # Update status to running
83
+ cursor.execute("""
84
+ UPDATE evaluations
85
+ SET status = 'running', started_at = datetime('now')
86
+ WHERE id = ?
87
+ """, (next_eval['evaluation_id'],))
88
+ thread_db.commit()
89
+
90
+ # Set current evaluation and reset progress
91
+ with self.progress_lock:
92
+ self.current_evaluation = next_eval
93
+ self.progress = 0
94
+
95
+ try:
96
+ # Run the evaluation
97
+ results = self._run_evaluation(
98
+ next_eval['hf_model_id'],
99
+ next_eval['dataset_id']
100
+ )
101
+
102
+ # Calculate overall score
103
+ score = self._calculate_overall_score(results)
104
+
105
+ # Update status to completed with results
106
+ cursor.execute("""
107
+ UPDATE evaluations
108
+ SET status = 'completed',
109
+ completed_at = datetime('now'),
110
+ results = ?,
111
+ score = ?
112
+ WHERE id = ?
113
+ """, (json.dumps(results), score, next_eval['evaluation_id']))
114
+ thread_db.commit()
115
+ except Exception as e:
116
+ print(f"Evaluation error: {e}")
117
+ # Update status to failed
118
+ cursor.execute("""
119
+ UPDATE evaluations
120
+ SET status = 'failed', completed_at = datetime('now')
121
+ WHERE id = ?
122
+ """, (next_eval['evaluation_id'],))
123
+ thread_db.commit()
124
+
125
+ # Clear current evaluation
126
+ with self.progress_lock:
127
+ self.current_evaluation = None
128
+ self.progress = 0
129
+ else:
130
+ # No evaluations in queue, sleep for a bit
131
+ time.sleep(5)
132
+ except Exception as e:
133
+ print(f"Queue processing error: {e}")
134
+ time.sleep(5)
135
+
136
+ # Close the thread-local database connection
137
+ thread_db.close()
138
+
139
+ def _run_evaluation(self, model_id, dataset_id):
140
+ """Run an evaluation for a model on a benchmark.
141
+
142
+ Args:
143
+ model_id: HuggingFace model ID
144
+ dataset_id: HuggingFace dataset ID (with optional config)
145
+
146
+ Returns:
147
+ dict: Evaluation results
148
+ """
149
+ # Update progress
150
+ with self.progress_lock:
151
+ self.progress = 5 # Starting evaluation
152
+
153
+ # Parse dataset ID and config
154
+ if ":" in dataset_id:
155
+ dataset_id, config = dataset_id.split(":", 1)
156
+ else:
157
+ config = None
158
+
159
+ # Update progress
160
+ with self.progress_lock:
161
+ self.progress = 10 # Loading dataset
162
+
163
+ # Load the dataset
164
+ if config:
165
+ dataset = load_dataset(dataset_id, config, split="test")
166
+ else:
167
+ dataset = load_dataset(dataset_id, split="test")
168
+
169
+ # Update progress
170
+ with self.progress_lock:
171
+ self.progress = 20 # Loading model
172
+
173
+ # Load the model (CPU only)
174
+ device = "cpu"
175
+ model = AutoModelForCausalLM.from_pretrained(
176
+ model_id,
177
+ device_map=device,
178
+ torch_dtype=torch.float32, # Use float32 for CPU
179
+ low_cpu_mem_usage=True
180
+ )
181
+ tokenizer = AutoTokenizer.from_pretrained(model_id)
182
+
183
+ # Update progress
184
+ with self.progress_lock:
185
+ self.progress = 30 # Determining task type
186
+
187
+ # Determine task type based on dataset features
188
+ task_type = self._determine_task_type(dataset)
189
+
190
+ # Update progress
191
+ with self.progress_lock:
192
+ self.progress = 40 # Starting evaluation
193
+
194
+ # Run appropriate evaluation based on task type
195
+ if task_type == "text-generation":
196
+ results = self._evaluate_text_generation(model, tokenizer, dataset)
197
+ elif task_type == "question-answering":
198
+ results = self._evaluate_question_answering(model, tokenizer, dataset)
199
+ elif task_type == "classification":
200
+ results = self._evaluate_classification(model, tokenizer, dataset)
201
+ elif task_type == "code-generation":
202
+ results = self._evaluate_code_generation(model, tokenizer, dataset)
203
+ else:
204
+ # Default to general evaluation
205
+ results = self._evaluate_general(model, tokenizer, dataset)
206
+
207
+ # Update progress
208
+ with self.progress_lock:
209
+ self.progress = 95 # Cleaning up
210
+
211
+ # Clean up to free memory
212
+ del model
213
+ del tokenizer
214
+ torch.cuda.empty_cache()
215
+
216
+ # Update progress
217
+ with self.progress_lock:
218
+ self.progress = 100 # Completed
219
+
220
+ return results
221
+
222
+ def get_current_progress(self):
223
+ """Get the current evaluation progress.
224
+
225
+ Returns:
226
+ tuple: (current_evaluation, progress_percentage)
227
+ """
228
+ with self.progress_lock:
229
+ return self.current_evaluation, self.progress
230
+
231
+ def _determine_task_type(self, dataset):
232
+ """Determine the task type based on dataset features.
233
+
234
+ Args:
235
+ dataset: HuggingFace dataset
236
+
237
+ Returns:
238
+ str: Task type
239
+ """
240
+ features = dataset.features
241
+
242
+ # Check for common feature patterns
243
+ if "question" in features and "answer" in features:
244
+ return "question-answering"
245
+ elif "code" in features or "solution" in features:
246
+ return "code-generation"
247
+ elif "label" in features or "class" in features:
248
+ return "classification"
249
+ elif "input" in features and "output" in features:
250
+ return "text-generation"
251
+ else:
252
+ return "general"
253
+
254
+ def _evaluate_text_generation(self, model, tokenizer, dataset):
255
+ """Evaluate a model on text generation tasks.
256
+
257
+ Args:
258
+ model: HuggingFace model
259
+ tokenizer: HuggingFace tokenizer
260
+ dataset: HuggingFace dataset
261
+
262
+ Returns:
263
+ dict: Evaluation results
264
+ """
265
+ # Set up generation pipeline
266
+ generator = pipeline(
267
+ "text-generation",
268
+ model=model,
269
+ tokenizer=tokenizer,
270
+ device="cpu"
271
+ )
272
+
273
+ # Sample a subset for evaluation (to keep runtime reasonable)
274
+ if len(dataset) > 100:
275
+ dataset = dataset.select(range(100))
276
+
277
+ # Track metrics
278
+ correct = 0
279
+ total = 0
280
+ generated_texts = []
281
+
282
+ # Process each example
283
+ for i, example in enumerate(dataset):
284
+ # Update progress based on completion percentage
285
+ with self.progress_lock:
286
+ self.progress = 40 + int((i / len(dataset)) * 50)
287
+
288
+ input_text = example.get("input", example.get("prompt", ""))
289
+ expected_output = example.get("output", example.get("target", ""))
290
+
291
+ if not input_text or not expected_output:
292
+ continue
293
+
294
+ # Generate text
295
+ generated = generator(
296
+ input_text,
297
+ max_length=100,
298
+ num_return_sequences=1
299
+ )
300
+
301
+ generated_text = generated[0]["generated_text"]
302
+ generated_texts.append(generated_text)
303
+
304
+ # Simple exact match check
305
+ if expected_output.strip() in generated_text:
306
+ correct += 1
307
+
308
+ total += 1
309
+
310
+ # Calculate metrics
311
+ accuracy = correct / total if total > 0 else 0
312
+
313
+ return {
314
+ "accuracy": accuracy,
315
+ "samples_evaluated": total,
316
+ "generated_samples": generated_texts[:5] # Include a few samples
317
+ }
318
+
319
+ def _evaluate_question_answering(self, model, tokenizer, dataset):
320
+ """Evaluate a model on question answering tasks.
321
+
322
+ Args:
323
+ model: HuggingFace model
324
+ tokenizer: HuggingFace tokenizer
325
+ dataset: HuggingFace dataset
326
+
327
+ Returns:
328
+ dict: Evaluation results
329
+ """
330
+ # Set up QA pipeline
331
+ qa_pipeline = pipeline(
332
+ "question-answering",
333
+ model=model,
334
+ tokenizer=tokenizer,
335
+ device="cpu"
336
+ )
337
+
338
+ # Sample a subset for evaluation
339
+ if len(dataset) > 100:
340
+ dataset = dataset.select(range(100))
341
+
342
+ # Track metrics
343
+ exact_matches = 0
344
+ f1_scores = []
345
+ total = 0
346
+
347
+ # Process each example
348
+ for i, example in enumerate(dataset):
349
+ # Update progress based on completion percentage
350
+ with self.progress_lock:
351
+ self.progress = 40 + int((i / len(dataset)) * 50)
352
+
353
+ question = example.get("question", "")
354
+ context = example.get("context", "")
355
+ answer = example.get("answer", "")
356
+
357
+ if not question or not answer:
358
+ continue
359
+
360
+ # Get model prediction
361
+ if context:
362
+ result = qa_pipeline(question=question, context=context)
363
+ else:
364
+ # If no context provided, use the question as context
365
+ result = qa_pipeline(question=question, context=question)
366
+
367
+ predicted_answer = result["answer"]
368
+
369
+ # Calculate exact match
370
+ if predicted_answer.strip() == answer.strip():
371
+ exact_matches += 1
372
+
373
+ # Calculate F1 score
374
+ f1 = self._calculate_f1(answer, predicted_answer)
375
+ f1_scores.append(f1)
376
+
377
+ total += 1
378
+
379
+ # Calculate metrics
380
+ exact_match_accuracy = exact_matches / total if total > 0 else 0
381
+ avg_f1 = sum(f1_scores) / len(f1_scores) if f1_scores else 0
382
+
383
+ return {
384
+ "exact_match": exact_match_accuracy,
385
+ "f1": avg_f1,
386
+ "samples_evaluated": total
387
+ }
388
+
389
+ def _evaluate_classification(self, model, tokenizer, dataset):
390
+ """Evaluate a model on classification tasks.
391
+
392
+ Args:
393
+ model: HuggingFace model
394
+ tokenizer: HuggingFace tokenizer
395
+ dataset: HuggingFace dataset
396
+
397
+ Returns:
398
+ dict: Evaluation results
399
+ """
400
+ # Set up classification pipeline
401
+ classifier = pipeline(
402
+ "text-classification",
403
+ model=model,
404
+ tokenizer=tokenizer,
405
+ device="cpu"
406
+ )
407
+
408
+ # Sample a subset for evaluation
409
+ if len(dataset) > 100:
410
+ dataset = dataset.select(range(100))
411
+
412
+ # Track metrics
413
+ correct = 0
414
+ total = 0
415
+
416
+ # Process each example
417
+ for i, example in enumerate(dataset):
418
+ # Update progress based on completion percentage
419
+ with self.progress_lock:
420
+ self.progress = 40 + int((i / len(dataset)) * 50)
421
+
422
+ text = example.get("text", example.get("sentence", ""))
423
+ label = str(example.get("label", example.get("class", "")))
424
+
425
+ if not text or not label:
426
+ continue
427
+
428
+ # Get model prediction
429
+ result = classifier(text)
430
+ predicted_label = result[0]["label"]
431
+
432
+ # Check if correct (pipeline labels such as "LABEL_0" must use the same format as the dataset labels)
433
+ if str(predicted_label) == label:
434
+ correct += 1
435
+
436
+ total += 1
437
+
438
+ # Calculate metrics
439
+ accuracy = correct / total if total > 0 else 0
440
+
441
+ return {
442
+ "accuracy": accuracy,
443
+ "samples_evaluated": total
444
+ }
445
+
446
+ def _evaluate_code_generation(self, model, tokenizer, dataset):
447
+ """Evaluate a model on code generation tasks.
448
+
449
+ Args:
450
+ model: HuggingFace model
451
+ tokenizer: HuggingFace tokenizer
452
+ dataset: HuggingFace dataset
453
+
454
+ Returns:
455
+ dict: Evaluation results
456
+ """
457
+ # Set up generation pipeline
458
+ generator = pipeline(
459
+ "text-generation",
460
+ model=model,
461
+ tokenizer=tokenizer,
462
+ device="cpu"
463
+ )
464
+
465
+ # Sample a subset for evaluation
466
+ if len(dataset) > 50: # Smaller sample for code tasks
467
+ dataset = dataset.select(range(50))
468
+
469
+ # Track metrics
470
+ exact_matches = 0
471
+ functional_matches = 0
472
+ total = 0
473
+
474
+ # Process each example
475
+ for i, example in enumerate(dataset):
476
+ # Update progress based on completion percentage
477
+ with self.progress_lock:
478
+ self.progress = 40 + int((i / len(dataset)) * 50)
479
+
480
+ prompt = example.get("prompt", example.get("input", ""))
481
+ solution = example.get("solution", example.get("output", ""))
482
+
483
+ if not prompt or not solution:
484
+ continue
485
+
486
+ # Generate code
487
+ generated = generator(
488
+ prompt,
489
+ max_length=200,
490
+ num_return_sequences=1
491
+ )
492
+
493
+ generated_code = generated[0]["generated_text"]
494
+
495
+ # Extract code from generated text (remove prompt)
496
+ if prompt in generated_code:
497
+ generated_code = generated_code[len(prompt):].strip()
498
+
499
+ # Check exact match
500
+ if generated_code.strip() == solution.strip():
501
+ exact_matches += 1
502
+ functional_matches += 1
503
+ else:
504
+ # We would ideally check functional correctness here
505
+ # but that requires executing code which is complex and potentially unsafe
506
+ # For now, we'll use a simple heuristic
507
+ if len(generated_code) > 0 and any(keyword in generated_code for keyword in ["def ", "function", "return", "class"]):
508
+ functional_matches += 0.5 # Partial credit
509
+
510
+ total += 1
511
+
512
+ # Calculate metrics
513
+ exact_match_rate = exact_matches / total if total > 0 else 0
514
+ functional_correctness = functional_matches / total if total > 0 else 0
515
+
516
+ return {
517
+ "exact_match": exact_match_rate,
518
+ "functional_correctness": functional_correctness,
519
+ "samples_evaluated": total
520
+ }
521
+
522
+ def _evaluate_general(self, model, tokenizer, dataset):
523
+ """General evaluation for any dataset type.
524
+
525
+ Args:
526
+ model: HuggingFace model
527
+ tokenizer: HuggingFace tokenizer
528
+ dataset: HuggingFace dataset
529
+
530
+ Returns:
531
+ dict: Evaluation results
532
+ """
533
+ # Set up generation pipeline
534
+ generator = pipeline(
535
+ "text-generation",
536
+ model=model,
537
+ tokenizer=tokenizer,
538
+ device="cpu"
539
+ )
540
+
541
+ # Sample a subset for evaluation
542
+ if len(dataset) > 50:
543
+ dataset = dataset.select(range(50))
544
+
545
+ # Find input and output fields
546
+ features = dataset.features
547
+ input_field = None
548
+ output_field = None
549
+
550
+ for field in features:
551
+ if field.lower() in ["input", "prompt", "question", "text"]:
552
+ input_field = field
553
+ elif field.lower() in ["output", "target", "answer", "response"]:
554
+ output_field = field
555
+
556
+ if not input_field:
557
+ # Just use the first string field as input
558
+ for field in features:
559
+ if getattr(features[field], "dtype", None) == "string":  # e.g. datasets.Value("string")
560
+ input_field = field
561
+ break
562
+
563
+ # Track metrics
564
+ total = 0
565
+ generated_texts = []
566
+
567
+ # Process each example
568
+ for i, example in enumerate(dataset):
569
+ # Update progress based on completion percentage
570
+ with self.progress_lock:
571
+ self.progress = 40 + int((i / len(dataset)) * 50)
572
+
573
+ if input_field and input_field in example:
574
+ input_text = str(example[input_field])
575
+
576
+ # Generate text
577
+ generated = generator(
578
+ input_text,
579
+ max_length=100,
580
+ num_return_sequences=1
581
+ )
582
+
583
+ generated_text = generated[0]["generated_text"]
584
+ generated_texts.append({
585
+ "input": input_text,
586
+ "output": generated_text,
587
+ "expected": str(example[output_field]) if output_field and output_field in example else "N/A"
588
+ })
589
+
590
+ total += 1
591
+
592
+ return {
593
+ "samples_evaluated": total,
594
+ "generated_samples": generated_texts[:5] # Include a few samples
595
+ }
596
+
597
+ def _calculate_f1(self, answer, prediction):
598
+ """Calculate F1 score between answer and prediction.
599
+
600
+ Args:
601
+ answer: Ground truth answer
602
+ prediction: Model prediction
603
+
604
+ Returns:
605
+ float: F1 score
606
+ """
607
+ # Tokenize
608
+ answer_tokens = answer.lower().split()
609
+ prediction_tokens = prediction.lower().split()
610
+
611
+ # Calculate precision and recall
612
+ common_tokens = set(answer_tokens) & set(prediction_tokens)
613
+
614
+ if not common_tokens:
615
+ return 0.0
616
+
617
+ precision = len(common_tokens) / len(prediction_tokens)
618
+ recall = len(common_tokens) / len(answer_tokens)
619
+
620
+ # Calculate F1
621
+ if precision + recall == 0:
622
+ return 0.0
623
+
624
+ f1 = 2 * precision * recall / (precision + recall)
625
+ return f1
626
+
627
+ def _calculate_overall_score(self, results):
628
+ """Calculate an overall score from evaluation results.
629
+
630
+ Args:
631
+ results: Evaluation results dictionary
632
+
633
+ Returns:
634
+ float: Overall score between 0 and 100
635
+ """
636
+ score = 0.0
637
+
638
+ # Check for common metrics and weight them
639
+ if "accuracy" in results:
640
+ score += results["accuracy"] * 100
641
+
642
+ if "exact_match" in results:
643
+ score += results["exact_match"] * 100
644
+
645
+ if "f1" in results:
646
+ score += results["f1"] * 100
647
+
648
+ if "functional_correctness" in results:
649
+ score += results["functional_correctness"] * 100
650
+
651
+ # If multiple metrics were found, average them
652
+ num_metrics = sum(1 for metric in ["accuracy", "exact_match", "f1", "functional_correctness"] if metric in results)
653
+
654
+ if num_metrics > 0:
655
+ score /= num_metrics
656
+ else:
657
+ # Default score if no metrics available
658
+ score = 50.0
659
+
660
+ return score
661
+
662
+ def submit_evaluation(self, model_id, benchmark_id, user_id, priority=0):
663
+ """Submit a model for evaluation on a benchmark.
664
+
665
+ Args:
666
+ model_id: Model ID in the database
667
+ benchmark_id: Benchmark ID in the database
668
+ user_id: User ID submitting the evaluation
669
+ priority: Queue priority (higher = higher priority)
670
+
671
+ Returns:
672
+ tuple: (evaluation_id or None, status message string)
673
+ """
674
+ # Check if user can submit today
675
+ if not self.auth_manager.can_submit_benchmark(user_id):
676
+ return None, "Daily submission limit reached. Try again tomorrow."
677
+
678
+ try:
679
+ # Add evaluation to database and queue
680
+ evaluation_id = self.db_manager.add_evaluation(
681
+ model_id=model_id,
682
+ benchmark_id=benchmark_id,
683
+ priority=priority
684
+ )
685
+
686
+ # Update user's last submission date
687
+ self.auth_manager.update_submission_date(user_id)
688
+
689
+ # Make sure worker is running
690
+ self.start_worker()
691
+
692
+ return evaluation_id, "Evaluation submitted successfully."
693
+ except Exception as e:
694
+ print(f"Submit evaluation error: {e}")
695
+ return None, f"Failed to submit evaluation: {str(e)}"
696
+
697
+ def get_queue_status(self):
698
+ """Get the current status of the evaluation queue.
699
+
700
+ Returns:
701
+ dict: Queue status information
702
+ """
703
+ try:
704
+ # Get evaluations from database
705
+ pending_evals = self.db_manager.get_evaluation_results(status="pending")
706
+ running_evals = self.db_manager.get_evaluation_results(status="running")
707
+ completed_evals = self.db_manager.get_evaluation_results(status="completed")
708
+ failed_evals = self.db_manager.get_evaluation_results(status="failed")
709
+
710
+ # Get current evaluation progress
711
+ current_eval, progress = self.get_current_progress()
712
+
713
+ return {
714
+ "pending": len(pending_evals),
715
+ "running": len(running_evals),
716
+ "completed": len(completed_evals),
717
+ "failed": len(failed_evals),
718
+ "is_processing": self.is_processing,
719
+ "current_evaluation": current_eval,
720
+ "progress": progress
721
+ }
722
+ except Exception as e:
723
+ print(f"Queue status error: {e}")
724
+ return {
725
+ "pending": 0,
726
+ "running": 0,
727
+ "completed": 0,
728
+ "failed": 0,
729
+ "is_processing": self.is_processing,
730
+ "current_evaluation": None,
731
+ "progress": 0,
732
+ "error": str(e)
733
+ }
734
+
735
+ # Model submission UI components
736
+ def create_model_submission_ui(evaluation_queue, auth_manager, db_manager):
737
+ """Create the model submission UI components.
738
+
739
+ Args:
740
+ evaluation_queue: Evaluation queue instance
741
+ auth_manager: Authentication manager instance
742
+ db_manager: Database manager instance
743
+
744
+ Returns:
745
+ gr.Blocks: Gradio Blocks component with model submission UI
746
+ """
747
+ with gr.Blocks() as submission_ui:
748
+ with gr.Tab("Submit Model"):
749
+ with gr.Row():
750
+ with gr.Column(scale=2):
751
+ model_id_input = gr.Textbox(
752
+ placeholder="HuggingFace model ID (e.g., 'gpt2', 'facebook/opt-350m')",
753
+ label="Model ID"
754
+ )
755
+
756
+ model_name_input = gr.Textbox(
757
+ placeholder="Display name for your model",
758
+ label="Model Name"
759
+ )
760
+
761
+ model_description_input = gr.Textbox(
762
+ placeholder="Brief description of your model",
763
+ label="Description",
764
+ lines=3
765
+ )
766
+
767
+ model_parameters_input = gr.Number(
768
+ label="Number of Parameters (billions)",
769
+ precision=2
770
+ )
771
+
772
+ with gr.Column(scale=1):
773
+ model_tag_input = gr.Dropdown(
774
+ choices=evaluation_queue.model_tags,
775
+ label="Model Tag",
776
+ info="Select one category that best describes your model"
777
+ )
778
+
779
+ benchmark_dropdown = gr.Dropdown(
780
+ label="Benchmark",
781
+ info="Select a benchmark to evaluate your model on"
782
+ )
783
+
784
+ refresh_benchmarks_button = gr.Button("Refresh Benchmarks")
785
+
786
+ submit_model_button = gr.Button("Submit for Evaluation")
787
+ submission_status = gr.Markdown("")
788
+
789
+ with gr.Tab("Evaluation Queue"):
790
+ refresh_queue_button = gr.Button("Refresh Queue", elem_id="refresh_queue_button")
791
+
792
+ with gr.Row():
793
+ with gr.Column(scale=1):
794
+ queue_stats = gr.JSON(
795
+ label="Queue Statistics"
796
+ )
797
+
798
+ with gr.Column(scale=2):
799
+ queue_status = gr.Dataframe(
800
+ headers=["ID", "Model", "Benchmark", "Status", "Submitted"],
801
+ label="Recent Evaluations"
802
+ )
803
+
804
+ with gr.Row(visible=True) as progress_container:
805
+ with gr.Column():
806
+ current_eval_info = gr.Markdown("No evaluation currently running")
807
+ # Use a simple text display for progress instead of Progress component
808
+ progress_display = gr.Markdown("Progress: 0%")
809
+
810
+ # Function to update progress display
811
+ def update_progress_display():
812
+ current_eval, progress = evaluation_queue.get_current_progress()
813
+
814
+ if current_eval:
815
+ model_info = db_manager.get_model(current_eval['model_id'])
816
+ benchmark_info = db_manager.get_benchmark(current_eval['benchmark_id'])
817
+
818
+ if model_info and benchmark_info:
819
+ eval_info = f"**Currently Evaluating:** {model_info['name']} on {benchmark_info['name']}"
820
+ progress_text = f"Progress: {progress}%"
821
+ return eval_info, progress_text
822
+
823
+ return "No evaluation currently running", "Progress: 0%"
824
+
825
+ # Event handlers
826
+ def refresh_benchmarks_handler():
827
+ benchmarks = db_manager.get_benchmarks()
828
+
829
+ # Format for dropdown
830
+ choices = [(b["id"], b["name"]) for b in benchmarks]
831
+
832
+ return gr.update(choices=choices)
833
+
834
+ def submit_model_handler(model_id, model_name, model_description, model_parameters, model_tag, benchmark_id, request: gr.Request):
835
+ # Check if user is logged in
836
+ user = auth_manager.check_login(request)
837
+
838
+ if not user:
839
+ return "Please log in to submit a model."
840
+
841
+ if not model_id or not model_name or not model_tag or not benchmark_id:
842
+ return "Please fill in all required fields."
843
+
844
+ try:
845
+ # Add model to database
846
+ model_db_id = db_manager.add_model(
847
+ name=model_name,
848
+ hf_model_id=model_id,
849
+ user_id=user["id"],
850
+ tag=model_tag,
851
+ parameters=str(model_parameters) if model_parameters else None,
852
+ description=model_description
853
+ )
854
+
855
+ if not model_db_id:
856
+ return "Failed to add model to database."
857
+
858
+ # Submit for evaluation
859
+ eval_id, message = evaluation_queue.submit_evaluation(
860
+ model_id=model_db_id,
861
+ benchmark_id=benchmark_id,
862
+ user_id=user["id"]
863
+ )
864
+
865
+ if eval_id:
866
+ return f"Model submitted successfully. Evaluation ID: {eval_id}"
867
+ else:
868
+ return message
869
+ except Exception as e:
870
+ return f"Error submitting model: {str(e)}"
871
+
872
+ def refresh_queue_handler():
873
+ # Get queue statistics
874
+ stats = evaluation_queue.get_queue_status()
875
+
876
+ # Get recent evaluations
877
+ evals = db_manager.get_evaluation_results(limit=20)
878
+
879
+ # Format for dataframe
880
+ eval_data = []
881
+ for eval in evals:
882
+ eval_data.append([
883
+ eval["id"],
884
+ eval["model_name"],
885
+ eval["benchmark_name"],
886
+ eval["status"],
887
+ eval["submitted_at"]
888
+ ])
889
+
890
+ # Also update progress display
891
+ current_eval, progress = evaluation_queue.get_current_progress()
892
+ if current_eval:
893
+ model_info = db_manager.get_model(current_eval['model_id'])
894
+ benchmark_info = db_manager.get_benchmark(current_eval['benchmark_id'])
895
+
896
+ if model_info and benchmark_info:
897
+ eval_info = f"**Currently Evaluating:** {model_info['name']} on {benchmark_info['name']}"
898
+ progress_text = f"Progress: {progress}%"
899
+ return stats, eval_data, eval_info, progress_text
900
+
901
+ return stats, eval_data, "No evaluation currently running", "Progress: 0%"
902
+
903
+ # Connect event handlers
904
+ refresh_benchmarks_button.click(
905
+ fn=refresh_benchmarks_handler,
906
+ inputs=[],
907
+ outputs=[benchmark_dropdown]
908
+ )
909
+
910
+ submit_model_button.click(
911
+ fn=submit_model_handler,
912
+ inputs=[
913
+ model_id_input,
914
+ model_name_input,
915
+ model_description_input,
916
+ model_parameters_input,
917
+ model_tag_input,
918
+ benchmark_dropdown
919
+ ],
920
+ outputs=[submission_status]
921
+ )
922
+
923
+ refresh_queue_button.click(
924
+ fn=refresh_queue_handler,
925
+ inputs=[],
926
+ outputs=[queue_stats, queue_status, current_eval_info, progress_display]
927
+ )
928
+
929
+ # Initialize on load
930
+ submission_ui.load(
931
+ fn=refresh_benchmarks_handler,
932
+ inputs=[],
933
+ outputs=[benchmark_dropdown]
934
+ )
935
+
936
+ submission_ui.load(
937
+ fn=refresh_queue_handler,
938
+ inputs=[],
939
+ outputs=[queue_stats, queue_status, current_eval_info, progress_display]
940
+ )
941
+
942
+ # Set up auto-refresh for queue status
943
+ refresh_interval = 5 # seconds
944
+
945
+ # Create JavaScript for auto-refresh
946
+ js = f"""
947
+ function setupAutoRefresh() {{
948
+ setInterval(function() {{
949
+ document.getElementById("{refresh_queue_button.elem_id}").click();
950
+ }}, {refresh_interval * 1000});
951
+ }}
952
+
953
+ if (window.setup_done) {{
954
+ // Do nothing if already set up
955
+ }} else {{
956
+ setupAutoRefresh();
957
+ window.setup_done = true;
958
+ }}
959
+ """
960
+
961
+ # Add JavaScript to page
962
+ submission_ui.load(None, None, None, js=js)  # "js" is the keyword argument in Gradio 4.x
963
+
964
+ return submission_ui
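
The scoring helpers above are easy to sanity-check in isolation. Below is a minimal sketch of the token-overlap F1 used by `_calculate_f1`, plus the metric averaging behind `_calculate_overall_score`; the standalone function name is illustrative, not part of the module.

```python
def token_f1(answer: str, prediction: str) -> float:
    """Token-overlap F1, mirroring _calculate_f1 above."""
    answer_tokens = answer.lower().split()
    prediction_tokens = prediction.lower().split()
    common = set(answer_tokens) & set(prediction_tokens)
    if not common:
        return 0.0
    precision = len(common) / len(prediction_tokens)
    recall = len(common) / len(answer_tokens)
    return 2 * precision * recall / (precision + recall)

# "paris is the capital" vs "the capital is paris france":
# precision = 4/5, recall = 4/4, F1 = 2*(0.8*1.0)/(0.8+1.0) ≈ 0.89
print(round(token_f1("paris is the capital", "the capital is paris france"), 2))

# Overall score is the mean of whichever metrics are present, scaled to 0-100:
# {"exact_match": 0.6, "f1": 0.7} -> (60 + 70) / 2 = 65.0
```

Note that the denominators use the full token lists, so repeated tokens in a long generation lower precision.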
leaderboard.py ADDED
@@ -0,0 +1,381 @@
1
+ """
2
+ Leaderboard module for Dynamic Highscores system.
3
+
4
+ This module implements the unified leaderboard with tag-based filtering
5
+ for displaying all evaluated models.
6
+ """
7
+
8
+ import os
9
+ import json
10
+ import pandas as pd
11
+ import gradio as gr
12
+ import plotly.express as px
13
+ import plotly.graph_objects as go
14
+
15
+ class Leaderboard:
16
+ """Manages the unified leaderboard with filtering capabilities."""
17
+
18
+ def __init__(self, db_manager):
19
+ """Initialize the leaderboard manager.
20
+
21
+ Args:
22
+ db_manager: Database manager instance
23
+ """
24
+ self.db_manager = db_manager
25
+ self.model_tags = ["All", "Merge", "Agent", "Reasoning", "Coding", "General", "Specialized", "Instruction", "Chat"]
26
+
27
+ # Define color scheme for tags
28
+ self.tag_colors = {
29
+ "Merge": "#FF6B6B",
30
+ "Agent": "#4ECDC4",
31
+ "Reasoning": "#FFD166",
32
+ "Coding": "#6B5B95",
33
+ "General": "#88D8B0",
34
+ "Specialized": "#FF8C42",
35
+ "Instruction": "#5D9CEC",
36
+ "Chat": "#AC92EB"
37
+ }
38
+
39
+ def get_leaderboard_data(self, tag=None, benchmark_id=None):
40
+ """Get leaderboard data, optionally filtered by tag or benchmark.
41
+
42
+ Args:
43
+ tag: Model tag to filter by (None for all)
44
+ benchmark_id: Benchmark ID to filter by (None for all)
45
+
46
+ Returns:
47
+ pd.DataFrame: Leaderboard data
48
+ """
49
+ # Get evaluation results from database
50
+ if tag and tag != "All":
51
+ df = self.db_manager.get_leaderboard_df(tag=tag)
52
+ else:
53
+ df = self.db_manager.get_leaderboard_df()
54
+
55
+ # Filter by benchmark if specified
56
+ if benchmark_id and not df.empty:
57
+ df = df[df['benchmark_id'] == benchmark_id]
58
+
59
+ return df
60
+
61
+ def format_leaderboard_for_display(self, df):
62
+ """Format leaderboard data for display.
63
+
64
+ Args:
65
+ df: Leaderboard DataFrame
66
+
67
+ Returns:
68
+ pd.DataFrame: Formatted leaderboard for display
69
+ """
70
+ if df.empty:
71
+ return pd.DataFrame()
72
+
73
+ # Select and rename columns for display
74
+ display_df = df[['model_name', 'benchmark_name', 'tag', 'score', 'completed_at']].copy()
75
+ display_df.columns = ['Model', 'Benchmark', 'Tag', 'Score', 'Completed']
76
+
77
+ # Round score to 2 decimal places
78
+ display_df['Score'] = display_df['Score'].round(2)
79
+
80
+ # Sort by score (descending)
81
+ display_df = display_df.sort_values('Score', ascending=False)
82
+
83
+ return display_df
84
+
85
+ def create_performance_chart(self, df, chart_type="bar"):
86
+ """Create a performance chart from leaderboard data.
87
+
88
+ Args:
89
+ df: Leaderboard DataFrame
90
+ chart_type: Type of chart to create ("bar" or "scatter")
91
+
92
+ Returns:
93
+ plotly.graph_objects.Figure: Performance chart
94
+ """
95
+ if df.empty:
96
+ # Return empty figure
97
+ fig = go.Figure()
98
+ fig.update_layout(
99
+ title="No data available",
100
+ xaxis_title="Model",
101
+ yaxis_title="Score"
102
+ )
103
+ return fig
104
+
105
+ # Prepare data for visualization
106
+ plot_df = df[['model_name', 'benchmark_name', 'tag', 'score']].copy()
107
+ plot_df.columns = ['Model', 'Benchmark', 'Tag', 'Score']
108
+
109
+ # Create chart based on type
110
+ if chart_type == "scatter":
111
+ fig = px.scatter(
112
+ plot_df,
113
+ x="Model",
114
+ y="Score",
115
+ color="Tag",
116
+ symbol="Benchmark",
117
+ size="Score",
118
+ hover_data=["Model", "Benchmark", "Score"],
119
+ color_discrete_map=self.tag_colors
120
+ )
121
+ else: # Default to bar chart
122
+ fig = px.bar(
123
+ plot_df,
124
+ x="Model",
125
+ y="Score",
126
+ color="Tag",
127
+ barmode="group",
128
+ hover_data=["Model", "Benchmark", "Score"],
129
+ color_discrete_map=self.tag_colors
130
+ )
131
+
132
+ # Customize layout
133
+ fig.update_layout(
134
+ title="Model Performance Comparison",
135
+ xaxis_title="Model",
136
+ yaxis_title="Score",
137
+ legend_title="Tag",
138
+ font=dict(size=12)
139
+ )
140
+
141
+ return fig
142
+
143
+ def create_tag_distribution_chart(self, df):
144
+ """Create a chart showing distribution of models by tag.
145
+
146
+ Args:
147
+ df: Leaderboard DataFrame
148
+
149
+ Returns:
150
+ plotly.graph_objects.Figure: Tag distribution chart
151
+ """
152
+ if df.empty:
153
+ # Return empty figure
154
+ fig = go.Figure()
155
+ fig.update_layout(
156
+ title="No data available",
157
+ xaxis_title="Tag",
158
+ yaxis_title="Count"
159
+ )
160
+ return fig
161
+
162
+ # Count models by tag
163
+ tag_counts = df['tag'].value_counts().reset_index()
164
+ tag_counts.columns = ['Tag', 'Count']
165
+
166
+ # Create pie chart
167
+ fig = px.pie(
168
+ tag_counts,
169
+ names='Tag',
170
+ values='Count',
171
+ title='Model Distribution by Tag',
172
+ color='Tag',
173
+ color_discrete_map=self.tag_colors
174
+ )
175
+
176
+ # Customize layout
177
+ fig.update_layout(
178
+ font=dict(size=12)
179
+ )
180
+
181
+ return fig
182
+
183
+ def create_benchmark_comparison_chart(self, df):
184
+ """Create a chart comparing performance across benchmarks.
185
+
186
+ Args:
187
+ df: Leaderboard DataFrame
188
+
189
+ Returns:
190
+ plotly.graph_objects.Figure: Benchmark comparison chart
191
+ """
192
+ if df.empty:
193
+ # Return empty figure
194
+ fig = go.Figure()
195
+ fig.update_layout(
196
+ title="No data available",
197
+ xaxis_title="Benchmark",
198
+ yaxis_title="Average Score"
199
+ )
200
+ return fig
201
+
202
+ # Calculate average score by benchmark
203
+ benchmark_avg = df.groupby('benchmark_name')['score'].mean().reset_index()
204
+ benchmark_avg.columns = ['Benchmark', 'Average Score']
205
+
206
+ # Create bar chart
207
+ fig = px.bar(
208
+ benchmark_avg,
209
+ x='Benchmark',
210
+ y='Average Score',
211
+ title='Average Performance by Benchmark',
212
+ color='Benchmark'
213
+ )
214
+
215
+ # Customize layout
216
+ fig.update_layout(
217
+ xaxis_title="Benchmark",
218
+ yaxis_title="Average Score",
219
+ font=dict(size=12)
220
+ )
221
+
222
+ return fig
223
+
224
+ # Leaderboard UI components
225
+ def create_leaderboard_ui(leaderboard, db_manager):
226
+ """Create the leaderboard UI components.
227
+
228
+ Args:
229
+ leaderboard: Leaderboard instance
230
+ db_manager: Database manager instance
231
+
232
+ Returns:
233
+ gr.Blocks: Gradio Blocks component with leaderboard UI
234
+ """
235
+ with gr.Blocks() as leaderboard_ui:
236
+ gr.Markdown("# Dynamic Highscores Leaderboard")
237
+
238
+ with gr.Row():
239
+ with gr.Column(scale=1):
240
+ tag_filter = gr.Dropdown(
241
+ choices=leaderboard.model_tags,
242
+ value="All",
243
+ label="Filter by Tag"
244
+ )
245
+
246
+ benchmark_filter = gr.Dropdown(
247
+ choices=[("all", "All Benchmarks")],
248
+ value="all",
249
+ label="Filter by Benchmark"
250
+ )
251
+
252
+ refresh_button = gr.Button("Refresh Leaderboard")
253
+
254
+ with gr.Column(scale=2):
255
+ chart_type = gr.Radio(
256
+ choices=["bar", "scatter"],
257
+ value="bar",
258
+ label="Chart Type"
259
+ )
260
+
261
+ view_type = gr.Radio(
262
+ choices=["Table", "Chart", "Dashboard"],
263
+ value="Table",
264
+ label="View Type"
265
+ )
266
+
267
+ # Table view
268
+ leaderboard_table = gr.Dataframe(
269
+ headers=["Model", "Benchmark", "Tag", "Score", "Completed"],
270
+ label="Leaderboard",
271
+ visible=True
272
+ )
273
+
274
+ # Chart view
275
+ with gr.Row(visible=False) as chart_view:
276
+ performance_chart = gr.Plot(label="Performance Chart")
277
+
278
+ # Dashboard view
279
+ with gr.Row(visible=False) as dashboard_view:
280
+ with gr.Column(scale=2):
281
+ dashboard_performance_chart = gr.Plot(label="Performance Comparison")
282
+
283
+ with gr.Column(scale=1):
284
+ with gr.Row():
285
+ tag_distribution_chart = gr.Plot(label="Model Distribution")
286
+
287
+ with gr.Row():
288
+ benchmark_comparison_chart = gr.Plot(label="Benchmark Comparison")
289
+
290
+ # Event handlers
291
+ def refresh_benchmarks():
292
+ benchmarks = db_manager.get_benchmarks()
293
+
294
+ # Format for dropdown
295
+ choices = [("all", "All Benchmarks")]
296
+ choices.extend([(str(b["id"]), b["name"]) for b in benchmarks])
297
+
298
+ return gr.update(choices=choices)
299
+
300
+ def update_leaderboard(tag, benchmark_id, chart_type_val, view_type_val):
301
+ # Get leaderboard data
302
+ if benchmark_id == "all":
303
+ benchmark_id = None
304
+ else:
305
+ benchmark_id = int(benchmark_id)
306
+
307
+ df = leaderboard.get_leaderboard_data(tag=tag, benchmark_id=benchmark_id)
308
+
309
+ # Format for display
310
+ display_df = leaderboard.format_leaderboard_for_display(df)
311
+
312
+ # Create charts
313
+ perf_chart = leaderboard.create_performance_chart(df, chart_type=chart_type_val)
314
+ tag_chart = leaderboard.create_tag_distribution_chart(df)
315
+ benchmark_chart = leaderboard.create_benchmark_comparison_chart(df)
316
+
317
+ # Update visibility based on view type
318
+ table_visible = view_type_val == "Table"
319
+ chart_visible = view_type_val == "Chart"
320
+ dashboard_visible = view_type_val == "Dashboard"
321
+
322
+ return (
323
+ display_df,
324
+ perf_chart,
325
+ perf_chart, # Same chart for both views
326
+ tag_chart,
327
+ benchmark_chart,
328
+ gr.update(visible=table_visible),
329
+ gr.update(visible=chart_visible),
330
+ gr.update(visible=dashboard_visible)
331
+ )
332
+
333
+ # Connect event handlers
334
+ refresh_button.click(
335
+ fn=lambda tag, benchmark, chart_t, view_t: update_leaderboard(tag, benchmark, chart_t, view_t),
336
+ inputs=[tag_filter, benchmark_filter, chart_type, view_type],
337
+ outputs=[
338
+ leaderboard_table,
339
+ performance_chart,
340
+ dashboard_performance_chart,
341
+ tag_distribution_chart,
342
+ benchmark_comparison_chart,
343
+ leaderboard_table,
344
+ chart_view,
345
+ dashboard_view
346
+ ]
347
+ )
348
+
349
+ view_type.change(
350
+ fn=lambda view_t: (
351
+ gr.update(visible=view_t == "Table"),
352
+ gr.update(visible=view_t == "Chart"),
353
+ gr.update(visible=view_t == "Dashboard")
354
+ ),
355
+ inputs=[view_type],
356
+ outputs=[leaderboard_table, chart_view, dashboard_view]
357
+ )
358
+
359
+ # Initialize on load
360
+ leaderboard_ui.load(
361
+ fn=refresh_benchmarks,
362
+ inputs=[],
363
+ outputs=[benchmark_filter]
364
+ )
365
+
366
+ leaderboard_ui.load(
367
+ fn=lambda: update_leaderboard("All", "all", "bar", "Table"),
368
+ inputs=[],
369
+ outputs=[
370
+ leaderboard_table,
371
+ performance_chart,
372
+ dashboard_performance_chart,
373
+ tag_distribution_chart,
374
+ benchmark_comparison_chart,
375
+ leaderboard_table,
376
+ chart_view,
377
+ dashboard_view
378
+ ]
379
+ )
380
+
381
+ return leaderboard_ui
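
For context, the `Leaderboard` helpers only assume a pandas DataFrame with the columns referenced above (`model_name`, `benchmark_name`, `tag`, `score`, `completed_at`, plus `benchmark_id` for filtering). A small sketch with toy rows and no database attached shows the display formatting they produce; the values here are made up for illustration.

```python
import pandas as pd

# Toy frame matching the columns Leaderboard expects from get_leaderboard_df().
toy = pd.DataFrame([
    {"model_name": "Model A", "benchmark_name": "MMLU", "tag": "Reasoning",
     "score": 71.3, "completed_at": "2025-03-01", "benchmark_id": 1},
    {"model_name": "Model B", "benchmark_name": "MMLU", "tag": "Coding",
     "score": 64.8, "completed_at": "2025-03-02", "benchmark_id": 1},
])

# Same steps as format_leaderboard_for_display: select, rename, round, sort.
display = toy[["model_name", "benchmark_name", "tag", "score", "completed_at"]].copy()
display.columns = ["Model", "Benchmark", "Tag", "Score", "Completed"]
display["Score"] = display["Score"].round(2)
print(display.sort_values("Score", ascending=False))
```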
requirements.txt ADDED
@@ -0,0 +1,12 @@
1
+ gradio>=4.4.0
2
+ huggingface-hub>=0.27.1
3
+ datasets>=2.14.5
4
+ transformers>=4.35.2
5
+ torch>=2.0.0
6
+ pandas>=2.0.0
7
+ numpy>=1.24.2
8
+ plotly>=5.13.0
9
+ APScheduler>=3.10.1
10
+ tqdm>=4.65.0
11
+ requests>=2.28.2
12
+ python-dateutil>=2.8.2
sample_benchmarks.py ADDED
@@ -0,0 +1,66 @@
1
+ """
2
+ Sample benchmarks initialization for Dynamic Highscores system.
3
+
4
+ This script adds sample benchmarks to the database to provide initial options for users.
5
+ """
6
+
7
+ from database_schema import init_db
8
+
9
+ def add_sample_benchmarks():
10
+ """Add sample benchmarks to the database."""
11
+ # Initialize database
12
+ db = init_db()
13
+
14
+ # Sample benchmarks to add
15
+ sample_benchmarks = [
16
+ {
17
+ "name": "MMLU (Massive Multitask Language Understanding)",
18
+ "dataset_id": "cais/mmlu",
19
+ "description": "A benchmark for measuring massive multitask language understanding across 57 tasks including elementary mathematics, US history, computer science, law, and more.",
20
+ "metrics": {"accuracy": 1.0, "consistency": 1.0}
21
+ },
22
+ {
23
+ "name": "HumanEval (Code Generation)",
24
+ "dataset_id": "openai/humaneval",
25
+ "description": "A benchmark for evaluating language models on code generation tasks. It consists of 164 programming problems with unit tests.",
26
+ "metrics": {"pass@1": 1.0, "functional_correctness": 1.0}
27
+ },
28
+ {
29
+ "name": "HellaSwag (Commonsense Reasoning)",
30
+ "dataset_id": "hellaswag",
31
+ "description": "A challenge dataset for evaluating commonsense natural language inference. It consists of multiple-choice questions about grounded situations.",
32
+ "metrics": {"accuracy": 1.0}
33
+ },
34
+ {
35
+ "name": "GSM8K (Grade School Math)",
36
+ "dataset_id": "gsm8k",
37
+ "description": "A dataset of 8.5K high quality grade school math word problems. These problems take between 2 and 8 steps to solve, and solutions primarily involve performing a sequence of elementary calculations using basic arithmetic operations.",
38
+ "metrics": {"accuracy": 1.0, "correct_steps": 1.0}
39
+ },
40
+ {
41
+ "name": "TruthfulQA",
42
+ "dataset_id": "truthful_qa",
43
+ "description": "A benchmark to measure whether a language model is truthful in generating answers to questions. The benchmark comprises 817 questions that span 38 categories, including health, law, finance and politics.",
44
+ "metrics": {"accuracy": 1.0, "truthfulness": 1.0}
45
+ }
46
+ ]
47
+
48
+ # Add each benchmark to the database
49
+ for benchmark in sample_benchmarks:
50
+ benchmark_id = db.add_benchmark(
51
+ name=benchmark["name"],
52
+ dataset_id=benchmark["dataset_id"],
53
+ description=benchmark["description"],
54
+ metrics=benchmark["metrics"]
55
+ )
56
+
57
+ print(f"Added benchmark '{benchmark['name']}' with ID: {benchmark_id}")
58
+
59
+ # Close database connection
60
+ db.close()
61
+
62
+ return len(sample_benchmarks)
63
+
64
+ if __name__ == "__main__":
65
+ num_added = add_sample_benchmarks()
66
+ print(f"Added {num_added} sample benchmarks to the database.")
space.yaml ADDED
@@ -0,0 +1,23 @@
1
+ """
2
+ Deployment configuration for Dynamic Highscores on HuggingFace Spaces.
3
+
4
+ This file configures the application for deployment on HuggingFace Spaces.
5
+ """
6
+
7
+ sdk_version: 4.4.0  # keep in sync with gradio>=4.4.0 in requirements.txt
8
+ app_file: app.py
9
+ models:
10
+ - huggingface-hub
11
+ - transformers
12
+ - datasets
13
+ - torch
14
+ - gradio
15
+ - pandas
16
+ - plotly
17
+ - apscheduler
18
+ - tqdm
19
+ - requests
20
+ - python-dateutil
21
+ - numpy
22
+ python_version: 3.10.12
23
+ hf_oauth: true
test_app.py ADDED
@@ -0,0 +1,237 @@
1
+ """
2
+ Test script for Dynamic Highscores application.
3
+
4
+ This script tests the key functionality of the Dynamic Highscores application
5
+ to ensure everything works as expected before deployment.
6
+ """
7
+
8
+ import os
9
+ import unittest
10
+ import tempfile
11
+ import sqlite3
12
+ from unittest.mock import MagicMock, patch
13
+
14
+ # Import components to test
15
+ from database_schema import DynamicHighscoresDB
16
+ from auth import HuggingFaceAuth
17
+ from benchmark_selection import BenchmarkSelector
18
+ from evaluation_queue import EvaluationQueue
19
+ from leaderboard import Leaderboard
20
+
21
+ class TestDynamicHighscores(unittest.TestCase):
22
+ """Test cases for Dynamic Highscores application."""
23
+
24
+ def setUp(self):
25
+ """Set up test environment."""
26
+ # Create temporary database
27
+ self.db_fd, self.db_path = tempfile.mkstemp()
28
+ self.db = DynamicHighscoresDB(self.db_path)
29
+
30
+ # Mock auth manager
31
+ self.auth_manager = HuggingFaceAuth(self.db)
32
+
33
+ # Mock components
34
+ self.benchmark_selector = BenchmarkSelector(self.db, self.auth_manager)
35
+ self.evaluation_queue = EvaluationQueue(self.db, self.auth_manager)
36
+ self.leaderboard = Leaderboard(self.db)
37
+
38
+ def tearDown(self):
39
+ """Clean up test environment."""
40
+ os.close(self.db_fd)
41
+ os.unlink(self.db_path)
42
+
43
+ def test_database_schema(self):
44
+ """Test database schema creation."""
45
+ # Check if tables were created
46
+ conn = sqlite3.connect(self.db_path)
47
+ cursor = conn.cursor()
48
+
49
+ # Get list of tables
50
+ cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
51
+ tables = cursor.fetchall()
52
+ table_names = [table[0] for table in tables]
53
+
54
+ # Check if all expected tables exist
55
+ expected_tables = ['users', 'benchmarks', 'models', 'evaluations', 'queue']
56
+ for table in expected_tables:
57
+ self.assertIn(table, table_names)
58
+
59
+ conn.close()
60
+
61
+ def test_user_management(self):
62
+ """Test user management functionality."""
63
+ # Add a test user
64
+ user_id = self.db.add_user("test_user", "test_hf_id", False)
65
+ self.assertIsNotNone(user_id)
66
+
67
+ # Add an admin user
68
+ admin_id = self.db.add_user("admin_user", "admin_hf_id", True)
69
+ self.assertIsNotNone(admin_id)
70
+
71
+ # Test submission limits
72
+ self.assertTrue(self.db.can_submit_today(user_id))
73
+ self.db.update_submission_date(user_id)
74
+ self.assertFalse(self.db.can_submit_today(user_id))
75
+
76
+ # Admin should always be able to submit
77
+ self.assertTrue(self.db.can_submit_today(admin_id))
78
+
79
+ def test_benchmark_management(self):
80
+ """Test benchmark management functionality."""
81
+ # Add a test benchmark
82
+ benchmark_id = self.db.add_benchmark(
83
+ name="Test Benchmark",
84
+ dataset_id="test/dataset",
85
+ description="Test description",
86
+ metrics={"accuracy": 1.0}
87
+ )
88
+ self.assertIsNotNone(benchmark_id)
89
+
90
+ # Get benchmarks
91
+ benchmarks = self.db.get_benchmarks()
92
+ self.assertEqual(len(benchmarks), 1)
93
+ self.assertEqual(benchmarks[0]["name"], "Test Benchmark")
94
+
95
+ def test_model_management(self):
96
+ """Test model management functionality."""
97
+ # Add a test user
98
+ user_id = self.db.add_user("test_user", "test_hf_id", False)
99
+
100
+ # Add a test model
101
+ model_id = self.db.add_model(
102
+ name="Test Model",
103
+ hf_model_id="test/model",
104
+ user_id=user_id,
105
+ tag="Reasoning",
106
+ parameters="7B",
107
+ description="Test model description"
108
+ )
109
+ self.assertIsNotNone(model_id)
110
+
111
+ # Get models
112
+ models = self.db.get_models()
113
+ self.assertEqual(len(models), 1)
114
+ self.assertEqual(models[0]["name"], "Test Model")
115
+
116
+ # Get models by tag
117
+ models = self.db.get_models(tag="Reasoning")
118
+ self.assertEqual(len(models), 1)
119
+ self.assertEqual(models[0]["tag"], "Reasoning")
120
+
121
+ def test_evaluation_management(self):
122
+ """Test evaluation management functionality."""
123
+ # Add a test user
124
+ user_id = self.db.add_user("test_user", "test_hf_id", False)
125
+
126
+ # Add a test model
127
+ model_id = self.db.add_model(
128
+ name="Test Model",
129
+ hf_model_id="test/model",
130
+ user_id=user_id,
131
+ tag="Reasoning"
132
+ )
133
+
134
+ # Add a test benchmark
135
+ benchmark_id = self.db.add_benchmark(
136
+ name="Test Benchmark",
137
+ dataset_id="test/dataset"
138
+ )
139
+
140
+ # Add a test evaluation
141
+ evaluation_id = self.db.add_evaluation(
142
+ model_id=model_id,
143
+ benchmark_id=benchmark_id
144
+ )
145
+ self.assertIsNotNone(evaluation_id)
146
+
147
+ # Update evaluation status
148
+ self.db.update_evaluation_status(
149
+ evaluation_id=evaluation_id,
150
+ status="running"
151
+ )
152
+
153
+ # Get next in queue
154
+ next_eval = self.db.get_next_in_queue()
155
+ self.assertIsNotNone(next_eval)
156
+ self.assertEqual(next_eval["evaluation_id"], evaluation_id)
157
+
158
+ # Complete evaluation
159
+ self.db.update_evaluation_status(
160
+ evaluation_id=evaluation_id,
161
+ status="completed",
162
+ results={"accuracy": 0.85},
163
+ score=85.0
164
+ )
165
+
166
+ # Get evaluation results
167
+ results = self.db.get_evaluation_results()
168
+ self.assertEqual(len(results), 1)
169
+ self.assertEqual(results[0]["score"], 85.0)
170
+
171
+ def test_leaderboard(self):
172
+ """Test leaderboard functionality."""
173
+ # Add test data
174
+ user_id = self.db.add_user("test_user", "test_hf_id", False)
175
+
176
+ # Add models with different tags
177
+ model1_id = self.db.add_model(
178
+ name="Model 1",
179
+ hf_model_id="test/model1",
180
+ user_id=user_id,
181
+ tag="Reasoning"
182
+ )
183
+
184
+ model2_id = self.db.add_model(
185
+ name="Model 2",
186
+ hf_model_id="test/model2",
187
+ user_id=user_id,
188
+ tag="Coding"
189
+ )
190
+
191
+ # Add a benchmark
192
+ benchmark_id = self.db.add_benchmark(
193
+ name="Test Benchmark",
194
+ dataset_id="test/dataset"
195
+ )
196
+
197
+ # Add evaluations
198
+ eval1_id = self.db.add_evaluation(
199
+ model_id=model1_id,
200
+ benchmark_id=benchmark_id
201
+ )
202
+
203
+ eval2_id = self.db.add_evaluation(
204
+ model_id=model2_id,
205
+ benchmark_id=benchmark_id
206
+ )
207
+
208
+ # Complete evaluations
209
+ self.db.update_evaluation_status(
210
+ evaluation_id=eval1_id,
211
+ status="completed",
212
+ results={"accuracy": 0.9},
213
+ score=90.0
214
+ )
215
+
216
+ self.db.update_evaluation_status(
217
+ evaluation_id=eval2_id,
218
+ status="completed",
219
+ results={"accuracy": 0.8},
220
+ score=80.0
221
+ )
222
+
223
+ # Get leaderboard data
224
+ df = self.leaderboard.get_leaderboard_data()
225
+ self.assertEqual(len(df), 2)
226
+
227
+ # Test filtering by tag
228
+ df_reasoning = self.leaderboard.get_leaderboard_data(tag="Reasoning")
229
+ self.assertEqual(len(df_reasoning), 1)
230
+ self.assertEqual(df_reasoning.iloc[0]["score"], 90.0)
231
+
232
+ df_coding = self.leaderboard.get_leaderboard_data(tag="Coding")
233
+ self.assertEqual(len(df_coding), 1)
234
+ self.assertEqual(df_coding.iloc[0]["score"], 80.0)
235
+
236
+ if __name__ == "__main__":
237
+ unittest.main()
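
To run the suite locally, a short standard-library runner (equivalent to invoking unittest on `test_app.py` directly):

```python
import unittest

# Discover and run every test case in test_app.py with verbose output.
suite = unittest.defaultTestLoader.discover(".", pattern="test_app.py")
unittest.TextTestRunner(verbosity=2).run(suite)
```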
todo.md ADDED
@@ -0,0 +1,48 @@
1
+ # Dynamic Highscores - Todo List
2
+
3
+ ## Analysis and Planning
4
+ - [x] Extract and analyze uploaded framework files
5
+ - [x] Examine leaderboard component structure and functionality
6
+ - [x] Examine dashboard component structure and functionality
7
+ - [x] Analyze requirements.txt files for dependencies
8
+
9
+ ## Database Schema Design
10
+ - [x] Design schema for user authentication and tracking
11
+ - [x] Design schema for benchmark datasets
12
+ - [x] Design schema for model submissions and evaluations
13
+ - [x] Design schema for tagging system (Merge, Agent, Reasoning, Coding, etc.)
14
+ - [x] Design schema for daily submission limits
15
+
16
+ ## User Authentication System
17
+ - [x] Implement HuggingFace login integration
18
+ - [x] Create user profile management
19
+ - [x] Implement special privileges for admin account
20
+
21
+ ## Benchmark Selection Interface
22
+ - [x] Create interface for browsing HuggingFace datasets
23
+ - [x] Implement dataset loading functionality
24
+ - [x] Create dataset preview and selection UI
25
+
26
+ ## Model Evaluation Queue System
27
+ - [x] Implement CPU-only evaluation system
28
+ - [x] Create queue management for benchmark submissions
29
+ - [x] Implement daily submission limit (1 per day per user)
30
+ - [x] Add admin override for submission limits
31
+
32
+ ## Leaderboard with Filtering
33
+ - [x] Implement unified leaderboard for all models
34
+ - [x] Add tag-based filtering (Merge, Agent, Reasoning, Coding)
35
+ - [x] Implement sorting and searching functionality
36
+ - [x] Create visualization components for benchmark results
37
+
38
+ ## Integration
39
+ - [x] Combine dashboard and leaderboard components
40
+ - [x] Create unified UI with consistent styling
41
+ - [x] Implement navigation between different sections
42
+ - [x] Ensure proper data flow between components
43
+
44
+ ## Testing and Deployment
45
+ - [x] Test user authentication flow
46
+ - [x] Test benchmark selection and submission
47
+ - [x] Test leaderboard filtering and visualization
48
+ - [x] Prepare for deployment on HuggingFace Spaces