Upload 16 files
- app.py +284 -0
- auth.py +326 -0
- benchmark_selection.py +573 -0
- database_schema.py +483 -0
- evaluation_queue.py +1122 -0
- leaderboard.py +396 -0
- model_config.py +874 -0
- model_configs/gemma.json +11 -0
- model_configs/llama.json +10 -0
- model_configs/mistral.json +10 -0
- requirements.txt +12 -0
- sample_benchmarks.py +72 -0
- space.yaml +30 -0
- test_app.py +237 -0
- todo.md +48 -0
app.py
ADDED
@@ -0,0 +1,284 @@
"""
Main application for Dynamic Highscores system.

This file integrates all components into a unified application.
"""

import os
import gradio as gr
import threading
import time
from database_schema import DynamicHighscoresDB
from auth import HuggingFaceAuth
from benchmark_selection import BenchmarkSelector, create_benchmark_selection_ui
from evaluation_queue import EvaluationQueue, create_model_submission_ui
from leaderboard import Leaderboard, create_leaderboard_ui
from sample_benchmarks import add_sample_benchmarks

# Initialize components in main thread
db = DynamicHighscoresDB()
auth_manager = HuggingFaceAuth(db)
benchmark_selector = BenchmarkSelector(db, auth_manager)
evaluation_queue = EvaluationQueue(db, auth_manager)
leaderboard = Leaderboard(db)

# Initialize sample benchmarks if none exist
print("Checking for existing benchmarks...")
benchmarks = db.get_benchmarks()
if not benchmarks or len(benchmarks) == 0:
    print("No benchmarks found. Adding sample benchmarks...")
    try:
        # Make sure the database path is clear
        print(f"Database path: {db.db_path}")

        # Import and call the function directly
        num_added = add_sample_benchmarks()
        print(f"Added {num_added} sample benchmarks.")
    except Exception as e:
        print(f"Error adding sample benchmarks: {str(e)}")
        # Try direct DB insertion as fallback
        try:
            print("Attempting direct benchmark insertion...")
            db.add_benchmark(
                name="MMLU (Massive Multitask Language Understanding)",
                dataset_id="cais/mmlu",
                description="Tests knowledge across 57 subjects"
            )
            print("Added fallback benchmark.")
        except Exception as inner_e:
            print(f"Fallback insertion failed: {str(inner_e)}")
else:
    print(f"Found {len(benchmarks)} existing benchmarks.")

# Custom CSS with theme awareness
css = """
/* Theme-adaptive colored info box */
.info-text {
    background-color: rgba(53, 130, 220, 0.1);
    padding: 12px;
    border-radius: 8px;
    border-left: 4px solid #3498db;
    margin: 12px 0;
}

/* High-contrast text for elements - works in light and dark themes */
.info-text, .header, .footer, .tab-content,
button, input, textarea, select, option,
.gradio-container *, .markdown-text {
    color: var(--text-color, inherit) !important;
}

/* Container styling */
.container {
    max-width: 1200px;
    margin: 0 auto;
}

/* Header styling */
.header {
    text-align: center;
    margin-bottom: 20px;
    font-weight: bold;
    font-size: 24px;
}

/* Footer styling */
.footer {
    text-align: center;
    margin-top: 40px;
    padding: 20px;
    border-top: 1px solid var(--border-color-primary, #eee);
}

/* Login section styling */
.login-section {
    padding: 10px;
    margin-bottom: 15px;
    border-radius: 8px;
    background-color: rgba(250, 250, 250, 0.1);
    text-align: center;
}

/* Login button styling */
.login-button {
    background-color: #4CAF50 !important;
    color: white !important;
    font-weight: bold;
}

/* Force high contrast on specific input areas */
input[type="text"], input[type="password"], textarea {
    background-color: var(--background-fill-primary) !important;
    color: var(--body-text-color) !important;
}

/* Force text visibility in multiple contexts */
.gradio-markdown p, .gradio-markdown h1, .gradio-markdown h2,
.gradio-markdown h3, .gradio-markdown h4, .gradio-markdown li {
    color: var(--body-text-color) !important;
}

/* Fix dark mode text visibility */
@media (prefers-color-scheme: dark) {
    input, textarea, select {
        color: #ffffff !important;
    }

    ::placeholder {
        color: rgba(255, 255, 255, 0.5) !important;
    }
}
"""

# JavaScript login implementation
def js_login_script():
    space_host = os.environ.get("SPACE_HOST", "localhost:7860")
    redirect_uri = f"https://{space_host}"

    return f"""
    <script src="https://unpkg.com/@huggingface/[email protected]/dist/index.umd.min.js"></script>
    <script>
    (async function() {{
        const HfHub = window.HfHub;
        try {{
            // Check if we're returning from OAuth redirect
            const oauthResult = await HfHub.oauthHandleRedirectIfPresent();

            if (oauthResult) {{
                console.log("User logged in:", oauthResult);

                // Store the user info in localStorage
                localStorage.setItem("hf_user", JSON.stringify(oauthResult.userInfo));
                localStorage.setItem("hf_token", oauthResult.accessToken);

                // Update the UI to show logged in state
                document.getElementById("login-status").textContent = "Logged in as: " + oauthResult.userInfo.name;
                document.getElementById("login-button").style.display = "none";

                // Refresh the page to update server-side state
                setTimeout(() => window.location.reload(), 1000);
            }}
        }} catch (error) {{
            console.error("OAuth error:", error);
        }}

        // Check if user is already logged in from localStorage
        const storedUser = localStorage.getItem("hf_user");
        if (storedUser) {{
            try {{
                const userInfo = JSON.parse(storedUser);
                document.getElementById("login-status").textContent = "Logged in as: " + userInfo.name;
                document.getElementById("login-button").style.display = "none";
            }} catch (e) {{
                console.error("Error parsing stored user:", e);
            }}
        }}

        // Setup login button
        document.getElementById("login-button").addEventListener("click", async function() {{
            try {{
                const loginUrl = await HfHub.oauthLoginUrl({{
                    redirectUrl: "{redirect_uri}",
                    scopes: ["openid", "profile"]
                }});
                window.location.href = loginUrl;
            }} catch (error) {{
                console.error("Error generating login URL:", error);
                alert("Error starting login process. Please try again.");
            }}
        }});
    }})();
    </script>
    """

# Simple manual authentication check
def check_user(request: gr.Request):
    if request:
        username = request.headers.get("HF-User")
        if username:
            # User is logged in via HF Spaces
            print(f"User logged in: {username}")
            user = db.get_user_by_username(username)
            if not user:
                # Create user if they don't exist
                print(f"Creating new user: {username}")
                is_admin = (username == "Quazim0t0")
                db.add_user(username, username, is_admin)
                user = db.get_user_by_username(username)
            return username
    return None

# Start evaluation queue worker
def start_queue_worker():
    # Wait a moment to ensure app is initialized
    time.sleep(2)
    try:
        print("Starting evaluation queue worker...")
        evaluation_queue.start_worker()
    except Exception as e:
        print(f"Error starting queue worker: {e}")

# Create Gradio app
with gr.Blocks(css=css, title="Dynamic Highscores") as app:
    # State to track user
    user_state = gr.State(None)

    # Login section
    with gr.Row(elem_classes=["login-section"]):
        with gr.Column():
            gr.HTML("""
            <div style="display: flex; justify-content: space-between; align-items: center;">
                <div id="login-status">Not logged in</div>
                <button id="login-button" style="padding: 8px 16px; background-color: #4CAF50; color: white; border: none; border-radius: 4px; cursor: pointer;">Login with HuggingFace</button>
            </div>
            """)

            # Add the JS login script
            gr.HTML(js_login_script())

    gr.Markdown("# 🏆 Dynamic Highscores", elem_classes=["header"])
    gr.Markdown("""
    *Not Active yet, Check back soon!* Welcome to Dynamic Highscores - a community benchmark platform for evaluating and comparing language models.

    - **Add your own benchmarks** from HuggingFace datasets
    - **Submit your models** for CPU-only evaluation
    - **Compare performance** across different models and benchmarks
    - **Filter results** by model type (Merge, Agent, Reasoning, Coding, etc.)
    """, elem_classes=["info-text"])

    # Main tabs
    with gr.Tabs() as tabs:
        with gr.TabItem("📊 Leaderboard", id=0):
            leaderboard_ui = create_leaderboard_ui(leaderboard, db)

        with gr.TabItem("🚀 Submit Model", id=1):
            submission_ui = create_model_submission_ui(evaluation_queue, auth_manager, db)

        with gr.TabItem("🔍 Benchmarks", id=2):
            benchmark_ui = create_benchmark_selection_ui(benchmark_selector, auth_manager)

    gr.Markdown("""
    ### About Dynamic Highscores

    This platform allows users to select benchmarks from HuggingFace datasets and evaluate models against them.
    Each user can submit one benchmark per day (admin users are exempt from this limit).
    All evaluations run on CPU only to ensure fair comparisons.

    Created by Quazim0t0
    """, elem_classes=["footer"])

    # Check login on page load
    app.load(
        fn=check_user,
        inputs=[],
        outputs=[user_state]
    )

# Launch the app
if __name__ == "__main__":
    # Start queue worker in a separate thread
    queue_thread = threading.Thread(target=start_queue_worker)
    queue_thread.daemon = True
    queue_thread.start()

    app.launch()
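A minimal sketch of how check_user resolves a login, assuming it runs in the app.py module context where db already exists; the SimpleNamespace stand-in and the username "alice" are illustrative only. On Spaces, Gradio passes a real gr.Request whose headers carry HF-User for logged-in visitors.

from types import SimpleNamespace

# Stand-in for a gr.Request carrying the header that HF Spaces injects (hypothetical).
fake_request = SimpleNamespace(headers={"HF-User": "alice"})
print(check_user(fake_request))  # -> "alice"; also creates the user row in SQLite if missing
print(check_user(None))          # -> None (no request means no logged-in user)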
auth.py
ADDED
@@ -0,0 +1,326 @@
"""
Authentication module for Dynamic Highscores system.

This module handles user authentication with HuggingFace,
user session management, and access control.
"""

import os
import json
import time
import requests
import gradio as gr
from huggingface_hub import HfApi, login
from functools import wraps

class HuggingFaceAuth:
    """Authentication manager for HuggingFace integration."""

    def __init__(self, db_manager):
        """Initialize the authentication manager.

        Args:
            db_manager: Database manager instance for user storage
        """
        self.db_manager = db_manager
        self.hf_api = HfApi()
        self.admin_username = os.environ.get("ADMIN_USERNAME", "Quazim0t0")
        self.running_in_space = 'SPACE_ID' in os.environ

    def login_user(self, token):
        """Log in a user with their HuggingFace token.

        Args:
            token: HuggingFace API token

        Returns:
            dict: User information if login successful, None otherwise
        """
        try:
            # Validate token with HuggingFace
            login(token=token, add_to_git_credential=False)

            # Get user info from HuggingFace
            user_info = self.hf_api.whoami(token=token)

            if not user_info:
                return None

            # Check if user exists in our database, create if not
            username = user_info.get("name", user_info.get("fullname", ""))
            hf_user_id = user_info.get("id", "")

            if not hf_user_id:
                return None

            # Check if this is the admin account
            is_admin = (username == self.admin_username)

            # Add or get user from database
            user_id = self.db_manager.add_user(username, hf_user_id, is_admin)

            # Get complete user info from database
            user = self.db_manager.get_user(hf_user_id)

            if user:
                # Add token to user info for session only (not stored in database)
                user['token'] = token
                return user

            return None
        except Exception as e:
            print(f"Login error: {e}")
            return None

    def check_login(self, request: gr.Request):
        """Check if a user is logged in from a Gradio request.

        Args:
            request: Gradio request object

        Returns:
            dict: User information if logged in, None otherwise
        """
        if not request:
            return None

        # First, check if we're in a HuggingFace Space with OAuth
        if self.running_in_space:
            # Check for HF-User header from Space OAuth
            username = request.headers.get("HF-User")
            if username:
                # Check if user exists in our database, create if not
                user = self.db_manager.get_user_by_username(username)
                if not user:
                    # Create a new user
                    is_admin = (username == self.admin_username)
                    user_id = self.db_manager.add_user(username, username, is_admin)
                    user = self.db_manager.get_user_by_username(username)
                return user

        # Fallback to token-based auth for local development
        token = request.cookies.get("hf_token")

        if not token:
            return None

        try:
            # Validate token with HuggingFace
            user_info = self.hf_api.whoami(token=token)

            if not user_info:
                return None

            # Get user from database
            hf_user_id = user_info.get("id", "")
            user = self.db_manager.get_user(hf_user_id)

            if user:
                # Add token to user info for session only (not stored in database)
                user['token'] = token
                return user

            return None
        except Exception as e:
            print(f"Check login error: {e}")
            return None

    def require_login(self, func):
        """Decorator to require login for a function.

        Args:
            func: Function to decorate

        Returns:
            Function: Decorated function that requires login
        """
        @wraps(func)
        def wrapper(*args, **kwargs):
            # Find the request argument
            request = None
            for arg in args:
                if isinstance(arg, gr.Request):
                    request = arg
                    break

            if not request and 'request' in kwargs:
                request = kwargs['request']

            if not request:
                return "Please log in to access this feature."

            # Check if user is logged in
            user = self.check_login(request)

            if not user:
                return "Please log in to access this feature."

            # Add user to kwargs
            kwargs['user'] = user

            # Call the original function
            return func(*args, **kwargs)

        return wrapper

    def require_admin(self, func):
        """Decorator to require admin privileges for a function.

        Args:
            func: Function to decorate

        Returns:
            Function: Decorated function that requires admin privileges
        """
        @wraps(func)
        def wrapper(*args, **kwargs):
            # Find the request argument
            request = None
            for arg in args:
                if isinstance(arg, gr.Request):
                    request = arg
                    break

            if not request and 'request' in kwargs:
                request = kwargs['request']

            if not request:
                return "Admin access required."

            # Check if user is logged in
            user = self.check_login(request)

            if not user:
                return "Admin access required."

            # Check if user is admin
            if not user.get('is_admin', False):
                return "Admin access required."

            # Add user to kwargs
            kwargs['user'] = user

            # Call the original function
            return func(*args, **kwargs)

        return wrapper

    def can_submit_benchmark(self, user_id):
        """Check if a user can submit a benchmark today.

        Args:
            user_id: User ID to check

        Returns:
            bool: True if user can submit, False otherwise
        """
        return self.db_manager.can_submit_today(user_id)

    def update_submission_date(self, user_id):
        """Update the last submission date for a user.

        Args:
            user_id: User ID to update
        """
        self.db_manager.update_submission_date(user_id)

# Authentication UI components
def create_login_ui():
    """Create the login UI components.

    Returns:
        tuple: (login_button, logout_button, user_info)
    """
    with gr.Row():
        with gr.Column(scale=3):
            # If running in a HuggingFace Space, use their OAuth
            if 'SPACE_ID' in os.environ:
                login_button = gr.Button("Login with HuggingFace", visible=False)
                logout_button = gr.Button("Logout", visible=False)
            else:
                # For local development, use token-based login
                login_button = gr.Button("Login with HuggingFace Token")
                logout_button = gr.Button("Logout", visible=False)

        with gr.Column(scale=2):
            user_info = gr.Markdown("Checking login status...")

    return login_button, logout_button, user_info

def login_handler(auth_manager):
    """Handle login button click.

    Args:
        auth_manager: Authentication manager instance

    Returns:
        tuple: JS to redirect to login and updated UI visibility
    """
    # This is only used for local development
    # For HuggingFace Spaces, the built-in OAuth is used
    return (
        gr.update(visible=False),  # Hide login button
        gr.update(visible=True),   # Show logout button
        "Redirecting to login...",
        """
        <script>
        // Open a popup window for token entry
        function promptForToken() {
            const token = prompt("Enter your HuggingFace token:");
            if (token) {
                // Set the token as a cookie
                document.cookie = "hf_token=" + token + "; path=/; SameSite=Strict";
                // Reload the page to apply the token
                window.location.reload();
            }
        }

        // Call the function
        promptForToken();
        </script>
        """
    )

def logout_handler():
    """Handle logout button click.

    Returns:
        tuple: Updated UI components visibility and user info
    """
    # Clear token cookie in JavaScript
    return (
        gr.update(visible=True),   # Show login button
        gr.update(visible=False),  # Hide logout button
        "Logged out",
        """
        <script>
        // Clear the token cookie
        document.cookie = "hf_token=; path=/; max-age=0; SameSite=Strict";
        // Reload the page
        window.location.reload();
        </script>
        """
    )

def setup_auth_handlers(login_button, logout_button, user_info, auth_manager):
    """Set up event handlers for authentication UI components.

    Args:
        login_button: Login button component
        logout_button: Logout button component
        user_info: User info component
        auth_manager: Authentication manager instance
    """
    # Only add event handlers if not running in a HuggingFace Space
    if 'SPACE_ID' not in os.environ:
        login_button.click(
            fn=lambda: login_handler(auth_manager),
            inputs=[],
            outputs=[login_button, logout_button, user_info, gr.HTML()]
        )

        logout_button.click(
            fn=logout_handler,
            inputs=[],
            outputs=[login_button, logout_button, user_info, gr.HTML()]
        )
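A minimal usage sketch of the require_login decorator, assuming the DynamicHighscoresDB class from database_schema.py below; the handler name submit_something and the demo.db path are hypothetical. It shows how the decorator injects the resolved user dict into the wrapped handler once check_login succeeds.

import gradio as gr
from database_schema import DynamicHighscoresDB
from auth import HuggingFaceAuth

auth = HuggingFaceAuth(DynamicHighscoresDB("demo.db"))  # hypothetical local DB path

@auth.require_login
def submit_something(text, request: gr.Request = None, user=None):
    # 'user' is filled in by the decorator after check_login succeeds;
    # without a valid login the decorator returns a plain message instead.
    return f"{user['username']} submitted: {text}"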
benchmark_selection.py
ADDED
@@ -0,0 +1,573 @@
"""
Benchmark selection module for Dynamic Highscores system.

This module handles browsing, selection, and loading of HuggingFace datasets
to be used as benchmarks for model evaluation.
"""

import os
import json
import gradio as gr
from huggingface_hub import HfApi, list_datasets
from datasets import load_dataset, get_dataset_config_names
from functools import partial

class BenchmarkSelector:
    """Benchmark selection manager for HuggingFace datasets."""

    def __init__(self, db_manager, auth_manager):
        """Initialize the benchmark selector.

        Args:
            db_manager: Database manager instance for benchmark storage
            auth_manager: Authentication manager instance for access control
        """
        self.db_manager = db_manager
        self.auth_manager = auth_manager
        self.hf_api = HfApi()

        # Common benchmark categories for filtering
        self.categories = [
            "All",
            "Text Generation",
            "Question Answering",
            "Summarization",
            "Translation",
            "Classification",
            "Code Generation",
            "Reasoning",
            "Math"
        ]

        # Common metrics for different benchmark types
        self.metric_templates = {
            "Text Generation": ["bleu", "rouge", "meteor"],
            "Question Answering": ["exact_match", "f1"],
            "Summarization": ["rouge1", "rouge2", "rougeL"],
            "Translation": ["bleu", "ter"],
            "Classification": ["accuracy", "f1", "precision", "recall"],
            "Code Generation": ["exact_match", "pass@k", "functional_correctness"],
            "Reasoning": ["accuracy", "consistency"],
            "Math": ["accuracy", "correct_steps"]
        }

    def search_datasets(self, query, category="All", limit=50):
        """Search for datasets on HuggingFace.

        Args:
            query: Search query string
            category: Dataset category to filter by
            limit: Maximum number of results to return

        Returns:
            list: List of dataset information dictionaries
        """
        try:
            # Apply category filter if not "All"
            filter_str = None
            if category != "All":
                filter_str = f"task_categories:{category}"

            # Search for datasets
            datasets = list_datasets(
                search=query,
                filter=filter_str,
                limit=limit
            )

            # Format results
            results = []
            for dataset in datasets:
                # Handle cases where description might be missing
                dataset_description = ""
                if hasattr(dataset, 'description') and dataset.description:
                    dataset_description = dataset.description[:200] + "..." if len(dataset.description) > 200 else dataset.description

                # Handle cases where tags might be missing
                dataset_tags = []
                if hasattr(dataset, 'tags'):
                    dataset_tags = dataset.tags

                # Handle cases where downloads might be missing
                dataset_downloads = 0
                if hasattr(dataset, 'downloads'):
                    dataset_downloads = dataset.downloads

                # Handle cases where author might be missing
                dataset_author = ""
                if hasattr(dataset, 'author'):
                    dataset_author = dataset.author

                results.append({
                    "id": dataset.id,
                    "name": dataset.id.split("/")[-1],
                    "author": dataset_author,
                    "description": dataset_description,
                    "tags": dataset_tags,
                    "downloads": dataset_downloads
                })

            return results
        except Exception as e:
            print(f"Dataset search error: {e}")
            return []

    def get_dataset_info(self, dataset_id):
        """Get detailed information about a dataset.

        Args:
            dataset_id: HuggingFace dataset ID

        Returns:
            dict: Dataset information
        """
        try:
            # Get dataset info from HuggingFace
            dataset_info = self.hf_api.dataset_info(dataset_id)

            # Get available configurations
            configs = []
            try:
                configs = get_dataset_config_names(dataset_id)
            except Exception as e:
                print(f"Error getting dataset configs: {e}")

            # Handle missing attributes safely
            dataset_description = ""
            if hasattr(dataset_info, 'description'):
                dataset_description = dataset_info.description

            dataset_citation = ""
            if hasattr(dataset_info, 'citation'):
                dataset_citation = dataset_info.citation

            dataset_tags = []
            if hasattr(dataset_info, 'tags'):
                dataset_tags = dataset_info.tags

            dataset_downloads = 0
            if hasattr(dataset_info, 'downloads'):
                dataset_downloads = dataset_info.downloads

            dataset_author = ""
            if hasattr(dataset_info, 'author'):
                dataset_author = dataset_info.author

            # Format result
            result = {
                "id": dataset_info.id,
                "name": dataset_info.id.split("/")[-1],
                "author": dataset_author,
                "description": dataset_description,
                "citation": dataset_citation,
                "configs": configs,
                "tags": dataset_tags,
                "downloads": dataset_downloads
            }

            return result
        except Exception as e:
            print(f"Dataset info error: {e}")
            return None

    def load_dataset_sample(self, dataset_id, config=None, split="train", sample_size=5):
        """Load a sample from a dataset.

        Args:
            dataset_id: HuggingFace dataset ID
            config: Dataset configuration name
            split: Dataset split to sample from
            sample_size: Number of samples to load

        Returns:
            dict: Dataset sample information
        """
        try:
            # Load dataset
            if config:
                dataset = load_dataset(dataset_id, config, split=split)
            else:
                dataset = load_dataset(dataset_id, split=split)

            # Get sample
            if len(dataset) > sample_size:
                sample = dataset.select(range(sample_size))
            else:
                sample = dataset

            # Get features
            features = list(sample.features.keys())

            # Convert sample to list of dictionaries
            sample_data = []
            for item in sample:
                sample_item = {}
                for key in features:
                    # Convert non-serializable values to strings
                    if isinstance(item[key], (list, dict)):
                        sample_item[key] = str(item[key])
                    else:
                        sample_item[key] = item[key]
                sample_data.append(sample_item)

            # Format result
            result = {
                "id": dataset_id,
                "config": config,
                "split": split,
                "features": features,
                "sample": sample_data,
                "total_size": len(dataset)
            }

            return result
        except Exception as e:
            print(f"Dataset sample error: {e}")
            return None

    def add_benchmark(self, dataset_id, name=None, description=None, metrics=None, config=None):
        """Add a dataset as a benchmark.

        Args:
            dataset_id: HuggingFace dataset ID
            name: Benchmark name (defaults to dataset name)
            description: Benchmark description (defaults to dataset description)
            metrics: Metrics to use for evaluation
            config: Dataset configuration to use

        Returns:
            int: Benchmark ID if successful, None otherwise
        """
        try:
            # Get dataset info if name or description not provided
            if not name or not description:
                dataset_info = self.get_dataset_info(dataset_id)
                if not dataset_info:
                    return None

                if not name:
                    name = dataset_info["name"]

                if not description:
                    description = dataset_info["description"]

            # Format dataset ID with config if provided
            full_dataset_id = dataset_id
            if config:
                full_dataset_id = f"{dataset_id}:{config}"

            # Add benchmark to database
            benchmark_id = self.db_manager.add_benchmark(
                name=name,
                dataset_id=full_dataset_id,
                description=description,
                metrics=metrics
            )

            return benchmark_id
        except Exception as e:
            print(f"Add benchmark error: {e}")
            return None

    def get_benchmarks(self):
        """Get all available benchmarks.

        Returns:
            list: List of benchmark information dictionaries
        """
        return self.db_manager.get_benchmarks()

# Benchmark selection UI components
def create_benchmark_selection_ui(benchmark_selector, auth_manager):
    """Create the benchmark selection UI components.

    Args:
        benchmark_selector: Benchmark selector instance
        auth_manager: Authentication manager instance

    Returns:
        gr.Blocks: Gradio Blocks component with benchmark selection UI
    """
    with gr.Blocks() as benchmark_ui:
        gr.Markdown("## 📊 Dynamic Highscores Benchmark Selection")
        gr.Markdown("""
        ### Add your own datasets from HuggingFace as benchmarks!

        You can add any dataset from HuggingFace to use as a benchmark for evaluating models.
        Simply enter the dataset ID (e.g., 'squad', 'glue', 'hellaswag') and add it as a benchmark.

        Other users will be able to select your added benchmarks for their model evaluations.
        """, elem_classes=["info-text"])

        with gr.Tabs() as tabs:
            with gr.TabItem("➕ Add New Benchmark", id=0):
                with gr.Row():
                    with gr.Column(scale=3):
                        search_input = gr.Textbox(
                            placeholder="Search for datasets on HuggingFace...",
                            label="Search",
                            show_label=False
                        )

                    with gr.Column(scale=1):
                        category_dropdown = gr.Dropdown(
                            choices=benchmark_selector.categories,
                            value="All",
                            label="Category"
                        )

                    with gr.Column(scale=1):
                        search_button = gr.Button("Search")

                dataset_results = gr.Dataframe(
                    headers=["Name", "Author", "Description", "Downloads"],
                    datatype=["str", "str", "str", "number"],
                    label="Search Results",
                    interactive=True
                )

                with gr.Row():
                    with gr.Column(scale=2):
                        dataset_id_input = gr.Textbox(
                            placeholder="Enter HuggingFace dataset ID (e.g., 'squad', 'glue', 'hellaswag')",
                            label="Dataset ID",
                            info="You can enter any dataset ID from HuggingFace"
                        )

                    with gr.Column(scale=1):
                        view_button = gr.Button("View Dataset Details")

                with gr.Accordion("Dataset Details", open=False):
                    dataset_info = gr.JSON(label="Dataset Information")

                with gr.Row():
                    config_dropdown = gr.Dropdown(
                        label="Configuration",
                        choices=[],
                        interactive=True
                    )

                    split_dropdown = gr.Dropdown(
                        label="Split",
                        choices=["train", "validation", "test"],
                        value="train",
                        interactive=True
                    )

                sample_button = gr.Button("Load Sample")

                sample_data = gr.Dataframe(
                    label="Sample Data",
                    interactive=False
                )

                gr.Markdown("### Add this dataset as a benchmark")
                with gr.Row():
                    with gr.Column(scale=2):
                        benchmark_name = gr.Textbox(
                            placeholder="Enter a name for this benchmark",
                            label="Benchmark Name",
                            info="A descriptive name for this benchmark"
                        )

                        benchmark_description = gr.Textbox(
                            placeholder="Enter a description for this benchmark",
                            label="Description",
                            info="Explain what this benchmark evaluates",
                            lines=3
                        )

                    with gr.Column(scale=1):
                        metrics_input = gr.CheckboxGroup(
                            label="Evaluation Metrics",
                            choices=[],
                            interactive=True,
                            info="Select metrics to use for evaluation"
                        )

                with gr.Row():
                    add_benchmark_button = gr.Button("Add as Benchmark", size="lg", variant="primary")

                benchmark_status = gr.Markdown("")

            with gr.TabItem("📋 Available Benchmarks", id=1):
                gr.Markdown("### Benchmarks available for model evaluation")
                gr.Markdown("These benchmarks can be selected when submitting models for evaluation.")

                with gr.Row():
                    refresh_benchmarks_button = gr.Button("Refresh Benchmarks")
                    reload_sample_benchmarks_button = gr.Button("Reload Sample Benchmarks", variant="secondary")

                reload_status = gr.Markdown("")

                benchmarks_container = gr.Column()
                with benchmarks_container:
                    no_benchmarks_message = gr.Markdown(
                        "### No Datasets Added Yet\n\nBe the first to add a benchmark dataset! Go to the 'Add New Benchmark' tab to add a dataset from HuggingFace.",
                        visible=True
                    )

                    my_benchmarks = gr.Dataframe(
                        headers=["ID", "Name", "Dataset", "Description"],
                        label="Available Benchmarks",
                        interactive=True,
                        visible=False
                    )

        # Event handlers
        def search_datasets_handler(query, category):
            if not query:
                return None

            results = benchmark_selector.search_datasets(query, category)

            # Format for dataframe
            formatted_results = []
            for result in results:
                formatted_results.append([
                    result["name"],
                    result["author"],
                    result["description"],
                    result["downloads"]
                ])

            return formatted_results

        def view_dataset_handler(dataset_id):
            if not dataset_id:
                return None, [], None

            dataset_info = benchmark_selector.get_dataset_info(dataset_id)

            if not dataset_info:
                return None, [], None

            # Update metrics based on dataset tags
            metrics = []
            for category, category_metrics in benchmark_selector.metric_templates.items():
                if any(tag.lower() in [t.lower() for t in dataset_info["tags"]] for tag in category.lower().split()):
                    metrics.extend(category_metrics)

            # Remove duplicates
            metrics = list(set(metrics))

            return dataset_info, dataset_info["configs"], gr.update(choices=metrics)

        def load_sample_handler(dataset_id, config, split):
            if not dataset_id:
                return None

            sample_info = benchmark_selector.load_dataset_sample(
                dataset_id,
                config=config if config else None,
                split=split
            )

            if not sample_info:
                return None

            return sample_info["sample"]

        def add_benchmark_handler(dataset_id, config, name, description, metrics, request: gr.Request):
            if not dataset_id:
                return "Please enter a dataset ID from HuggingFace."

            # Check if user is logged in
            user = auth_manager.check_login(request)

            if not user:
                return "Please log in to add benchmarks."

            # Add benchmark
            benchmark_id = benchmark_selector.add_benchmark(
                dataset_id=dataset_id,
                name=name if name else None,
                description=description if description else None,
                metrics=metrics if metrics else None,
                config=config if config else None
            )

            if benchmark_id:
                return f"✅ Benchmark added successfully with ID: {benchmark_id}\n\nThis dataset is now available for model evaluation. You can view it in the 'Available Benchmarks' tab."
            else:
                return "❌ Failed to add benchmark. Please check the dataset ID and try again."

        def get_benchmarks_handler(request: gr.Request):
            # Check if user is logged in
            user = auth_manager.check_login(request)

            if not user:
                return gr.update(visible=True), gr.update(visible=False), None

            # Get benchmarks
            benchmarks = benchmark_selector.get_benchmarks()

            # If no benchmarks, show message
            if not benchmarks or len(benchmarks) == 0:
                return gr.update(visible=True), gr.update(visible=False), None

            # Format for dataframe
            formatted_benchmarks = []
            for benchmark in benchmarks:
                formatted_benchmarks.append([
                    benchmark["id"],
                    benchmark["name"],
                    benchmark["dataset_id"],
                    benchmark["description"]
                ])

            return gr.update(visible=False), gr.update(visible=True), formatted_benchmarks

        def reload_sample_benchmarks_handler():
            try:
                from sample_benchmarks import add_sample_benchmarks
                num_added = add_sample_benchmarks()
                return f"✅ Successfully reloaded {num_added} sample benchmarks."
            except Exception as e:
                return f"❌ Error reloading benchmarks: {str(e)}"

        # Connect event handlers
        search_button.click(
            fn=search_datasets_handler,
            inputs=[search_input, category_dropdown],
            outputs=[dataset_results]
        )

        view_button.click(
            fn=view_dataset_handler,
            inputs=[dataset_id_input],
            outputs=[dataset_info, config_dropdown, metrics_input]
        )

        sample_button.click(
            fn=load_sample_handler,
            inputs=[dataset_id_input, config_dropdown, split_dropdown],
            outputs=[sample_data]
        )

        add_benchmark_button.click(
            fn=add_benchmark_handler,
            inputs=[dataset_id_input, config_dropdown, benchmark_name, benchmark_description, metrics_input],
            outputs=[benchmark_status]
        )

        refresh_benchmarks_button.click(
            fn=get_benchmarks_handler,
            inputs=[],
            outputs=[no_benchmarks_message, my_benchmarks, my_benchmarks]
        )

        reload_sample_benchmarks_button.click(
            fn=reload_sample_benchmarks_handler,
            inputs=[],
            outputs=[reload_status]
        )

        # Initialize benchmarks on load
        benchmark_ui.load(
            fn=get_benchmarks_handler,
            inputs=[],
            outputs=[no_benchmarks_message, my_benchmarks, my_benchmarks]
        )

    return benchmark_ui
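A minimal programmatic sketch of the same flow without the Gradio UI, assuming the database and auth classes from the other files in this upload; the demo.db path is hypothetical and the cais/mmlu dataset with its "all" config is only an example (network access to the HuggingFace Hub is required).

from database_schema import DynamicHighscoresDB
from auth import HuggingFaceAuth
from benchmark_selection import BenchmarkSelector

db = DynamicHighscoresDB("demo.db")                    # hypothetical local DB path
selector = BenchmarkSelector(db, HuggingFaceAuth(db))

info = selector.get_dataset_info("cais/mmlu")          # metadata plus available configs
benchmark_id = selector.add_benchmark("cais/mmlu", config="all")  # stored as "cais/mmlu:all"
print(benchmark_id, [b["name"] for b in selector.get_benchmarks()])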
database_schema.py
ADDED
@@ -0,0 +1,483 @@
1 |
+
"""
|
2 |
+
Database schema for Dynamic Highscores system.
|
3 |
+
|
4 |
+
This module defines the SQLite database schema for the Dynamic Highscores system,
|
5 |
+
which integrates benchmark selection, model evaluation, and leaderboard functionality.
|
6 |
+
"""
|
7 |
+
|
8 |
+
import sqlite3
|
9 |
+
import os
|
10 |
+
import json
|
11 |
+
import threading
|
12 |
+
from datetime import datetime, timedelta
|
13 |
+
import pandas as pd
|
14 |
+
|
15 |
+
class ThreadLocalDB:
|
16 |
+
"""Thread-local database connection manager."""
|
17 |
+
|
18 |
+
_thread_local = threading.local()
|
19 |
+
|
20 |
+
def __init__(self, db_path):
|
21 |
+
"""Initialize with database path."""
|
22 |
+
self.db_path = db_path
|
23 |
+
|
24 |
+
def get_connection(self):
|
25 |
+
"""Get a thread-local database connection."""
|
26 |
+
if not hasattr(self._thread_local, 'conn') or self._thread_local.conn is None:
|
27 |
+
self._thread_local.conn = sqlite3.connect(self.db_path)
|
28 |
+
self._thread_local.conn.row_factory = sqlite3.Row
|
29 |
+
return self._thread_local.conn
|
30 |
+
|
31 |
+
def get_cursor(self):
|
32 |
+
"""Get a cursor from the thread-local connection."""
|
33 |
+
conn = self.get_connection()
|
34 |
+
if not hasattr(self._thread_local, 'cursor') or self._thread_local.cursor is None:
|
35 |
+
self._thread_local.cursor = conn.cursor()
|
36 |
+
return self._thread_local.cursor
|
37 |
+
|
38 |
+
def close(self):
|
39 |
+
"""Close the thread-local connection if it exists."""
|
40 |
+
if hasattr(self._thread_local, 'conn') and self._thread_local.conn is not None:
|
41 |
+
if hasattr(self._thread_local, 'cursor') and self._thread_local.cursor is not None:
|
42 |
+
self._thread_local.cursor.close()
|
43 |
+
self._thread_local.cursor = None
|
44 |
+
self._thread_local.conn.close()
|
45 |
+
self._thread_local.conn = None
|
46 |
+
|
47 |
+
class DynamicHighscoresDB:
|
48 |
+
"""Database manager for the Dynamic Highscores system."""
|
49 |
+
|
50 |
+
def __init__(self, db_path="dynamic_highscores.db"):
|
51 |
+
"""Initialize the database connection and create tables if they don't exist."""
|
52 |
+
self.db_path = db_path
|
53 |
+
self.thread_local_db = ThreadLocalDB(db_path)
|
54 |
+
self.create_tables()
|
55 |
+
|
56 |
+
def get_conn(self):
|
57 |
+
"""Get the thread-local database connection."""
|
58 |
+
return self.thread_local_db.get_connection()
|
59 |
+
|
60 |
+
def get_cursor(self):
|
61 |
+
"""Get the thread-local database cursor."""
|
62 |
+
return self.thread_local_db.get_cursor()
|
63 |
+
|
64 |
+
def close(self):
|
65 |
+
"""Close the thread-local database connection."""
|
66 |
+
self.thread_local_db.close()
|
67 |
+
|
68 |
+
def create_tables(self):
|
69 |
+
"""Create all necessary tables if they don't exist."""
|
70 |
+
cursor = self.get_cursor()
|
71 |
+
conn = self.get_conn()
|
72 |
+
|
73 |
+
# Users table - stores user information
|
74 |
+
cursor.execute('''
|
75 |
+
CREATE TABLE IF NOT EXISTS users (
|
76 |
+
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
77 |
+
username TEXT UNIQUE NOT NULL,
|
78 |
+
hf_user_id TEXT UNIQUE NOT NULL,
|
79 |
+
is_admin BOOLEAN DEFAULT 0,
|
80 |
+
last_submission_date TEXT,
|
81 |
+
created_at TEXT DEFAULT CURRENT_TIMESTAMP
|
82 |
+
)
|
83 |
+
''')
|
84 |
+
|
85 |
+
# Benchmarks table - stores information about available benchmarks
|
86 |
+
cursor.execute('''
|
87 |
+
CREATE TABLE IF NOT EXISTS benchmarks (
|
88 |
+
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
89 |
+
name TEXT NOT NULL,
|
90 |
+
dataset_id TEXT NOT NULL,
|
91 |
+
description TEXT,
|
92 |
+
metrics TEXT, -- JSON string of metrics
|
93 |
+
created_at TEXT DEFAULT CURRENT_TIMESTAMP
|
94 |
+
)
|
95 |
+
''')
|
96 |
+
|
97 |
+
# Models table - stores information about submitted models
|
98 |
+
cursor.execute('''
|
99 |
+
CREATE TABLE IF NOT EXISTS models (
|
100 |
+
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
101 |
+
name TEXT NOT NULL,
|
102 |
+
hf_model_id TEXT NOT NULL,
|
103 |
+
user_id INTEGER NOT NULL,
|
104 |
+
tag TEXT NOT NULL, -- One of: Merge, Agent, Reasoning, Coding, etc.
|
105 |
+
parameters TEXT, -- Number of parameters (can be NULL)
|
106 |
+
description TEXT,
|
107 |
+
created_at TEXT DEFAULT CURRENT_TIMESTAMP,
|
108 |
+
FOREIGN KEY (user_id) REFERENCES users (id),
|
109 |
+
UNIQUE (hf_model_id, user_id)
|
110 |
+
)
|
111 |
+
''')
|
112 |
+
|
113 |
+
# Evaluations table - stores evaluation results
|
114 |
+
cursor.execute('''
|
115 |
+
CREATE TABLE IF NOT EXISTS evaluations (
|
116 |
+
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
117 |
+
model_id INTEGER NOT NULL,
|
118 |
+
benchmark_id INTEGER NOT NULL,
|
119 |
+
status TEXT NOT NULL, -- pending, running, completed, failed
|
120 |
+
results TEXT, -- JSON string of results
|
121 |
+
score REAL, -- Overall score (can be NULL)
|
122 |
+
submitted_at TEXT DEFAULT CURRENT_TIMESTAMP,
|
123 |
+
started_at TEXT,
|
124 |
+
completed_at TEXT,
|
125 |
+
FOREIGN KEY (model_id) REFERENCES models (id),
|
126 |
+
            FOREIGN KEY (benchmark_id) REFERENCES benchmarks (id)
        )
        ''')

        # Queue table - stores evaluation queue
        cursor.execute('''
        CREATE TABLE IF NOT EXISTS queue (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            evaluation_id INTEGER NOT NULL,
            priority INTEGER DEFAULT 0,  -- Higher number = higher priority
            added_at TEXT DEFAULT CURRENT_TIMESTAMP,
            FOREIGN KEY (evaluation_id) REFERENCES evaluations (id)
        )
        ''')

        conn.commit()

    # User management methods
    def add_user(self, username, hf_user_id, is_admin=False):
        """Add a new user to the database."""
        cursor = self.get_cursor()
        conn = self.get_conn()

        try:
            cursor.execute(
                "INSERT INTO users (username, hf_user_id, is_admin) VALUES (?, ?, ?)",
                (username, hf_user_id, is_admin)
            )
            conn.commit()
            return cursor.lastrowid
        except sqlite3.IntegrityError:
            # User already exists
            cursor.execute(
                "SELECT id FROM users WHERE hf_user_id = ?",
                (hf_user_id,)
            )
            row = cursor.fetchone()
            return row[0] if row else None

    def get_user(self, hf_user_id):
        """Get user information by HuggingFace user ID."""
        cursor = self.get_cursor()

        cursor.execute(
            "SELECT * FROM users WHERE hf_user_id = ?",
            (hf_user_id,)
        )
        row = cursor.fetchone()
        return dict(row) if row else None

    def get_user_by_username(self, username):
        """Get user information by username."""
        cursor = self.get_cursor()

        cursor.execute(
            "SELECT * FROM users WHERE username = ?",
            (username,)
        )
        row = cursor.fetchone()
        return dict(row) if row else None

    def can_submit_today(self, user_id):
        """Check if a user can submit a benchmark evaluation today."""
        cursor = self.get_cursor()

        cursor.execute(
            "SELECT is_admin, last_submission_date FROM users WHERE id = ?",
            (user_id,)
        )
        result = cursor.fetchone()

        if not result:
            return False

        user_data = dict(result)

        # Admin can always submit
        if user_data['is_admin']:
            return True

        # If no previous submission, user can submit
        if not user_data['last_submission_date']:
            return True

        # Check if last submission was before today
        last_date = datetime.fromisoformat(user_data['last_submission_date'])
        today = datetime.now().replace(hour=0, minute=0, second=0, microsecond=0)

        return last_date < today

    def update_submission_date(self, user_id):
        """Update the last submission date for a user."""
        cursor = self.get_cursor()
        conn = self.get_conn()

        current_time = datetime.now().isoformat()
        cursor.execute(
            "UPDATE users SET last_submission_date = ? WHERE id = ?",
            (current_time, user_id)
        )
        conn.commit()

    # Benchmark management methods
    def add_benchmark(self, name, dataset_id, description="", metrics=None):
        """Add a new benchmark to the database."""
        cursor = self.get_cursor()
        conn = self.get_conn()

        if metrics is None:
            metrics = {}

        metrics_json = json.dumps(metrics)

        try:
            cursor.execute(
                "INSERT INTO benchmarks (name, dataset_id, description, metrics) VALUES (?, ?, ?, ?)",
                (name, dataset_id, description, metrics_json)
            )
            conn.commit()
            return cursor.lastrowid
        except sqlite3.IntegrityError:
            # Benchmark already exists with this dataset_id
            cursor.execute(
                "SELECT id FROM benchmarks WHERE dataset_id = ?",
                (dataset_id,)
            )
            row = cursor.fetchone()
            return row[0] if row else None

    def get_benchmarks(self):
        """Get all available benchmarks."""
        cursor = self.get_cursor()

        cursor.execute("SELECT * FROM benchmarks")
        benchmarks = [dict(row) for row in cursor.fetchall()]

        # Parse metrics JSON
        for benchmark in benchmarks:
            if benchmark['metrics']:
                benchmark['metrics'] = json.loads(benchmark['metrics'])
            else:
                benchmark['metrics'] = {}

        return benchmarks

    def get_benchmark(self, benchmark_id):
        """Get benchmark information by ID."""
        cursor = self.get_cursor()

        cursor.execute(
            "SELECT * FROM benchmarks WHERE id = ?",
            (benchmark_id,)
        )
        row = cursor.fetchone()
        benchmark = dict(row) if row else None

        if benchmark and benchmark['metrics']:
            benchmark['metrics'] = json.loads(benchmark['metrics'])

        return benchmark

    # Model management methods
    def add_model(self, name, hf_model_id, user_id, tag, parameters=None, description=""):
        """Add a new model to the database."""
        cursor = self.get_cursor()
        conn = self.get_conn()

        try:
            cursor.execute(
                "INSERT INTO models (name, hf_model_id, user_id, tag, parameters, description) VALUES (?, ?, ?, ?, ?, ?)",
                (name, hf_model_id, user_id, tag, parameters, description)
            )
            conn.commit()
            return cursor.lastrowid
        except sqlite3.IntegrityError:
            # Model already exists for this user
            cursor.execute(
                "SELECT id FROM models WHERE hf_model_id = ? AND user_id = ?",
                (hf_model_id, user_id)
            )
            row = cursor.fetchone()
            return row[0] if row else None

    def get_models(self, tag=None):
        """Get all models, optionally filtered by tag."""
        cursor = self.get_cursor()

        if tag and tag.lower() != "all":
            cursor.execute(
                "SELECT * FROM models WHERE tag = ?",
                (tag,)
            )
        else:
            cursor.execute("SELECT * FROM models")

        return [dict(row) for row in cursor.fetchall()]

    def get_model(self, model_id):
        """Get model information by ID."""
        cursor = self.get_cursor()

        cursor.execute(
            "SELECT * FROM models WHERE id = ?",
            (model_id,)
        )
        row = cursor.fetchone()
        return dict(row) if row else None

    # Evaluation management methods
    def add_evaluation(self, model_id, benchmark_id, priority=0):
        """Add a new evaluation to the database and queue."""
        cursor = self.get_cursor()
        conn = self.get_conn()

        # First, add the evaluation
        cursor.execute(
            "INSERT INTO evaluations (model_id, benchmark_id, status) VALUES (?, ?, 'pending')",
            (model_id, benchmark_id)
        )
        evaluation_id = cursor.lastrowid

        # Then, add it to the queue
        cursor.execute(
            "INSERT INTO queue (evaluation_id, priority) VALUES (?, ?)",
            (evaluation_id, priority)
        )

        conn.commit()
        return evaluation_id

    def update_evaluation_status(self, evaluation_id, status, results=None, score=None):
        """Update the status of an evaluation."""
        cursor = self.get_cursor()
        conn = self.get_conn()

        params = [status, evaluation_id]
        sql = "UPDATE evaluations SET status = ?"

        if results is not None:
            sql += ", results = ?"
            params.insert(1, json.dumps(results))

        if score is not None:
            sql += ", score = ?"
            params.insert(1 if results is None else 2, score)

        if status in ['completed', 'failed']:
            sql += ", completed_at = datetime('now')"
        elif status == 'running':
            sql += ", started_at = datetime('now')"

        sql += " WHERE id = ?"

        cursor.execute(sql, params)
        conn.commit()

    def get_next_in_queue(self):
        """Get the next evaluation in the queue."""
        cursor = self.get_cursor()

        cursor.execute("""
        SELECT q.*, e.id as evaluation_id, e.model_id, e.benchmark_id, e.status
        FROM queue q
        JOIN evaluations e ON q.evaluation_id = e.id
        WHERE e.status = 'pending'
        ORDER BY q.priority DESC, q.added_at ASC
        LIMIT 1
        """)

        row = cursor.fetchone()
        return dict(row) if row else None

    def get_evaluation_results(self, model_id=None, benchmark_id=None, tag=None, status=None, limit=None):
        """Get evaluation results, optionally filtered by model, benchmark, tag, or status."""
        cursor = self.get_cursor()

        sql = """
        SELECT e.id, e.model_id, e.benchmark_id, e.status, e.results, e.score,
               e.submitted_at, e.started_at, e.completed_at, m.name as model_name, m.tag,
               b.name as benchmark_name
        FROM evaluations e
        JOIN models m ON e.model_id = m.id
        JOIN benchmarks b ON e.benchmark_id = b.id
        WHERE 1=1
        """

        params = []

        if status:
            sql += " AND e.status = ?"
            params.append(status)

        if model_id:
            sql += " AND e.model_id = ?"
            params.append(model_id)

        if benchmark_id and benchmark_id != "all" and benchmark_id.lower() != "all":
            sql += " AND e.benchmark_id = ?"
            params.append(benchmark_id)

        if tag and tag.lower() != "all":
            sql += " AND m.tag = ?"
            params.append(tag)

        sql += " ORDER BY e.submitted_at DESC"

        if limit:
            sql += " LIMIT ?"
            params.append(limit)

        cursor.execute(sql, params)
        results = [dict(row) for row in cursor.fetchall()]

        # Parse results JSON
        for result in results:
            if result['results']:
                try:
                    result['results'] = json.loads(result['results'])
                except:
                    result['results'] = {}

        return results

    def get_leaderboard_df(self, tag=None, benchmark_id=None):
        """Get a pandas DataFrame of the leaderboard, optionally filtered by tag and benchmark."""
        results = self.get_evaluation_results(tag=tag, benchmark_id=benchmark_id, status="completed")

        if not results:
            return pd.DataFrame()

        # Create a list of dictionaries for the DataFrame
        leaderboard_data = []

        for result in results:
            entry = {
                'model_name': result['model_name'],
                'tag': result['tag'],
                'benchmark_name': result['benchmark_name'],
                'score': result['score'],
                'completed_at': result['completed_at']
            }

            # Add any additional metrics from results
            if result['results'] and isinstance(result['results'], dict):
                for key, value in result['results'].items():
                    if isinstance(value, (int, float)) and key not in entry:
                        entry[key] = value

            leaderboard_data.append(entry)

        # Convert to DataFrame
        df = pd.DataFrame(leaderboard_data)

        # Sort by score (descending)
        if not df.empty and 'score' in df.columns:
            df = df.sort_values('score', ascending=False)

        return df
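The methods above complete the persistence layer: users, benchmarks, models, evaluations, and the queue all flow through this one class. As a quick orientation, here is a minimal, hypothetical usage sketch that only chains calls whose signatures appear in the listing above; the demo user, model, dataset, and score values are invented for illustration and are not part of this commit.

# Hypothetical sketch of the DynamicHighscoresDB API defined above; all values are placeholders.
from database_schema import DynamicHighscoresDB

db = DynamicHighscoresDB()

# Register a benchmark, a user, and a model, then queue an evaluation for the worker.
benchmark_id = db.add_benchmark(
    name="Example Benchmark",                    # placeholder name
    dataset_id="example-org/example-dataset",    # placeholder dataset ID
    description="Illustrative benchmark entry"
)
user_id = db.add_user(username="demo-user", hf_user_id="demo-hf-id")
model_id = db.add_model(
    name="Demo Model",
    hf_model_id="gpt2",      # placeholder HuggingFace model ID
    user_id=user_id,
    tag="General"
)
evaluation_id = db.add_evaluation(model_id=model_id, benchmark_id=benchmark_id)

# A worker would later record the outcome, and the leaderboard reads it back as a DataFrame.
db.update_evaluation_status(evaluation_id, "completed", results={"accuracy": 0.42}, score=42.0)
print(db.get_leaderboard_df())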
evaluation_queue.py
ADDED
@@ -0,0 +1,1122 @@
"""
Model evaluation queue system for Dynamic Highscores.

This module handles the evaluation queue, CPU-only processing,
and enforces daily submission limits for users.
"""

import os
import json
import time
import threading
import queue as queue_module
from datetime import datetime, timedelta
import gradio as gr
from huggingface_hub import HfApi, hf_hub_download, snapshot_download
from datasets import load_dataset
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import sqlite3

class EvaluationQueue:
    """Manages the evaluation queue for model benchmarking."""

    def __init__(self, db_manager, auth_manager):
        """Initialize the evaluation queue manager.

        Args:
            db_manager: Database manager instance
            auth_manager: Authentication manager instance
        """
        self.db_manager = db_manager
        self.auth_manager = auth_manager
        self.hf_api = HfApi()
        self.queue = queue_module.Queue()
        self.is_processing = False
        self.worker_thread = None
        self.model_tags = ["Merge", "Agent", "Reasoning", "Coding", "General", "Specialized", "Instruction", "Chat"]
        self.current_evaluation = None
        self.progress = 0
        self.progress_lock = threading.Lock()
        # Memory limit for models in GB (leave 2GB for system)
        self.memory_limit_gb = 14.0

    def start_worker(self):
        """Start the worker thread for processing the evaluation queue."""
        if self.worker_thread is None or not self.worker_thread.is_alive():
            self.is_processing = True
            self.worker_thread = threading.Thread(target=self._process_queue)
            self.worker_thread.daemon = True
            self.worker_thread.start()

    def stop_worker(self):
        """Stop the worker thread."""
        self.is_processing = False
        if self.worker_thread and self.worker_thread.is_alive():
            self.worker_thread.join(timeout=1.0)

    def check_model_size(self, model_id):
        """Check if a model will fit within RAM limitations.

        Args:
            model_id: HuggingFace model ID

        Returns:
            tuple: (will_fit, message)
        """
        try:
            # Query model info from the HuggingFace API
            model_info_obj = self.hf_api.model_info(model_id)

            # Initialize total size
            total_size_gb = 0

            # Try different approaches to get model size based on API response structure
            if hasattr(model_info_obj, 'safetensors') and model_info_obj.safetensors:
                # New API format with safetensors dict
                for file_info in model_info_obj.safetensors.values():
                    if hasattr(file_info, 'size'):
                        total_size_gb += file_info.size / (1024 * 1024 * 1024)
                    elif isinstance(file_info, dict) and 'size' in file_info:
                        total_size_gb += file_info['size'] / (1024 * 1024 * 1024)

            # Fallback to siblings method
            if total_size_gb == 0 and hasattr(model_info_obj, 'siblings'):
                for sibling in model_info_obj.siblings:
                    if hasattr(sibling, 'size'):
                        if sibling.rfilename.endswith(('.bin', '.safetensors', '.pt')):
                            total_size_gb += sibling.size / (1024 * 1024 * 1024)
                    elif isinstance(sibling, dict) and 'size' in sibling:
                        if sibling.get('rfilename', '').endswith(('.bin', '.safetensors', '.pt')):
                            total_size_gb += sibling['size'] / (1024 * 1024 * 1024)

            # If we still couldn't determine size, try a reasonable guess based on model name
            if total_size_gb == 0:
                # Try to guess from model name (e.g., if it has "7b" in the name)
                model_name = model_id.lower()
                size_indicators = {
                    "1b": 1, "2b": 2, "3b": 3, "5b": 5, "7b": 7, "8b": 8,
                    "10b": 10, "13b": 13, "20b": 20, "30b": 30, "65b": 65, "70b": 70
                }

                for indicator, size in size_indicators.items():
                    if indicator in model_name.replace("-", "").replace("_", ""):
                        total_size_gb = size * 2  # Rough estimate: param count × 2 for size in GB
                        break

            # If we still couldn't determine size, use a default
            if total_size_gb == 0:
                # Try direct API method
                try:
                    print(f"Checking model size with direct method for {model_id}")
                    # Print out the entire structure for debugging
                    print(f"Model info: {model_info_obj.__dict__}")

                    # Default to a conservative estimate
                    total_size_gb = 5  # Assume a 5GB model as default
                except Exception as e:
                    print(f"Direct size check failed: {e}")
                    return True, "Unable to determine model size accurately, but allowing submission with caution"

            # Account for memory overhead
            estimated_ram_needed = total_size_gb * 1.3  # 30% overhead

            # Check against limit
            if estimated_ram_needed > self.memory_limit_gb:
                return False, f"Model is too large (approximately {total_size_gb:.1f}GB, needs {estimated_ram_needed:.1f}GB RAM). Maximum allowed is {self.memory_limit_gb}GB."

            return True, f"Model size check passed ({total_size_gb:.1f}GB, estimated {estimated_ram_needed:.1f}GB RAM usage)"

        except Exception as e:
            print(f"Model size check error: {e}")
            # Log more details for debugging
            import traceback
            traceback.print_exc()

            # Allow submission with warning
            return True, f"Warning: Could not verify model size ({str(e)}). Please ensure your model is under {self.memory_limit_gb}GB."

    def _process_queue(self):
        """Process the evaluation queue in a separate thread."""
        while self.is_processing:
            try:
                # Get the next evaluation from the database
                pending_evals = self.db_manager.get_evaluation_results(status="pending")

                if pending_evals:
                    # Sort by priority and added_at
                    next_eval = pending_evals[0]

                    # Update status to running
                    self.db_manager.update_evaluation_status(next_eval['id'], 'running')

                    # Set current evaluation and reset progress
                    with self.progress_lock:
                        self.current_evaluation = next_eval
                        self.progress = 0

                    try:
                        # Get model and benchmark details
                        model_info = self.db_manager.get_model(next_eval['model_id'])
                        benchmark_info = self.db_manager.get_benchmark(next_eval['benchmark_id'])

                        if model_info and benchmark_info:
                            # Check if model will fit in memory
                            will_fit, message = self.check_model_size(model_info['hf_model_id'])

                            if not will_fit:
                                raise Exception(f"Model too large for evaluation: {message}")

                            # Run the evaluation
                            results = self._run_evaluation(
                                model_info['hf_model_id'],
                                benchmark_info['dataset_id']
                            )

                            # Calculate overall score
                            score = self._calculate_overall_score(results)

                            # Update status to completed with results
                            self.db_manager.update_evaluation_status(
                                next_eval['id'],
                                'completed',
                                results=results,
                                score=score
                            )
                        else:
                            raise Exception("Model or benchmark not found")
                    except Exception as e:
                        print(f"Evaluation error: {e}")
                        # Update status to failed with error message
                        error_results = {"error": str(e)}
                        self.db_manager.update_evaluation_status(
                            next_eval['id'],
                            'failed',
                            results=error_results
                        )

                    # Clear current evaluation
                    with self.progress_lock:
                        self.current_evaluation = None
                        self.progress = 0
                else:
                    # No evaluations in queue, sleep for a bit
                    time.sleep(5)
            except Exception as e:
                print(f"Queue processing error: {e}")
                time.sleep(5)

    def _run_evaluation(self, model_id, dataset_id):
        """Run an evaluation for a model on a benchmark.

        Args:
            model_id: HuggingFace model ID
            dataset_id: HuggingFace dataset ID (with optional config)

        Returns:
            dict: Evaluation results
        """
        # Update progress
        with self.progress_lock:
            self.progress = 5  # Starting evaluation

        # Parse dataset ID and config
        if ":" in dataset_id:
            dataset_id, config = dataset_id.split(":", 1)
        else:
            config = None

        # Update progress
        with self.progress_lock:
            self.progress = 10  # Loading dataset

        # Load the dataset
        try:
            if config:
                dataset = load_dataset(dataset_id, config, split="test")
            else:
                dataset = load_dataset(dataset_id, split="test")
        except Exception as e:
            return {"error": f"Failed to load dataset: {str(e)}"}

        # Update progress
        with self.progress_lock:
            self.progress = 20  # Loading model

        try:
            # Load the model with memory optimization settings
            device = "cpu"
            model = AutoModelForCausalLM.from_pretrained(
                model_id,
                device_map=device,
                torch_dtype=torch.float32,  # Use float32 for CPU
                low_cpu_mem_usage=True,  # Enable memory optimization
                offload_folder="offload",  # Enable offloading if needed
                offload_state_dict=True,  # Offload state dict for memory saving
                max_memory={0: f"{self.memory_limit_gb}GB"}  # Limit memory usage
            )
            tokenizer = AutoTokenizer.from_pretrained(model_id)
        except Exception as e:
            print(f"Model loading error: {e}")
            return {"error": f"Failed to load model: {str(e)}"}

        # Update progress
        with self.progress_lock:
            self.progress = 30  # Determining task type

        # Determine task type based on dataset features
        task_type = self._determine_task_type(dataset)

        # Update progress
        with self.progress_lock:
            self.progress = 40  # Starting evaluation

        try:
            # Run appropriate evaluation based on task type
            if task_type == "text-generation":
                results = self._evaluate_text_generation(model, tokenizer, dataset)
            elif task_type == "question-answering":
                results = self._evaluate_question_answering(model, tokenizer, dataset)
            elif task_type == "classification":
                results = self._evaluate_classification(model, tokenizer, dataset)
            elif task_type == "code-generation":
                results = self._evaluate_code_generation(model, tokenizer, dataset)
            else:
                # Default to general evaluation
                results = self._evaluate_general(model, tokenizer, dataset)
        except Exception as e:
            print(f"Evaluation task error: {e}")
            return {"error": f"Evaluation failed: {str(e)}"}

        # Update progress
        with self.progress_lock:
            self.progress = 95  # Cleaning up

        # Clean up to free memory
        del model
        del tokenizer
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        # Update progress
        with self.progress_lock:
            self.progress = 100  # Completed

        return results

    def get_current_progress(self):
        """Get the current evaluation progress.

        Returns:
            tuple: (current_evaluation, progress_percentage)
        """
        with self.progress_lock:
            return self.current_evaluation, self.progress

    def _determine_task_type(self, dataset):
        """Determine the task type based on dataset features.

        Args:
            dataset: HuggingFace dataset

        Returns:
            str: Task type
        """
        features = dataset.features

        # Check for common feature patterns
        if "question" in features and "answer" in features:
            return "question-answering"
        elif "code" in features or "solution" in features:
            return "code-generation"
        elif "label" in features or "class" in features:
            return "classification"
        elif "input" in features and "output" in features:
            return "text-generation"
        else:
            return "general"

    def _evaluate_text_generation(self, model, tokenizer, dataset):
        """Evaluate a model on text generation tasks.

        Args:
            model: HuggingFace model
            tokenizer: HuggingFace tokenizer
            dataset: HuggingFace dataset

        Returns:
            dict: Evaluation results
        """
        # Set up generation pipeline
        generator = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            device="cpu"
        )

        # Sample a subset for evaluation (to keep runtime reasonable)
        if len(dataset) > 100:
            dataset = dataset.select(range(100))

        # Track metrics
        correct = 0
        total = 0
        generated_texts = []

        # Process each example
        for i, example in enumerate(dataset):
            # Update progress based on completion percentage
            with self.progress_lock:
                self.progress = 40 + int((i / len(dataset)) * 50)

            input_text = example.get("input", example.get("prompt", ""))
            expected_output = example.get("output", example.get("target", ""))

            if not input_text or not expected_output:
                continue

            # Generate text
            generated = generator(
                input_text,
                max_length=100,
                num_return_sequences=1
            )

            generated_text = generated[0]["generated_text"]
            generated_texts.append(generated_text)

            # Simple exact match check
            if expected_output.strip() in generated_text:
                correct += 1

            total += 1

        # Calculate metrics
        accuracy = correct / total if total > 0 else 0

        return {
            "accuracy": accuracy,
            "samples_evaluated": total,
            "generated_samples": generated_texts[:5]  # Include a few samples
        }

    def _evaluate_question_answering(self, model, tokenizer, dataset):
        """Evaluate a model on question answering tasks.

        Args:
            model: HuggingFace model
            tokenizer: HuggingFace tokenizer
            dataset: HuggingFace dataset

        Returns:
            dict: Evaluation results
        """
        # Set up QA pipeline
        qa_pipeline = pipeline(
            "question-answering",
            model=model,
            tokenizer=tokenizer,
            device="cpu"
        )

        # Sample a subset for evaluation
        if len(dataset) > 100:
            dataset = dataset.select(range(100))

        # Track metrics
        exact_matches = 0
        f1_scores = []
        total = 0

        # Process each example
        for i, example in enumerate(dataset):
            # Update progress based on completion percentage
            with self.progress_lock:
                self.progress = 40 + int((i / len(dataset)) * 50)

            question = example.get("question", "")
            context = example.get("context", "")
            answer = example.get("answer", "")

            if not question or not answer:
                continue

            # Get model prediction
            if context:
                result = qa_pipeline(question=question, context=context)
            else:
                # If no context provided, use the question as context
                result = qa_pipeline(question=question, context=question)

            predicted_answer = result["answer"]

            # Calculate exact match
            if predicted_answer.strip() == answer.strip():
                exact_matches += 1

            # Calculate F1 score
            f1 = self._calculate_f1(answer, predicted_answer)
            f1_scores.append(f1)

            total += 1

        # Calculate metrics
        exact_match_accuracy = exact_matches / total if total > 0 else 0
        avg_f1 = sum(f1_scores) / len(f1_scores) if f1_scores else 0

        return {
            "exact_match": exact_match_accuracy,
            "f1": avg_f1,
            "samples_evaluated": total
        }

    def _evaluate_classification(self, model, tokenizer, dataset):
        """Evaluate a model on classification tasks.

        Args:
            model: HuggingFace model
            tokenizer: HuggingFace tokenizer
            dataset: HuggingFace dataset

        Returns:
            dict: Evaluation results
        """
        # Set up classification pipeline
        classifier = pipeline(
            "text-classification",
            model=model,
            tokenizer=tokenizer,
            device="cpu"
        )

        # Sample a subset for evaluation
        if len(dataset) > 100:
            dataset = dataset.select(range(100))

        # Track metrics
        correct = 0
        total = 0

        # Process each example
        for i, example in enumerate(dataset):
            # Update progress based on completion percentage
            with self.progress_lock:
                self.progress = 40 + int((i / len(dataset)) * 50)

            text = example.get("text", example.get("sentence", ""))
            label = str(example.get("label", example.get("class", "")))

            if not text or not label:
                continue

            # Get model prediction
            result = classifier(text)
            predicted_label = result[0]["label"]

            # Check if correct
            if str(predicted_label) == label:
                correct += 1

            total += 1

        # Calculate metrics
        accuracy = correct / total if total > 0 else 0

        return {
            "accuracy": accuracy,
            "samples_evaluated": total
        }

    def _evaluate_code_generation(self, model, tokenizer, dataset):
        """Evaluate a model on code generation tasks.

        Args:
            model: HuggingFace model
            tokenizer: HuggingFace tokenizer
            dataset: HuggingFace dataset

        Returns:
            dict: Evaluation results
        """
        # Set up generation pipeline
        generator = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            device="cpu"
        )

        # Sample a subset for evaluation
        if len(dataset) > 50:  # Smaller sample for code tasks
            dataset = dataset.select(range(50))

        # Track metrics
        exact_matches = 0
        functional_matches = 0
        total = 0

        # Process each example
        for i, example in enumerate(dataset):
            # Update progress based on completion percentage
            with self.progress_lock:
                self.progress = 40 + int((i / len(dataset)) * 50)

            prompt = example.get("prompt", example.get("input", ""))
            solution = example.get("solution", example.get("output", ""))

            if not prompt or not solution:
                continue

            # Generate code
            generated = generator(
                prompt,
                max_length=200,
                num_return_sequences=1
            )

            generated_code = generated[0]["generated_text"]

            # Extract code from generated text (remove prompt)
            if prompt in generated_code:
                generated_code = generated_code[len(prompt):].strip()

            # Check exact match
            if generated_code.strip() == solution.strip():
                exact_matches += 1
                functional_matches += 1
            else:
                # We would ideally check functional correctness here
                # but that requires executing code which is complex and potentially unsafe
                # For now, we'll use a simple heuristic
                if len(generated_code) > 0 and any(keyword in generated_code for keyword in ["def ", "function", "return", "class"]):
                    functional_matches += 0.5  # Partial credit

            total += 1

        # Calculate metrics
        exact_match_rate = exact_matches / total if total > 0 else 0
        functional_correctness = functional_matches / total if total > 0 else 0

        return {
            "exact_match": exact_match_rate,
            "functional_correctness": functional_correctness,
            "samples_evaluated": total
        }

    def _evaluate_general(self, model, tokenizer, dataset):
        """General evaluation for any dataset type.

        Args:
            model: HuggingFace model
            tokenizer: HuggingFace tokenizer
            dataset: HuggingFace dataset

        Returns:
            dict: Evaluation results
        """
        # Set up generation pipeline
        generator = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            device="cpu"
        )

        # Sample a subset for evaluation
        if len(dataset) > 50:
            dataset = dataset.select(range(50))

        # Find input and output fields
        features = dataset.features
        input_field = None
        output_field = None

        for field in features:
            if field.lower() in ["input", "prompt", "question", "text"]:
                input_field = field
            elif field.lower() in ["output", "target", "answer", "response"]:
                output_field = field

        if not input_field:
            # Just use the first string field as input
            for field in features:
                if isinstance(features[field], (str, list)):
                    input_field = field
                    break

        # Track metrics
        total = 0
        generated_texts = []

        # Process each example
        for i, example in enumerate(dataset):
            # Update progress based on completion percentage
            with self.progress_lock:
                self.progress = 40 + int((i / len(dataset)) * 50)

            if input_field and input_field in example:
                input_text = str(example[input_field])

                # Generate text
                generated = generator(
                    input_text,
                    max_length=100,
                    num_return_sequences=1
                )

                generated_text = generated[0]["generated_text"]
                generated_texts.append({
                    "input": input_text,
                    "output": generated_text,
                    "expected": str(example[output_field]) if output_field and output_field in example else "N/A"
                })

                total += 1

        return {
            "samples_evaluated": total,
            "generated_samples": generated_texts[:5]  # Include a few samples
        }

    def _calculate_f1(self, answer, prediction):
        """Calculate F1 score between answer and prediction.

        Args:
            answer: Ground truth answer
            prediction: Model prediction

        Returns:
            float: F1 score
        """
        # Tokenize
        answer_tokens = answer.lower().split()
        prediction_tokens = prediction.lower().split()

        # Calculate precision and recall
        common_tokens = set(answer_tokens) & set(prediction_tokens)

        if not common_tokens:
            return 0.0

        precision = len(common_tokens) / len(prediction_tokens)
        recall = len(common_tokens) / len(answer_tokens)

        # Calculate F1
        if precision + recall == 0:
            return 0.0

        f1 = 2 * precision * recall / (precision + recall)
        return f1

    def _calculate_overall_score(self, results):
        """Calculate an overall score from evaluation results.

        Args:
            results: Evaluation results dictionary

        Returns:
            float: Overall score between 0 and 100
        """
        # If there was an error, return a low score
        if "error" in results:
            return 0.0

        score = 0.0

        # Check for common metrics and weight them
        if "accuracy" in results:
            score += results["accuracy"] * 100

        if "exact_match" in results:
            score += results["exact_match"] * 100

        if "f1" in results:
            score += results["f1"] * 100

        if "functional_correctness" in results:
            score += results["functional_correctness"] * 100

        # If multiple metrics were found, average them
        num_metrics = sum(1 for metric in ["accuracy", "exact_match", "f1", "functional_correctness"] if metric in results)

        if num_metrics > 0:
            score /= num_metrics
        else:
            # Default score if no metrics available
            score = 50.0

        return score

    def submit_evaluation(self, model_id, benchmark_id, user_id, priority=0):
        """Submit a model for evaluation on a benchmark.

        Args:
            model_id: Model ID in the database
            benchmark_id: Benchmark ID in the database
            user_id: User ID submitting the evaluation
            priority: Queue priority (higher = higher priority)

        Returns:
            tuple: (evaluation_id, message)
        """
        # Check if user can submit today
        if not self.auth_manager.can_submit_benchmark(user_id):
            return None, "Daily submission limit reached. Try again tomorrow."

        try:
            # Get model HuggingFace ID to check size
            model_info = self.db_manager.get_model(model_id)
            if not model_info:
                return None, "Model not found in database."

            # Check if model will fit in memory
            will_fit, message = self.check_model_size(model_info['hf_model_id'])

            if not will_fit:
                return None, message

            # Add evaluation to database and queue
            evaluation_id = self.db_manager.add_evaluation(
                model_id=model_id,
                benchmark_id=benchmark_id,
                priority=priority
            )

            # Update user's last submission date
            self.auth_manager.update_submission_date(user_id)

            # Make sure worker is running
            self.start_worker()

            return evaluation_id, f"Evaluation submitted successfully. {message}"
        except Exception as e:
            print(f"Submit evaluation error: {e}")
            return None, f"Failed to submit evaluation: {str(e)}"

    def get_queue_status(self):
        """Get the current status of the evaluation queue.

        Returns:
            dict: Queue status information
        """
        try:
            # Get evaluations from database
            pending_evals = self.db_manager.get_evaluation_results(status="pending")
            running_evals = self.db_manager.get_evaluation_results(status="running")
            completed_evals = self.db_manager.get_evaluation_results(status="completed")
            failed_evals = self.db_manager.get_evaluation_results(status="failed")

            # Get current evaluation progress
            current_eval, progress = self.get_current_progress()

            return {
                "pending": len(pending_evals),
                "running": len(running_evals),
                "completed": len(completed_evals),
                "failed": len(failed_evals),
                "is_processing": self.is_processing,
                "current_evaluation": current_eval,
                "progress": progress,
                "memory_limit_gb": self.memory_limit_gb
            }
        except Exception as e:
            print(f"Queue status error: {e}")
            return {
                "pending": 0,
                "running": 0,
                "completed": 0,
                "failed": 0,
                "is_processing": self.is_processing,
                "current_evaluation": None,
                "progress": 0,
                "memory_limit_gb": self.memory_limit_gb,
                "error": str(e)
            }

# Model submission UI components
def create_model_submission_ui(evaluation_queue, auth_manager, db_manager):
    """Create the model submission UI components.

    Args:
        evaluation_queue: Evaluation queue instance
        auth_manager: Authentication manager instance
        db_manager: Database manager instance

    Returns:
        gr.Blocks: Gradio Blocks component with model submission UI
    """
    with gr.Blocks() as submission_ui:
        # Store user authentication state
        user_state = gr.State(None)

        # Check authentication on load
        def check_auth_on_load(request: gr.Request):
            if request:
                # Special handling for HF Spaces OAuth
                if 'SPACE_ID' in os.environ:
                    username = request.headers.get("HF-User")
                    if username:
                        user = db_manager.get_user_by_username(username)
                        if user:
                            print(f"User authenticated via HF Spaces OAuth: {username}")
                            return user
                else:
                    # Standard token-based auth
                    user = auth_manager.check_login(request)
                    if user:
                        return user
            return None

        with gr.Tab("Submit Model"):
            gr.Markdown(f"""
            ### Model Size Restrictions

            Models must fit within {evaluation_queue.memory_limit_gb}GB of RAM for evaluation.
            Large models will be rejected to ensure all evaluations can complete successfully.
            """, elem_classes=["info-text"])

            with gr.Row():
                with gr.Column(scale=2):
                    model_id_input = gr.Textbox(
                        placeholder="HuggingFace model ID (e.g., 'gpt2', 'facebook/opt-350m')",
                        label="Model ID"
                    )

                    check_size_button = gr.Button("Check Model Size")
                    size_check_result = gr.Markdown("")
                    model_name_input = gr.Textbox(
                        placeholder="Display name for your model",
                        label="Model Name"
                    )

                    model_description_input = gr.Textbox(
                        placeholder="Brief description of your model",
                        label="Description",
                        lines=3
                    )

                    model_parameters_input = gr.Number(
                        label="Number of Parameters (billions)",
                        precision=2
                    )

                with gr.Column(scale=1):
                    model_tag_input = gr.Dropdown(
                        choices=evaluation_queue.model_tags,
                        label="Model Tag",
                        info="Select one category that best describes your model"
                    )

                    # Fixed benchmark dropdown to properly show names
                    benchmark_dropdown = gr.Dropdown(
                        label="Benchmark",
                        info="Select a benchmark to evaluate your model on",
                        choices=[("none", "Loading benchmarks...")],
                        value=None
                    )

                    refresh_benchmarks_button = gr.Button("Refresh Benchmarks")

            submit_model_button = gr.Button("Submit for Evaluation")
            submission_status = gr.Markdown("")
            auth_message = gr.Markdown("")

        with gr.Tab("Evaluation Queue"):
            refresh_queue_button = gr.Button("Refresh Queue")

            with gr.Row():
                with gr.Column(scale=1):
                    queue_stats = gr.JSON(
                        label="Queue Statistics"
                    )

                with gr.Column(scale=2):
                    queue_status = gr.Dataframe(
                        headers=["ID", "Model", "Benchmark", "Status", "Submitted"],
                        label="Recent Evaluations"
                    )

            with gr.Row(visible=True) as progress_container:
                with gr.Column():
                    current_eval_info = gr.Markdown("No evaluation currently running")
                    # Use a simple text display for progress instead of Progress component
                    progress_display = gr.Markdown("Progress: 0%")

        # Event handlers
        def check_model_size_handler(model_id):
            if not model_id:
                return "Please enter a HuggingFace model ID."

            try:
                will_fit, message = evaluation_queue.check_model_size(model_id)

                if will_fit:
                    return f"✅ {message}"
                else:
                    return f"❌ {message}"
            except Exception as e:
                print(f"Model size check error: {e}")
                import traceback
                traceback.print_exc()
                return f"Error checking model size: {str(e)}"

        def refresh_benchmarks_handler():
            benchmarks = db_manager.get_benchmarks()

            # Format for dropdown - properly formatted to display names
            choices = []
            for b in benchmarks:
                # Add as tuple of (id, name) to ensure proper display
                choices.append((str(b["id"]), b["name"]))

            if not choices:
                choices = [("none", "No benchmarks available - add some first")]

            return gr.update(choices=choices)

        def submit_model_handler(model_id, model_name, model_description, model_parameters, model_tag, benchmark_id, user):
            # Check if user is logged in
            if not user:
                return "Please log in to submit a model."

            if not model_id or not model_name or not model_tag or not benchmark_id:
                return "Please fill in all required fields."

            if benchmark_id == "none":
                return "Please select a valid benchmark."

            try:
                # Check if model will fit in RAM
                will_fit, size_message = evaluation_queue.check_model_size(model_id)

                if not will_fit:
                    return f"❌ {size_message}"

                # Add model to database
                model_db_id = db_manager.add_model(
                    name=model_name,
                    hf_model_id=model_id,
                    user_id=user["id"],
                    tag=model_tag,
                    parameters=str(model_parameters) if model_parameters else None,
                    description=model_description
                )

                if not model_db_id:
                    return "Failed to add model to database."

                # Submit for evaluation
                eval_id, message = evaluation_queue.submit_evaluation(
                    model_id=model_db_id,
                    benchmark_id=benchmark_id,
                    user_id=user["id"]
                )

                if eval_id:
                    return f"✅ Model submitted successfully. {size_message}\nEvaluation ID: {eval_id}"
                else:
                    return message
            except Exception as e:
                print(f"Error submitting model: {str(e)}")
                import traceback
                traceback.print_exc()
                return f"Error submitting model: {str(e)}"

        def refresh_queue_handler():
            # Get queue statistics
            stats = evaluation_queue.get_queue_status()

            # Get recent evaluations (all statuses, limited to 20)
            evals = db_manager.get_evaluation_results(limit=20)

            # Format for dataframe
            eval_data = []
            for eval in evals:
                eval_data.append([
                    eval["id"],
                    eval["model_name"],
                    eval["benchmark_name"],
                    eval["status"],
                    eval["submitted_at"]
                ])

            # Also update progress display
            current_eval, progress = evaluation_queue.get_current_progress()
            if current_eval:
                model_info = db_manager.get_model(current_eval['model_id'])
                benchmark_info = db_manager.get_benchmark(current_eval['benchmark_id'])

                if model_info and benchmark_info:
                    eval_info = f"**Currently Evaluating:** {model_info['name']} on {benchmark_info['name']}"
                    progress_text = f"Progress: {progress}%"
                    return stats, eval_data, eval_info, progress_text

            return stats, eval_data, "No evaluation currently running", "Progress: 0%"

        # Update authentication status
        def update_auth_message(user):
            if user:
                return f"Logged in as {user['username']}"
            else:
                return "Please log in to submit a model."

        # Connect event handlers
        check_size_button.click(
            fn=check_model_size_handler,
            inputs=[model_id_input],
            outputs=[size_check_result]
        )

        refresh_benchmarks_button.click(
            fn=refresh_benchmarks_handler,
            inputs=[],
            outputs=[benchmark_dropdown]
        )

        submit_model_button.click(
            fn=submit_model_handler,
            inputs=[
                model_id_input,
                model_name_input,
                model_description_input,
                model_parameters_input,
                model_tag_input,
                benchmark_dropdown,
                user_state
            ],
            outputs=[submission_status]
        )

        refresh_queue_button.click(
            fn=refresh_queue_handler,
            inputs=[],
            outputs=[queue_stats, queue_status, current_eval_info, progress_display]
        )

        # Initialize on load
        submission_ui.load(
            fn=check_auth_on_load,
            inputs=[],
            outputs=[user_state]
        )

        submission_ui.load(
            fn=lambda user: update_auth_message(user),
            inputs=[user_state],
            outputs=[auth_message]
        )

        submission_ui.load(
            fn=refresh_benchmarks_handler,
            inputs=[],
            outputs=[benchmark_dropdown]
        )

        submission_ui.load(
            fn=refresh_queue_handler,
            inputs=[],
            outputs=[queue_stats, queue_status, current_eval_info, progress_display]
        )

    return submission_ui
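Before the leaderboard module, a short note on how the queue above is driven: `submit_evaluation` gates on the auth manager's daily limit, records the rows through the database manager, and lazily starts the background worker, while the UI polls `get_queue_status` for progress. The sketch below is a minimal, hypothetical driver assuming the managers are wired up the same way app.py does; the numeric model, benchmark, and user IDs are placeholders, not values from this commit.

# Hypothetical driver for the EvaluationQueue defined above; IDs and the model name are placeholders.
from database_schema import DynamicHighscoresDB
from auth import HuggingFaceAuth
from evaluation_queue import EvaluationQueue

db = DynamicHighscoresDB()
auth = HuggingFaceAuth(db)
queue = EvaluationQueue(db, auth)

# Optional pre-check, mirroring what the submission UI does before accepting a model.
will_fit, size_message = queue.check_model_size("gpt2")
print(size_message)

# Submitting enforces the daily limit, adds the evaluation, and starts the worker thread.
eval_id, message = queue.submit_evaluation(model_id=1, benchmark_id=1, user_id=1)
print(eval_id, message)

# The queue tab polls this dict to render statistics and current progress.
print(queue.get_queue_status())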
leaderboard.py
ADDED
@@ -0,0 +1,396 @@
"""
Leaderboard module for Dynamic Highscores system.

This module implements the unified leaderboard with tag-based filtering
for displaying all evaluated models.
"""

import os
import json
import pandas as pd
import gradio as gr
import plotly.express as px
import plotly.graph_objects as go

class Leaderboard:
    """Manages the unified leaderboard with filtering capabilities."""

    def __init__(self, db_manager):
        """Initialize the leaderboard manager.

        Args:
            db_manager: Database manager instance
        """
        self.db_manager = db_manager
        self.model_tags = ["All", "Merge", "Agent", "Reasoning", "Coding", "General", "Specialized", "Instruction", "Chat"]

        # Define color scheme for tags
        self.tag_colors = {
            "Merge": "#FF6B6B",
            "Agent": "#4ECDC4",
            "Reasoning": "#FFD166",
            "Coding": "#6B5B95",
            "General": "#88D8B0",
            "Specialized": "#FF8C42",
            "Instruction": "#5D9CEC",
            "Chat": "#AC92EB"
        }

    def get_leaderboard_data(self, tag=None, benchmark_id=None):
        """Get leaderboard data, optionally filtered by tag or benchmark.

        Args:
            tag: Model tag to filter by (None for all)
            benchmark_id: Benchmark ID to filter by (None for all)

        Returns:
            pd.DataFrame: Leaderboard data
        """
        # Get evaluation results from database
        if tag and tag != "All":
            df = self.db_manager.get_leaderboard_df(tag=tag, benchmark_id=benchmark_id)
        else:
            df = self.db_manager.get_leaderboard_df(benchmark_id=benchmark_id)

        return df

    def format_leaderboard_for_display(self, df):
        """Format leaderboard data for display.

        Args:
            df: Leaderboard DataFrame

        Returns:
            pd.DataFrame: Formatted leaderboard for display
        """
        if df.empty:
            return pd.DataFrame(columns=['Model', 'Benchmark', 'Tag', 'Score', 'Completed'])

        # Select and rename columns for display
        display_df = df[['model_name', 'benchmark_name', 'tag', 'score', 'completed_at']].copy()
        display_df.columns = ['Model', 'Benchmark', 'Tag', 'Score', 'Completed']

        # Round score to 2 decimal places
        display_df['Score'] = display_df['Score'].round(2)

        # Sort by score (descending)
        display_df = display_df.sort_values('Score', ascending=False)

        return display_df

    def create_performance_chart(self, df, chart_type="bar"):
        """Create a performance chart from leaderboard data.

        Args:
            df: Leaderboard DataFrame
            chart_type: Type of chart to create ("bar" or "scatter")

        Returns:
            plotly.graph_objects.Figure: Performance chart
        """
        if df.empty:
            # Return empty figure
            fig = go.Figure()
            fig.update_layout(
                title="No data available",
                xaxis_title="Model",
                yaxis_title="Score"
            )
            return fig

        # Prepare data for visualization
        plot_df = df[['model_name', 'benchmark_name', 'tag', 'score']].copy()
        plot_df.columns = ['Model', 'Benchmark', 'Tag', 'Score']

        # Create chart based on type
        if chart_type == "scatter":
            fig = px.scatter(
                plot_df,
                x="Model",
                y="Score",
                color="Tag",
                symbol="Benchmark",
                size="Score",
                hover_data=["Model", "Benchmark", "Score"],
                color_discrete_map=self.tag_colors
            )
        else:  # Default to bar chart
            fig = px.bar(
                plot_df,
                x="Model",
                y="Score",
                color="Tag",
                barmode="group",
                hover_data=["Model", "Benchmark", "Score"],
                color_discrete_map=self.tag_colors
            )

        # Customize layout
        fig.update_layout(
            title="Model Performance Comparison",
            xaxis_title="Model",
            yaxis_title="Score",
            legend_title="Tag",
            font=dict(size=12)
        )

        return fig

    def create_tag_distribution_chart(self, df):
        """Create a chart showing distribution of models by tag.

        Args:
            df: Leaderboard DataFrame

        Returns:
            plotly.graph_objects.Figure: Tag distribution chart
        """
        if df.empty:
            # Return empty figure
            fig = go.Figure()
            fig.update_layout(
                title="No data available",
                xaxis_title="Tag",
                yaxis_title="Count"
            )
            return fig

        # Count models by tag
        tag_counts = df['tag'].value_counts().reset_index()
        tag_counts.columns = ['Tag', 'Count']

        # Create pie chart
        fig = px.pie(
            tag_counts,
            names='Tag',
            values='Count',
            title='Model Distribution by Tag',
            color='Tag',
            color_discrete_map=self.tag_colors
        )

        # Customize layout
        fig.update_layout(
            font=dict(size=12)
        )

        return fig

    def create_benchmark_comparison_chart(self, df):
        """Create a chart comparing performance across benchmarks.

        Args:
            df: Leaderboard DataFrame

        Returns:
            plotly.graph_objects.Figure: Benchmark comparison chart
        """
        if df.empty:
            # Return empty figure
            fig = go.Figure()
            fig.update_layout(
                title="No data available",
                xaxis_title="Benchmark",
                yaxis_title="Average Score"
            )
            return fig

        # Calculate average score by benchmark
        benchmark_avg = df.groupby('benchmark_name')['score'].mean().reset_index()
        benchmark_avg.columns = ['Benchmark', 'Average Score']

        # Create bar chart
        fig = px.bar(
            benchmark_avg,
            x='Benchmark',
            y='Average Score',
            title='Average Performance by Benchmark',
            color='Benchmark'
        )

        # Customize layout
        fig.update_layout(
            xaxis_title="Benchmark",
            yaxis_title="Average Score",
            font=dict(size=12)
        )

        return fig

# Leaderboard UI components
def create_leaderboard_ui(leaderboard, db_manager):
    """Create the leaderboard UI components.

    Args:
        leaderboard: Leaderboard instance
        db_manager: Database manager instance

    Returns:
        gr.Blocks: Gradio Blocks component with leaderboard UI
    """
    with gr.Blocks() as leaderboard_ui:
        gr.Markdown("# Dynamic Highscores Leaderboard")

        with gr.Row():
            with gr.Column(scale=1):
                tag_filter = gr.Dropdown(
                    choices=leaderboard.model_tags,
                    value="All",
                    label="Filter by Tag"
                )

                benchmark_filter = gr.Dropdown(
                    choices=[("all", "All Benchmarks")],
                    value="all",
                    label="Filter by Benchmark"
                )

                refresh_button = gr.Button("Refresh Leaderboard")

            with gr.Column(scale=2):
                chart_type = gr.Radio(
                    choices=["bar", "scatter"],
                    value="bar",
                    label="Chart Type"
                )

                view_type = gr.Radio(
                    choices=["Table", "Chart", "Dashboard"],
                    value="Table",
                    label="View Type"
                )

        # Table view
        leaderboard_table = gr.Dataframe(
            headers=["Model", "Benchmark", "Tag", "Score", "Completed"],
            label="Leaderboard",
            visible=True
        )

        # Chart view
        with gr.Row(visible=False) as chart_view:
            performance_chart = gr.Plot(label="Performance Chart")

        # Dashboard view
        with gr.Row(visible=False) as dashboard_view:
            with gr.Column(scale=2):
                dashboard_performance_chart = gr.Plot(label="Performance Comparison")

            with gr.Column(scale=1):
                with gr.Row():
                    tag_distribution_chart = gr.Plot(label="Model Distribution")

                with gr.Row():
                    benchmark_comparison_chart = gr.Plot(label="Benchmark Comparison")

        # Event handlers
        def refresh_benchmarks():
            try:
                benchmarks = db_manager.get_benchmarks()

                # Format for dropdown
                choices = [("all", "All Benchmarks")]
                choices.extend([(str(b["id"]), b["name"]) for b in benchmarks])

                return gr.update(choices=choices)
            except Exception as e:
                print(f"Error refreshing benchmarks: {e}")
                return gr.update(choices=[("all", "All Benchmarks")])

        def update_leaderboard(tag, benchmark_id, chart_type_val, view_type_val):
            try:
                # Get leaderboard data
                if benchmark_id == "all":
                    benchmark_id = None

                df = leaderboard.get_leaderboard_data(tag=tag, benchmark_id=benchmark_id)

                # Format for display
                display_df = leaderboard.format_leaderboard_for_display(df)

                # Create charts
                perf_chart = leaderboard.create_performance_chart(df, chart_type=chart_type_val)
                tag_chart = leaderboard.create_tag_distribution_chart(df)
                benchmark_chart = leaderboard.create_benchmark_comparison_chart(df)

                # Update visibility based on view type
                table_visible = view_type_val == "Table"
                chart_visible = view_type_val == "Chart"
                dashboard_visible = view_type_val == "Dashboard"

                return (
                    display_df,
                    perf_chart,
                    perf_chart,  # Same chart for both views
                    tag_chart,
                    benchmark_chart,
                    gr.update(visible=table_visible),
                    gr.update(visible=chart_visible),
                    gr.update(visible=dashboard_visible)
                )
            except Exception as e:
                print(f"Error updating leaderboard: {e}")
                empty_df = pd.DataFrame(columns=['Model', 'Benchmark', 'Tag', 'Score', 'Completed'])
                empty_chart = go.Figure()
                empty_chart.update_layout(title="Error loading data")

                return (
                    empty_df,
                    empty_chart,
                    empty_chart,
                    empty_chart,
                    empty_chart,
                    gr.update(visible=True),
                    gr.update(visible=False),
                    gr.update(visible=False)
                )

        # Connect event handlers
        refresh_button.click(
            fn=lambda tag, benchmark, chart_t, view_t: update_leaderboard(tag, benchmark, chart_t, view_t),
            inputs=[tag_filter, benchmark_filter, chart_type, view_type],
            outputs=[
                leaderboard_table,
                performance_chart,
                dashboard_performance_chart,
                tag_distribution_chart,
                benchmark_comparison_chart,
                leaderboard_table,
                chart_view,
                dashboard_view
            ]
        )

        view_type.change(
            fn=lambda view_t: (
                gr.update(visible=view_t == "Table"),
                gr.update(visible=view_t == "Chart"),
                gr.update(visible=view_t == "Dashboard")
            ),
            inputs=[view_type],
            outputs=[leaderboard_table, chart_view, dashboard_view]
        )

        # Initialize on load
        leaderboard_ui.load(
            fn=refresh_benchmarks,
            inputs=[],
            outputs=[benchmark_filter]
        )

        leaderboard_ui.load(
            fn=lambda: update_leaderboard("All", "all", "bar", "Table"),
            inputs=[],
            outputs=[
                leaderboard_table,
                performance_chart,
                dashboard_performance_chart,
                tag_distribution_chart,
                benchmark_comparison_chart,
                leaderboard_table,
                chart_view,
                dashboard_view
            ]
        )

    return leaderboard_ui
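A minimal sketch (not part of the upload) of how `Leaderboard` can be exercised headlessly: `StubDB` and the two sample rows are hypothetical stand-ins for the real database manager, but the column names mirror those consumed by `format_leaderboard_for_display` above.

```python
import pandas as pd
from leaderboard import Leaderboard

class StubDB:
    """Hypothetical stand-in for DynamicHighscoresDB, returning a fixed frame."""
    def get_leaderboard_df(self, tag=None, benchmark_id=None):
        return pd.DataFrame([
            {"model_name": "demo/model-a", "benchmark_name": "MMLU", "tag": "Reasoning",
             "score": 71.3, "completed_at": "2025-01-01"},
            {"model_name": "demo/model-b", "benchmark_name": "MMLU", "tag": "Coding",
             "score": 64.8, "completed_at": "2025-01-02"},
        ])

board = Leaderboard(StubDB())
df = board.get_leaderboard_data(tag="All")
print(board.format_leaderboard_for_display(df))

# Render the Plotly bar chart to a file for offline inspection.
fig = board.create_performance_chart(df, chart_type="bar")
fig.write_html("leaderboard_demo.html")
```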
model_config.py
ADDED
@@ -0,0 +1,874 @@
"""
Community framework documentation and model configuration system for Dynamic Highscores.

This module provides information about the framework and implements a modular
system for model configurations.
"""

import os
import json
import gradio as gr
from huggingface_hub import HfApi

class ModelConfigManager:
    """Manages model configurations for evaluation."""

    def __init__(self, db_manager):
        """Initialize the model configuration manager.

        Args:
            db_manager: Database manager instance
        """
        self.db_manager = db_manager
        self.config_dir = "model_configs"

        # Ensure config directory exists
        os.makedirs(self.config_dir, exist_ok=True)

        # Default configurations for popular models
        self.default_configs = {
            "gemma": {
                "name": "Gemma",
                "description": "Configuration for Gemma models",
                "parameters": {
                    "temperature": 1.0,
                    "top_k": 64,
                    "min_p": 0.01,
                    "top_p": 0.95,
                    "repetition_penalty": 1.0
                }
            },
            "llama": {
                "name": "LLaMA",
                "description": "Configuration for LLaMA models",
                "parameters": {
                    "temperature": 0.8,
                    "top_k": 40,
                    "top_p": 0.9,
                    "repetition_penalty": 1.1
                }
            },
            "mistral": {
                "name": "Mistral",
                "description": "Configuration for Mistral models",
                "parameters": {
                    "temperature": 0.7,
                    "top_k": 50,
                    "top_p": 0.9,
                    "repetition_penalty": 1.1
                }
            },
            "phi": {
                "name": "Phi",
                "description": "Configuration for Phi models",
                "parameters": {
                    "temperature": 0.7,
                    "top_k": 40,
                    "top_p": 0.9,
                    "repetition_penalty": 1.05
                }
            },
            "gpt": {
                "name": "GPT",
                "description": "Configuration for GPT models",
                "parameters": {
                    "temperature": 0.9,
                    "top_k": 0,
                    "top_p": 0.9,
                    "repetition_penalty": 1.0
                }
            }
        }

        # Initialize default configs if they don't exist
        self._initialize_default_configs()

    def _initialize_default_configs(self):
        """Initialize default configurations if they don't exist."""
        for model_type, config in self.default_configs.items():
            config_path = os.path.join(self.config_dir, f"{model_type}.json")
            if not os.path.exists(config_path):
                with open(config_path, "w") as f:
                    json.dump(config, f, indent=2)

    def get_available_configs(self):
        """Get all available model configurations.

        Returns:
            list: List of configuration information dictionaries
        """
        configs = []

        # Read all JSON files in the config directory
        for filename in os.listdir(self.config_dir):
            if filename.endswith(".json"):
                config_path = os.path.join(self.config_dir, filename)
                try:
                    with open(config_path, "r") as f:
                        config = json.load(f)

                    # Add filename (without extension) as ID
                    config_id = os.path.splitext(filename)[0]
                    config["id"] = config_id

                    configs.append(config)
                except Exception as e:
                    print(f"Error loading config {filename}: {e}")

        return configs

    def get_config(self, config_id):
        """Get a specific model configuration.

        Args:
            config_id: Configuration ID (filename without extension)

        Returns:
            dict: Configuration information or None if not found
        """
        config_path = os.path.join(self.config_dir, f"{config_id}.json")

        if os.path.exists(config_path):
            try:
                with open(config_path, "r") as f:
                    config = json.load(f)

                # Add ID to config
                config["id"] = config_id

                return config
            except Exception as e:
                print(f"Error loading config {config_id}: {e}")

        return None

    def add_config(self, name, description, parameters):
        """Add a new model configuration.

        Args:
            name: Configuration name
            description: Configuration description
            parameters: Dictionary of configuration parameters

        Returns:
            str: Configuration ID if successful, None otherwise
        """
        try:
            # Create a sanitized ID from the name
            config_id = name.lower().replace(" ", "_").replace("-", "_")

            # Create config object
            config = {
                "name": name,
                "description": description,
                "parameters": parameters
            }

            # Save to file
            config_path = os.path.join(self.config_dir, f"{config_id}.json")
            with open(config_path, "w") as f:
                json.dump(config, f, indent=2)

            return config_id
        except Exception as e:
            print(f"Error adding config: {e}")
            return None

    def update_config(self, config_id, name=None, description=None, parameters=None):
        """Update an existing model configuration.

        Args:
            config_id: Configuration ID to update
            name: New configuration name (optional)
            description: New configuration description (optional)
            parameters: New configuration parameters (optional)

        Returns:
            bool: True if successful, False otherwise
        """
        try:
            # Get existing config
            config = self.get_config(config_id)

            if not config:
                return False

            # Update fields if provided
            if name:
                config["name"] = name

            if description:
                config["description"] = description

            if parameters:
                config["parameters"] = parameters

            # Remove ID field before saving
            if "id" in config:
                del config["id"]

            # Save to file
            config_path = os.path.join(self.config_dir, f"{config_id}.json")
            with open(config_path, "w") as f:
                json.dump(config, f, indent=2)

            return True
        except Exception as e:
            print(f"Error updating config {config_id}: {e}")
            return False

    def delete_config(self, config_id):
        """Delete a model configuration.

        Args:
            config_id: Configuration ID to delete

        Returns:
            bool: True if successful, False otherwise
        """
        try:
            # Check if config exists
            config_path = os.path.join(self.config_dir, f"{config_id}.json")

            if not os.path.exists(config_path):
                return False

            # Delete file
            os.remove(config_path)

            return True
        except Exception as e:
            print(f"Error deleting config {config_id}: {e}")
            return False

    def apply_config_to_model_params(self, model_params, config_id):
        """Apply a configuration to model parameters.

        Args:
            model_params: Dictionary of model parameters
            config_id: Configuration ID to apply

        Returns:
            dict: Updated model parameters
        """
        # Get the configuration
        config = self.get_config(config_id)

        if not config or "parameters" not in config:
            return model_params

        # Apply configuration parameters
        for param, value in config["parameters"].items():
            model_params[param] = value

        return model_params

def create_community_framework_ui(model_config_manager):
    """Create the community framework UI components.

    Args:
        model_config_manager: Model configuration manager instance

    Returns:
        gr.Blocks: Gradio Blocks component with community framework UI
    """
    with gr.Blocks() as community_ui:
        gr.Markdown("# 🌐 Dynamic Highscores Community Framework")

        with gr.Tabs() as tabs:
            with gr.TabItem("About the Framework", id=0):
                gr.Markdown("""
## About Dynamic Highscores

Dynamic Highscores is an open-source community benchmark system for evaluating language models on any dataset. This project was created to fill the gap left by the retirement of HuggingFace's "Open LLM Leaderboards", which were discontinued due to outdated benchmarks.

### Key Features

- **Flexible Benchmarking**: Test models against any HuggingFace dataset, not just predefined benchmarks
- **Community-Driven**: Anyone can add new benchmarks and submit models for evaluation
- **Modern Evaluation**: Focus on contemporary benchmarks that better reflect current model capabilities
- **CPU-Only Evaluation**: Ensures fair comparisons across different models
- **Daily Submission Limits**: Prevents system abuse (one benchmark per day per user)
- **Model Tagging**: Categorize models as Merge, Agent, Reasoning, Coding, etc.
- **Unified Leaderboard**: View all models with filtering capabilities by tags

### Why This Project Matters

When HuggingFace retired their "Open LLM Leaderboards," the community lost a valuable resource for comparing model performance. The benchmarks used had become outdated and didn't reflect the rapid advances in language model capabilities.

Dynamic Highscores addresses this issue by allowing users to select from any benchmark on HuggingFace, including the most recent and relevant datasets. This ensures that models are evaluated on tasks that matter for current applications.

## How It Works

1. **Add Benchmarks**: Users can add any dataset from HuggingFace as a benchmark
2. **Submit Models**: Submit your HuggingFace model for evaluation against selected benchmarks
3. **View Results**: All results appear on the leaderboard, filterable by model type and benchmark
4. **Compare Performance**: See how different models perform across various tasks

## Project Structure

The codebase is organized into several key components:

- **app.py**: Main application integrating all components
- **auth.py**: Authentication system for HuggingFace login
- **benchmark_selection.py**: UI and logic for selecting and adding benchmarks
- **database_schema.py**: SQLite database schema for storing benchmarks, models, and results
- **evaluation_queue.py**: Queue system for processing model evaluations
- **leaderboard.py**: Unified leaderboard with filtering capabilities
- **sample_benchmarks.py**: Initial benchmark examples
- **model_config.py**: Modular system for model configurations

## Getting Started

To use Dynamic Highscores:

1. Log in with your HuggingFace account
2. Browse available benchmarks or add your own
3. Submit your model for evaluation
4. View results on the leaderboard

## Contributing to the Project

We welcome contributions from the community! If you'd like to improve Dynamic Highscores, here are some ways to get involved:

- **Add New Features**: Enhance the platform with additional functionality
- **Improve Evaluation Methods**: Help make model evaluations more accurate and efficient
- **Fix Bugs**: Address issues in the codebase
- **Enhance Documentation**: Make the project more accessible to new users
- **Add Model Configurations**: Contribute optimal configurations for different model types

To contribute, fork the repository, make your changes, and submit a pull request. We appreciate all contributions, big or small!
""")

            with gr.TabItem("Model Configurations", id=1):
                gr.Markdown("""
## Model Configuration System

The model configuration system allows users to create and apply predefined configurations for different model types. This ensures consistent evaluation settings and helps achieve optimal performance for each model architecture.

### What Are Model Configurations?

Model configurations define parameters such as:

- **Temperature**: Controls randomness in generation
- **Top-K**: Limits token selection to the top K most likely tokens
- **Top-P (nucleus sampling)**: Selects from tokens comprising the top P probability mass
- **Min-P**: Sets a minimum probability threshold for token selection
- **Repetition Penalty**: Discourages repetitive text

Different model architectures perform best with different parameter settings. For example, Gemma models typically work well with:

```
Temperature: 1.0
Top_K: 64
Min_P: 0.01
Top_P: 0.95
Repetition Penalty: 1.0
```

### Using Model Configurations

When submitting a model for evaluation, you can select a predefined configuration or create a custom one. The system will apply these parameters during the evaluation process.
""")

                with gr.Row():
                    with gr.Column():
                        gr.Markdown("### Available Configurations")
                        config_list = gr.Dataframe(
                            headers=["Name", "Description"],
                            label="Available Configurations",
                            interactive=True
                        )

                        refresh_configs_button = gr.Button("Refresh Configurations")

                    with gr.Column():
                        selected_config = gr.JSON(label="Configuration Details")

                with gr.Accordion("Add New Configuration", open=False):
                    with gr.Row():
                        with gr.Column():
                            config_name = gr.Textbox(
                                placeholder="Enter a name for this configuration",
                                label="Configuration Name"
                            )

                            config_description = gr.Textbox(
                                placeholder="Enter a description for this configuration",
                                label="Description",
                                lines=2
                            )

                        with gr.Column():
                            temperature = gr.Slider(
                                minimum=0.0,
                                maximum=2.0,
                                value=0.7,
                                step=0.1,
                                label="Temperature"
                            )

                            top_k = gr.Slider(
                                minimum=0,
                                maximum=100,
                                value=50,
                                step=1,
                                label="Top-K"
                            )

                            top_p = gr.Slider(
                                minimum=0.0,
                                maximum=1.0,
                                value=0.9,
                                step=0.01,
                                label="Top-P"
                            )

                            min_p = gr.Slider(
                                minimum=0.0,
                                maximum=0.5,
                                value=0.01,
                                step=0.01,
                                label="Min-P"
                            )

                            repetition_penalty = gr.Slider(
                                minimum=1.0,
                                maximum=2.0,
                                value=1.1,
                                step=0.05,
                                label="Repetition Penalty"
                            )

                    add_config_button = gr.Button("Add Configuration")
                    add_config_status = gr.Markdown("")

                with gr.Accordion("Edit Configuration", open=False):
                    with gr.Row():
                        with gr.Column():
                            edit_config_id = gr.Dropdown(
                                choices=[],
                                label="Select Configuration to Edit"
                            )

                            edit_config_name = gr.Textbox(
                                label="Configuration Name"
                            )

                            edit_config_description = gr.Textbox(
                                label="Description",
                                lines=2
                            )

                        with gr.Column():
                            edit_temperature = gr.Slider(
                                minimum=0.0,
                                maximum=2.0,
                                step=0.1,
                                label="Temperature"
                            )

                            edit_top_k = gr.Slider(
                                minimum=0,
                                maximum=100,
                                step=1,
                                label="Top-K"
                            )

                            edit_top_p = gr.Slider(
                                minimum=0.0,
                                maximum=1.0,
                                step=0.01,
                                label="Top-P"
                            )

                            edit_min_p = gr.Slider(
                                minimum=0.0,
                                maximum=0.5,
                                step=0.01,
                                label="Min-P"
                            )

                            edit_repetition_penalty = gr.Slider(
                                minimum=1.0,
                                maximum=2.0,
                                step=0.05,
                                label="Repetition Penalty"
                            )

                    with gr.Row():
                        update_config_button = gr.Button("Update Configuration")
                        delete_config_button = gr.Button("Delete Configuration", variant="stop")

                    edit_config_status = gr.Markdown("")

            with gr.TabItem("Setup Guide", id=2):
                gr.Markdown("""
## Setting Up Dynamic Highscores

This guide will help you set up your own instance of Dynamic Highscores, whether you're duplicating the Space or running it locally.

### Duplicating the Space

The easiest way to get started is to duplicate the HuggingFace Space:

1. Navigate to the original Dynamic Highscores Space
2. Click the "Duplicate this Space" button
3. Choose a name for your Space
4. Wait for the Space to be created and deployed

That's it! The system is designed to work out of the box without additional configuration.

### Running Locally

To run Dynamic Highscores locally:

1. Clone the repository:
```bash
git clone https://huggingface.co/spaces/username/dynamic-highscores
cd dynamic-highscores
```

2. Install dependencies:
```bash
pip install -r requirements.txt
```

3. Run the application:
```bash
python app.py
```

4. Open your browser and navigate to `http://localhost:7860`

### Configuration Options

Dynamic Highscores can be configured through environment variables:

- `ADMIN_USERNAME`: Username for admin access (default: "Quazim0t0")
- `DB_PATH`: Path to SQLite database file (default: "dynamic_highscores.db")
- `MEMORY_LIMIT_GB`: Memory limit for model evaluation in GB (default: 14)

### Adding Sample Benchmarks

The system comes with sample benchmarks, but you can add more:

1. Navigate to the "Benchmarks" tab
2. Click "Add New Benchmark"
3. Enter a HuggingFace dataset ID (e.g., "cais/mmlu", "openai/humaneval")
4. Add a name and description
5. Select evaluation metrics
6. Click "Add as Benchmark"

### Setting Up OAuth (Advanced)

If you're running your own instance outside of HuggingFace Spaces, you'll need to set up OAuth:

1. Create a HuggingFace application at https://huggingface.co/settings/applications
2. Set the redirect URI to your application's URL
3. Set the following environment variables:
```
HF_CLIENT_ID=your_client_id
HF_CLIENT_SECRET=your_client_secret
HF_REDIRECT_URI=your_redirect_uri
```

## Troubleshooting

### Login Issues

- Ensure you're logged in to HuggingFace
- Check the browser console for any errors
- Try clearing cookies and cache

### Evaluation Failures

- Check the model size (it must be under the memory limit)
- Verify the dataset exists and is accessible
- Check the logs for specific error messages

### Database Issues

- Ensure the database file is writable
- Check for disk space issues
- Try backing up and recreating the database
""")

            with gr.TabItem("Development Guide", id=3):
                gr.Markdown("""
## Development Guide

This guide is for developers who want to contribute to the Dynamic Highscores project or extend its functionality.

### Project Architecture

Dynamic Highscores follows a modular architecture:

- **Frontend**: Gradio-based UI components
- **Backend**: Python modules for business logic
- **Database**: SQLite for data storage
- **Evaluation**: CPU-based model evaluation system

### Key Components

1. **Authentication System** (auth.py)
   - Handles HuggingFace OAuth
   - Manages user sessions
   - Controls access to features

2. **Database Schema** (database_schema.py)
   - Defines tables for benchmarks, models, users, and evaluations
   - Provides CRUD operations for data management

3. **Benchmark Selection** (benchmark_selection.py)
   - UI for browsing and adding benchmarks
   - Integration with HuggingFace datasets

4. **Evaluation Queue** (evaluation_queue.py)
   - Manages model evaluation jobs
   - Handles CPU-only processing
   - Implements progress tracking

5. **Leaderboard** (leaderboard.py)
   - Displays evaluation results
   - Provides filtering and sorting
   - Visualizes performance metrics

6. **Model Configuration** (model_config.py)
   - Manages model-specific configurations
   - Provides parameter presets for different architectures

### Development Workflow

1. **Setup Development Environment**
```bash
git clone https://huggingface.co/spaces/username/dynamic-highscores
cd dynamic-highscores
pip install -r requirements.txt
```

2. **Make Changes**
   - Modify code as needed
   - Add new features or fix bugs
   - Update documentation

3. **Test Changes**
```bash
python test_app.py  # Run test suite
python app.py       # Run application locally
```

4. **Submit Changes**
   - If you have access, push directly to the repository
   - Otherwise, submit a pull request with your changes

### Adding New Features

To add a new feature to Dynamic Highscores:

1. **Identify the Component**: Determine which component should contain your feature
2. **Implement Backend Logic**: Add necessary functions and classes
3. **Create UI Components**: Add Gradio UI elements
4. **Connect UI to Backend**: Wire up event handlers
5. **Update Documentation**: Document your new feature
6. **Test Thoroughly**: Ensure everything works as expected

### Extending Model Configurations

To add support for a new model architecture:

1. Add a new configuration file in the `model_configs` directory
2. Define optimal parameters for the architecture
3. Update the UI to include the new configuration option

### Implementing Custom Evaluation Methods

To add a new evaluation method:

1. Add a new method to the `EvaluationQueue` class
2. Implement the evaluation logic
3. Update the `_run_evaluation` method to use your new method
4. Add appropriate metrics to the results

### Best Practices

- **Keep It Simple**: Favor simplicity over complexity
- **Document Everything**: Add docstrings and comments
- **Write Tests**: Ensure your code works as expected
- **Follow Conventions**: Maintain consistent coding style
- **Consider Performance**: Optimize for CPU-based evaluation
- **Think About Security**: Protect user data and tokens

### Getting Help

If you need assistance with development:

- Check the existing documentation
- Look at the code for similar features
- Reach out to the project maintainers
- Ask questions in the community forum

We welcome all contributions and are happy to help new developers get started!
""")

        # Event handlers
        def refresh_configs():
            configs = model_config_manager.get_available_configs()

            # Format for dataframe
            formatted_configs = []
            for config in configs:
                formatted_configs.append([
                    config["name"],
                    config["description"]
                ])

            # Update dropdown choices for edit
            config_choices = [(c["id"], c["name"]) for c in configs]

            return formatted_configs, gr.update(choices=config_choices)

        def view_config(evt: gr.SelectData, configs):
            if evt.index[0] < len(configs):
                config_name = configs[evt.index[0]][0]

                # Find config by name
                all_configs = model_config_manager.get_available_configs()
                selected = None

                for config in all_configs:
                    if config["name"] == config_name:
                        selected = config
                        break

                if selected:
                    return selected

            return None

        def add_config_handler(name, description, temperature, top_k, top_p, min_p, repetition_penalty):
            if not name:
                return "Please enter a name for the configuration."

            # Create parameters dictionary
            parameters = {
                "temperature": temperature,
                "top_k": top_k,
                "top_p": top_p,
                "min_p": min_p,
                "repetition_penalty": repetition_penalty
            }

            # Add configuration
            config_id = model_config_manager.add_config(name, description, parameters)

            if config_id:
                return f"✅ Configuration '{name}' added successfully."
            else:
                return "❌ Failed to add configuration."

        def load_config_for_edit(config_id):
            if not config_id:
                return [gr.update() for _ in range(7)]

            config = model_config_manager.get_config(config_id)

            if not config:
                return [gr.update() for _ in range(7)]

            # Extract parameters with defaults
            params = config.get("parameters", {})
            temperature = params.get("temperature", 0.7)
            top_k = params.get("top_k", 50)
            top_p = params.get("top_p", 0.9)
            min_p = params.get("min_p", 0.01)
            repetition_penalty = params.get("repetition_penalty", 1.1)

            return [
                gr.update(value=config["name"]),
                gr.update(value=config.get("description", "")),
                gr.update(value=temperature),
                gr.update(value=top_k),
                gr.update(value=top_p),
                gr.update(value=min_p),
                gr.update(value=repetition_penalty)
            ]

        def update_config_handler(config_id, name, description, temperature, top_k, top_p, min_p, repetition_penalty):
            if not config_id:
                return "Please select a configuration to update."

            # Create parameters dictionary
            parameters = {
                "temperature": temperature,
                "top_k": top_k,
                "top_p": top_p,
                "min_p": min_p,
                "repetition_penalty": repetition_penalty
            }

            # Update configuration
            success = model_config_manager.update_config(config_id, name, description, parameters)

            if success:
                return f"✅ Configuration '{name}' updated successfully."
            else:
                return "❌ Failed to update configuration."

        def delete_config_handler(config_id):
            if not config_id:
                return "Please select a configuration to delete."

            # Delete configuration
            success = model_config_manager.delete_config(config_id)

            if success:
                return "✅ Configuration deleted successfully."
            else:
                return "❌ Failed to delete configuration."

        # Connect event handlers
        refresh_configs_button.click(
            fn=refresh_configs,
            inputs=[],
            outputs=[config_list, edit_config_id]
        )

        config_list.select(
            fn=view_config,
            inputs=[config_list],
            outputs=[selected_config]
        )

        add_config_button.click(
            fn=add_config_handler,
            inputs=[config_name, config_description, temperature, top_k, top_p, min_p, repetition_penalty],
            outputs=[add_config_status]
        )

        edit_config_id.change(
            fn=load_config_for_edit,
            inputs=[edit_config_id],
            outputs=[edit_config_name, edit_config_description, edit_temperature, edit_top_k, edit_top_p, edit_min_p, edit_repetition_penalty]
        )

        update_config_button.click(
            fn=update_config_handler,
            inputs=[edit_config_id, edit_config_name, edit_config_description, edit_temperature, edit_top_k, edit_top_p, edit_min_p, edit_repetition_penalty],
            outputs=[edit_config_status]
        )

        delete_config_button.click(
            fn=delete_config_handler,
            inputs=[edit_config_id],
            outputs=[edit_config_status]
        )

        # Load configurations on page load
        community_ui.load(
            fn=refresh_configs,
            inputs=[],
            outputs=[config_list, edit_config_id]
        )

    return community_ui
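A minimal usage sketch (not part of the upload) for `ModelConfigManager`. Assumptions: it is run from the repository root so `model_configs/` is writable, and `None` is passed for `db_manager` purely for illustration since the methods shown do not touch the database.

```python
from model_config import ModelConfigManager

# db_manager is stored but unused by the methods exercised here.
manager = ModelConfigManager(db_manager=None)

# List the bundled presets (gemma, llama, mistral, phi, gpt are written on first run).
for cfg in manager.get_available_configs():
    print(cfg["id"], cfg["parameters"])

# Merge a preset into an existing parameter dict before an evaluation run.
gen_params = {"max_new_tokens": 256}
gen_params = manager.apply_config_to_model_params(gen_params, "gemma")
print(gen_params)  # now also carries temperature, top_k, min_p, top_p, repetition_penalty
```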
model_configs/gemma.json
ADDED
@@ -0,0 +1,11 @@
{
  "name": "Gemma",
  "description": "Configuration for Gemma models",
  "parameters": {
    "temperature": 1.0,
    "top_k": 64,
    "min_p": 0.01,
    "top_p": 0.95,
    "repetition_penalty": 1.0
  }
}
model_configs/llama.json
ADDED
@@ -0,0 +1,10 @@
{
  "name": "LLaMA",
  "description": "Configuration for LLaMA models",
  "parameters": {
    "temperature": 0.8,
    "top_k": 40,
    "top_p": 0.9,
    "repetition_penalty": 1.1
  }
}
model_configs/mistral.json
ADDED
@@ -0,0 +1,10 @@
{
  "name": "Mistral",
  "description": "Configuration for Mistral models",
  "parameters": {
    "temperature": 0.7,
    "top_k": 50,
    "top_p": 0.9,
    "repetition_penalty": 1.1
  }
}
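A minimal sketch (not part of the upload) of how one of these preset files could be mapped onto a Hugging Face `GenerationConfig`. Assumptions: the standard `transformers` generation API; `min_p` is only honoured by recent `transformers` releases, so it is filtered out here to stay version-safe.

```python
import json
from transformers import GenerationConfig

# Load the preset's sampling parameters from disk.
with open("model_configs/mistral.json") as f:
    preset = json.load(f)["parameters"]

# Drop min_p, which older transformers versions do not accept.
supported = {k: v for k, v in preset.items() if k != "min_p"}

gen_config = GenerationConfig(do_sample=True, max_new_tokens=128, **supported)
print(gen_config)
```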
requirements.txt
ADDED
@@ -0,0 +1,12 @@
gradio>=4.4.0
huggingface-hub>=0.27.1
datasets>=2.14.5
transformers>=4.35.2
torch>=2.0.0
pandas>=2.0.0
numpy>=1.24.2
plotly>=5.13.0
APScheduler>=3.10.1
tqdm>=4.65.0
requests>=2.28.2
python-dateutil>=2.8.2
sample_benchmarks.py
ADDED
@@ -0,0 +1,72 @@
"""
Sample benchmarks initialization for Dynamic Highscores system.

This script adds sample benchmarks to the database to provide initial options for users.
"""

from database_schema import DynamicHighscoresDB

def add_sample_benchmarks():
    """Add sample benchmarks to the database."""
    # Initialize database
    db = DynamicHighscoresDB()

    # Sample benchmarks to add
    sample_benchmarks = [
        {
            "name": "MMLU (Massive Multitask Language Understanding)",
            "dataset_id": "cais/mmlu",
            "description": "A benchmark for measuring massive multitask language understanding across 57 tasks including elementary mathematics, US history, computer science, law, and more.",
            "metrics": {"accuracy": 1.0, "consistency": 1.0}
        },
        {
            "name": "HumanEval (Code Generation)",
            "dataset_id": "openai/humaneval",
            "description": "A benchmark for evaluating language models on code generation tasks. It consists of 164 programming problems with unit tests.",
            "metrics": {"pass@1": 1.0, "functional_correctness": 1.0}
        },
        {
            "name": "HellaSwag (Commonsense Reasoning)",
            "dataset_id": "hellaswag",
            "description": "A challenge dataset for evaluating commonsense natural language inference. It consists of multiple-choice questions about grounded situations.",
            "metrics": {"accuracy": 1.0}
        },
        {
            "name": "GSM8K (Grade School Math)",
            "dataset_id": "gsm8k",
            "description": "A dataset of 8.5K high quality grade school math word problems. These problems take between 2 and 8 steps to solve, and solutions primarily involve performing a sequence of elementary calculations using basic arithmetic operations.",
            "metrics": {"accuracy": 1.0, "correct_steps": 1.0}
        },
        {
            "name": "TruthfulQA",
            "dataset_id": "truthful_qa",
            "description": "A benchmark to measure whether a language model is truthful in generating answers to questions. The benchmark comprises 817 questions that span 38 categories, including health, law, finance and politics.",
            "metrics": {"accuracy": 1.0, "truthfulness": 1.0}
        }
    ]

    # Add each benchmark to the database
    added_count = 0
    for benchmark in sample_benchmarks:
        try:
            benchmark_id = db.add_benchmark(
                name=benchmark["name"],
                dataset_id=benchmark["dataset_id"],
                description=benchmark["description"],
                metrics=benchmark["metrics"]
            )

            if benchmark_id:
                print(f"Added benchmark '{benchmark['name']}' with ID: {benchmark_id}")
                added_count += 1
        except Exception as e:
            print(f"Error adding benchmark '{benchmark['name']}': {e}")

    # Close database connection
    db.close()

    return added_count

if __name__ == "__main__":
    num_added = add_sample_benchmarks()
    print(f"Added {num_added} sample benchmarks to the database.")
space.yaml
ADDED
@@ -0,0 +1,30 @@
# Deployment configuration for Dynamic Highscores on HuggingFace Spaces.
#
# This file configures the application for deployment on HuggingFace Spaces.

sdk_version: 3.0.0
app_file: app.py
models:
  - huggingface-hub
  - transformers
  - datasets
  - torch
  - gradio
  - pandas
  - plotly
  - apscheduler
  - tqdm
  - requests
  - python-dateutil
  - numpy
python_version: 3.10.12
hf_oauth: true
env:
  - OAUTH_CLIENT_ID=$OAUTH_CLIENT_ID
  - OAUTH_CLIENT_SECRET=$OAUTH_CLIENT_SECRET
  - OAUTH_AUTHORIZATION_URL=$OAUTH_AUTHORIZATION_URL
  - OAUTH_TOKEN_URL=$OAUTH_TOKEN_URL
oauth_scopes:
  - inference
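A minimal sketch (a hypothetical helper, not taken from auth.py) of how the environment variables declared above could be read at runtime; missing values simply disable OAuth rather than crashing the Space.

```python
import os

def load_oauth_settings():
    """Collect the OAuth variables exported by the Space; enabled only if all are set."""
    keys = ["OAUTH_CLIENT_ID", "OAUTH_CLIENT_SECRET",
            "OAUTH_AUTHORIZATION_URL", "OAUTH_TOKEN_URL"]
    settings = {k: os.environ.get(k) for k in keys}
    settings["enabled"] = all(settings[k] for k in keys)
    return settings

if __name__ == "__main__":
    print(load_oauth_settings())
```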
test_app.py
ADDED
@@ -0,0 +1,237 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Test script for Dynamic Highscores application.
|
3 |
+
|
4 |
+
This script tests the key functionality of the Dynamic Highscores application
|
5 |
+
to ensure everything works as expected before deployment.
|
6 |
+
"""
|
7 |
+
|
8 |
+
import os
|
9 |
+
import unittest
|
10 |
+
import tempfile
|
11 |
+
import sqlite3
|
12 |
+
from unittest.mock import MagicMock, patch
|
13 |
+
|
14 |
+
# Import components to test
|
15 |
+
from database_schema import DynamicHighscoresDB
|
16 |
+
from auth import HuggingFaceAuth
|
17 |
+
from benchmark_selection import BenchmarkSelector
|
18 |
+
from evaluation_queue import EvaluationQueue
|
19 |
+
from leaderboard import Leaderboard
|
20 |
+
|
21 |
+
class TestDynamicHighscores(unittest.TestCase):
|
22 |
+
"""Test cases for Dynamic Highscores application."""
|
23 |
+
|
24 |
+
def setUp(self):
|
25 |
+
"""Set up test environment."""
|
26 |
+
# Create temporary database
|
27 |
+
self.db_fd, self.db_path = tempfile.mkstemp()
|
28 |
+
self.db = DynamicHighscoresDB(self.db_path)
|
29 |
+
|
30 |
+
# Mock auth manager
|
31 |
+
self.auth_manager = HuggingFaceAuth(self.db)
|
32 |
+
|
33 |
+
# Mock components
|
34 |
+
self.benchmark_selector = BenchmarkSelector(self.db, self.auth_manager)
|
35 |
+
self.evaluation_queue = EvaluationQueue(self.db, self.auth_manager)
|
36 |
+
self.leaderboard = Leaderboard(self.db)
|
37 |
+
|
38 |
+
def tearDown(self):
|
39 |
+
"""Clean up test environment."""
|
40 |
+
os.close(self.db_fd)
|
41 |
+
os.unlink(self.db_path)
|
42 |
+
|
43 |
+
    def test_database_schema(self):
        """Test database schema creation."""
        # Check if tables were created
        conn = sqlite3.connect(self.db_path)
        cursor = conn.cursor()

        # Get list of tables
        cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
        tables = cursor.fetchall()
        table_names = [table[0] for table in tables]

        # Check if all expected tables exist
        expected_tables = ['users', 'benchmarks', 'models', 'evaluations', 'queue']
        for table in expected_tables:
            self.assertIn(table, table_names)

        conn.close()

    def test_user_management(self):
        """Test user management functionality."""
        # Add a test user
        user_id = self.db.add_user("test_user", "test_hf_id", False)
        self.assertIsNotNone(user_id)

        # Add an admin user
        admin_id = self.db.add_user("admin_user", "admin_hf_id", True)
        self.assertIsNotNone(admin_id)

        # Test submission limits
        self.assertTrue(self.db.can_submit_today(user_id))
        self.db.update_submission_date(user_id)
        self.assertFalse(self.db.can_submit_today(user_id))

        # Admin should always be able to submit
        self.assertTrue(self.db.can_submit_today(admin_id))

    def test_benchmark_management(self):
        """Test benchmark management functionality."""
        # Add a test benchmark
        benchmark_id = self.db.add_benchmark(
            name="Test Benchmark",
            dataset_id="test/dataset",
            description="Test description",
            metrics={"accuracy": 1.0}
        )
        self.assertIsNotNone(benchmark_id)

        # Get benchmarks
        benchmarks = self.db.get_benchmarks()
        self.assertEqual(len(benchmarks), 1)
        self.assertEqual(benchmarks[0]["name"], "Test Benchmark")

    def test_model_management(self):
        """Test model management functionality."""
        # Add a test user
        user_id = self.db.add_user("test_user", "test_hf_id", False)

        # Add a test model
        model_id = self.db.add_model(
            name="Test Model",
            hf_model_id="test/model",
            user_id=user_id,
            tag="Reasoning",
            parameters="7B",
            description="Test model description"
        )
        self.assertIsNotNone(model_id)

        # Get models
        models = self.db.get_models()
        self.assertEqual(len(models), 1)
        self.assertEqual(models[0]["name"], "Test Model")

        # Get models by tag
        models = self.db.get_models(tag="Reasoning")
        self.assertEqual(len(models), 1)
        self.assertEqual(models[0]["tag"], "Reasoning")

    def test_evaluation_management(self):
        """Test evaluation management functionality."""
        # Add a test user
        user_id = self.db.add_user("test_user", "test_hf_id", False)

        # Add a test model
        model_id = self.db.add_model(
            name="Test Model",
            hf_model_id="test/model",
            user_id=user_id,
            tag="Reasoning"
        )

        # Add a test benchmark
        benchmark_id = self.db.add_benchmark(
            name="Test Benchmark",
            dataset_id="test/dataset"
        )

        # Add a test evaluation
        evaluation_id = self.db.add_evaluation(
            model_id=model_id,
            benchmark_id=benchmark_id
        )
        self.assertIsNotNone(evaluation_id)

        # Update evaluation status
        self.db.update_evaluation_status(
            evaluation_id=evaluation_id,
            status="running"
        )

        # Get next in queue
        next_eval = self.db.get_next_in_queue()
        self.assertIsNotNone(next_eval)
        self.assertEqual(next_eval["evaluation_id"], evaluation_id)

        # Complete evaluation
        self.db.update_evaluation_status(
            evaluation_id=evaluation_id,
            status="completed",
            results={"accuracy": 0.85},
            score=85.0
        )

        # Get evaluation results
        results = self.db.get_evaluation_results()
        self.assertEqual(len(results), 1)
        self.assertEqual(results[0]["score"], 85.0)

    def test_leaderboard(self):
        """Test leaderboard functionality."""
        # Add test data
        user_id = self.db.add_user("test_user", "test_hf_id", False)

        # Add models with different tags
        model1_id = self.db.add_model(
            name="Model 1",
            hf_model_id="test/model1",
            user_id=user_id,
            tag="Reasoning"
        )

        model2_id = self.db.add_model(
            name="Model 2",
            hf_model_id="test/model2",
            user_id=user_id,
            tag="Coding"
        )

        # Add a benchmark
        benchmark_id = self.db.add_benchmark(
            name="Test Benchmark",
            dataset_id="test/dataset"
        )

        # Add evaluations
        eval1_id = self.db.add_evaluation(
            model_id=model1_id,
            benchmark_id=benchmark_id
        )

        eval2_id = self.db.add_evaluation(
            model_id=model2_id,
            benchmark_id=benchmark_id
        )

        # Complete evaluations
        self.db.update_evaluation_status(
            evaluation_id=eval1_id,
            status="completed",
            results={"accuracy": 0.9},
            score=90.0
        )

        self.db.update_evaluation_status(
            evaluation_id=eval2_id,
            status="completed",
            results={"accuracy": 0.8},
            score=80.0
        )

        # Get leaderboard data
        df = self.leaderboard.get_leaderboard_data()
        self.assertEqual(len(df), 2)

        # Test filtering by tag
        df_reasoning = self.leaderboard.get_leaderboard_data(tag="Reasoning")
        self.assertEqual(len(df_reasoning), 1)
        self.assertEqual(df_reasoning.iloc[0]["score"], 90.0)

        df_coding = self.leaderboard.get_leaderboard_data(tag="Coding")
        self.assertEqual(len(df_coding), 1)
        self.assertEqual(df_coding.iloc[0]["score"], 80.0)

if __name__ == "__main__":
    unittest.main()
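These test methods rely on a setUp fixture defined earlier in test_app.py (outside this hunk) that provides self.db_path, self.db, and self.leaderboard. The sketch below shows one plausible shape for that fixture, assuming a temporary SQLite file so every test starts from an empty database; the class name, the use of tempfile, and the DynamicHighscoresDB(path) constructor argument are assumptions, not confirmed details of the file.

```python
import os
import tempfile
import unittest

from database_schema import DynamicHighscoresDB
from leaderboard import Leaderboard


class TestDynamicHighscores(unittest.TestCase):  # hypothetical class name
    def setUp(self):
        # Use an isolated temporary SQLite file so tests never touch real data.
        fd, self.db_path = tempfile.mkstemp(suffix=".db")
        os.close(fd)
        # Passing the path to the constructor is an assumption; the real
        # DynamicHighscoresDB may resolve its db_path differently.
        self.db = DynamicHighscoresDB(self.db_path)
        self.leaderboard = Leaderboard(self.db)

    def tearDown(self):
        # Remove the temporary database so each test run starts clean.
        if os.path.exists(self.db_path):
            os.remove(self.db_path)
```

With a fixture like this, the suite runs either through the unittest.main() call at the bottom of the file or via standard unittest discovery.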
todo.md
ADDED
@@ -0,0 +1,48 @@
# Dynamic Highscores - Todo List

## Analysis and Planning
- [x] Extract and analyze uploaded framework files
- [x] Examine leaderboard component structure and functionality
- [x] Examine dashboard component structure and functionality
- [x] Analyze requirements.txt files for dependencies

## Database Schema Design
- [x] Design schema for user authentication and tracking
- [x] Design schema for benchmark datasets
- [x] Design schema for model submissions and evaluations
- [x] Design schema for tagging system (Merge, Agent, Reasoning, Coding, etc.)
- [x] Design schema for daily submission limits

## User Authentication System
- [x] Implement HuggingFace login integration
- [x] Create user profile management
- [x] Implement special privileges for admin account

## Benchmark Selection Interface
- [x] Create interface for browsing HuggingFace datasets
- [x] Implement dataset loading functionality
- [x] Create dataset preview and selection UI

## Model Evaluation Queue System
- [x] Implement CPU-only evaluation system
- [x] Create queue management for benchmark submissions
- [x] Implement daily submission limit (1 per day per user; see the sketch after this checklist)
- [x] Add admin override for submission limits

## Leaderboard with Filtering
- [x] Implement unified leaderboard for all models
- [x] Add tag-based filtering (Merge, Agent, Reasoning, Coding)
- [x] Implement sorting and searching functionality
- [x] Create visualization components for benchmark results

## Integration
- [x] Combine dashboard and leaderboard components
- [x] Create unified UI with consistent styling
- [x] Implement navigation between different sections
- [x] Ensure proper data flow between components

## Testing and Deployment
- [x] Test user authentication flow
- [x] Test benchmark selection and submission
- [x] Test leaderboard filtering and visualization
- [x] Prepare for deployment on HuggingFace Spaces
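The daily submission limit and admin override listed under "Model Evaluation Queue System" are the behaviours exercised by test_user_management above via can_submit_today and update_submission_date. The following is a rough illustration only of how such a check could work; the users table columns is_admin and last_submission_date are assumptions about database_schema.py, not its confirmed schema.

```python
import sqlite3
from datetime import date


def can_submit_today(db_path: str, user_id: int) -> bool:
    """Illustrative daily-limit check: admins always pass, others once per day."""
    conn = sqlite3.connect(db_path)
    try:
        cur = conn.cursor()
        # Column and table names here are assumed for the sketch.
        cur.execute(
            "SELECT is_admin, last_submission_date FROM users WHERE id = ?",
            (user_id,),
        )
        row = cur.fetchone()
        if row is None:
            return False  # unknown user: no submission allowed
        is_admin, last_date = row
        if is_admin:
            return True  # admin override, matching the todo item above
        # Allow a new submission only if the last one was not made today.
        return last_date != date.today().isoformat()
    finally:
        conn.close()
```

Storing only the date of the most recent submission keeps the check to a single row lookup, which matches the one-per-day rule the tests assert.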