Quazim0t0 committed on
Commit
72e8234
·
verified ·
1 Parent(s): 3582217

Update evaluation_queue.py

Browse files
Files changed (1) hide show
  1. evaluation_queue.py +797 -3
evaluation_queue.py CHANGED
@@ -1,8 +1,802 @@
1
  """
2
- Updated create_model_submission_ui function that properly displays benchmark names in dropdown.
3
- Replace this function in your evaluation_queue.py file.
 
 
4
  """
5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  def create_model_submission_ui(evaluation_queue, auth_manager, db_manager):
7
  """Create the model submission UI components.
8
 
@@ -107,7 +901,7 @@ def create_model_submission_ui(evaluation_queue, auth_manager, db_manager):
107
 
108
  def refresh_benchmarks_handler():
109
  benchmarks = db_manager.get_benchmarks()
110
-
111
  # Format for dropdown - properly formatted to display names
112
  choices = []
113
  for b in benchmarks:
 
1
  """
2
+ Model evaluation queue system for Dynamic Highscores.
3
+
4
+ This module handles the evaluation queue, CPU-only processing,
5
+ and enforces daily submission limits for users.
6
  """
7
 
8
+ import os
9
+ import json
10
+ import time
11
+ import threading
12
+ import queue as queue_module
13
+ from datetime import datetime, timedelta
14
+ import gradio as gr
15
+ from huggingface_hub import HfApi, hf_hub_download, snapshot_download
16
+ from datasets import load_dataset
17
+ import torch
18
+ from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
19
+ import sqlite3
20
+
21
class EvaluationQueue:
    """Manages the evaluation queue for model benchmarking.

    Pending evaluations are read from the database by a background worker
    thread, run one at a time on CPU, and written back with a status of
    ``completed`` or ``failed``. Daily submission limits are delegated to
    the authentication manager.
    """

    # Bytes per element for the dtype names used in safetensors metadata.
    # Used to turn per-dtype parameter counts into an approximate byte size.
    _DTYPE_BYTES = {
        "F64": 8, "I64": 8, "U64": 8,
        "F32": 4, "I32": 4, "U32": 4,
        "F16": 2, "BF16": 2, "I16": 2, "U16": 2,
        "I8": 1, "U8": 1, "BOOL": 1, "F8_E4M3": 1, "F8_E5M2": 1,
    }
    # Fallback for dtype names missing from the table above (assume 32-bit).
    _DEFAULT_DTYPE_BYTES = 4
    # File suffixes counted as model weights when sizing from repo files.
    _WEIGHT_SUFFIXES = ('.bin', '.safetensors', '.pt')
    # Metrics that contribute to the overall score, in summation order.
    _SCORE_METRICS = ("accuracy", "exact_match", "f1", "functional_correctness")

    def __init__(self, db_manager, auth_manager):
        """Initialize the evaluation queue manager.

        Args:
            db_manager: Database manager instance
            auth_manager: Authentication manager instance
        """
        self.db_manager = db_manager
        self.auth_manager = auth_manager
        self.hf_api = HfApi()
        self.queue = queue_module.Queue()
        self.is_processing = False
        self.worker_thread = None
        self.model_tags = ["Merge", "Agent", "Reasoning", "Coding", "General", "Specialized", "Instruction", "Chat"]
        self.current_evaluation = None
        self.progress = 0
        self.progress_lock = threading.Lock()
        # Memory limit for models in GB (leave 2GB for system)
        self.memory_limit_gb = 14.0

    def start_worker(self):
        """Start the worker thread for processing the evaluation queue.

        Does nothing when a worker thread is already alive.
        """
        if self.worker_thread is not None and self.worker_thread.is_alive():
            return
        self.is_processing = True
        self.worker_thread = threading.Thread(target=self._process_queue)
        self.worker_thread.daemon = True
        self.worker_thread.start()

    def stop_worker(self):
        """Ask the worker thread to stop and wait up to one second for it."""
        self.is_processing = False
        if self.worker_thread and self.worker_thread.is_alive():
            self.worker_thread.join(timeout=1.0)

    def _set_progress(self, value):
        """Store a progress percentage under the progress lock."""
        with self.progress_lock:
            self.progress = value

    def _report_loop_progress(self, index, total):
        """Map position ``index`` of ``total`` examples onto the 40-90% band."""
        self._set_progress(40 + int((index / total) * 50))

    @staticmethod
    def _take_subset(dataset, limit):
        """Return the first ``limit`` rows of ``dataset`` when it is longer."""
        if len(dataset) > limit:
            return dataset.select(range(limit))
        return dataset

    @staticmethod
    def _strip_prompt(prompt, generated_text):
        """Drop the echoed prompt from the front of ``generated_text``.

        The text is returned unchanged when it does not start with the
        prompt, so nothing is cut from the middle of a completion.
        """
        if generated_text.startswith(prompt):
            return generated_text[len(prompt):]
        return generated_text

    def _estimate_model_size_bytes(self, model_info_obj):
        """Estimate the weight size in bytes from Hub model metadata.

        Prefers the safetensors metadata (per-dtype parameter counts) and
        falls back to summing the sizes of weight files in the repo.

        Returns:
            int | None: Estimated size in bytes, or None when no size
            information is available.
        """
        safetensors = getattr(model_info_obj, 'safetensors', None)
        if safetensors:
            # The metadata exposes ``parameters`` ({dtype: count}) either as
            # an attribute or, in dict form, as a key.
            parameters = getattr(safetensors, 'parameters', None)
            if parameters is None and isinstance(safetensors, dict):
                parameters = safetensors.get('parameters')
            if parameters:
                return sum(
                    count * self._DTYPE_BYTES.get(str(dtype).upper(), self._DEFAULT_DTYPE_BYTES)
                    for dtype, count in parameters.items()
                )

        siblings = getattr(model_info_obj, 'siblings', None)
        if siblings:
            # ``size`` is None when file metadata was not returned; skip those.
            sizes = [
                sibling.size for sibling in siblings
                if sibling.rfilename.endswith(self._WEIGHT_SUFFIXES)
                and getattr(sibling, 'size', None)
            ]
            if sizes:
                return sum(sizes)

        return None

    def check_model_size(self, model_id):
        """Check if a model will fit within RAM limitations.

        Args:
            model_id: HuggingFace model ID

        Returns:
            tuple: (will_fit, message)
        """
        try:
            # Request file metadata so that sibling file sizes are populated
            model_info_obj = self.hf_api.model_info(model_id, files_metadata=True)

            total_bytes = self._estimate_model_size_bytes(model_info_obj)
            if total_bytes is None:
                # Can't determine size
                return False, "Unable to determine model size. Please ensure model is under 14GB."

            total_size_gb = total_bytes / (1024 * 1024 * 1024)

            # Account for memory overhead (tokenizer, processing, etc.)
            estimated_ram_needed = total_size_gb * 1.3  # 30% overhead

            if estimated_ram_needed > self.memory_limit_gb:
                return False, f"Model is too large (approximately {total_size_gb:.1f}GB, needs {estimated_ram_needed:.1f}GB RAM). Maximum allowed is {self.memory_limit_gb}GB."

            return True, f"Model size check passed ({total_size_gb:.1f}GB, estimated {estimated_ram_needed:.1f}GB RAM usage)"

        except Exception as e:
            print(f"Model size check error: {e}")
            # If we can't check, be cautious
            return False, f"Error checking model size: {str(e)}. Please ensure your model is under {self.memory_limit_gb}GB."

    def _process_queue(self):
        """Process the evaluation queue in a separate thread.

        Loops while ``is_processing`` is set, polling the database for
        pending evaluations and sleeping five seconds when idle or after an
        unexpected error.
        """
        while self.is_processing:
            try:
                pending_evals = self.db_manager.get_evaluation_results(status="pending")

                if not pending_evals:
                    # No evaluations in queue, sleep for a bit
                    time.sleep(5)
                    continue

                # Take the first pending row; no sorting happens here, so any
                # priority ordering must come from the database manager.
                self._process_evaluation(pending_evals[0])
            except Exception as e:
                print(f"Queue processing error: {e}")
                time.sleep(5)

    def _process_evaluation(self, next_eval):
        """Run one pending evaluation and record its final status."""
        self.db_manager.update_evaluation_status(next_eval['id'], 'running')

        with self.progress_lock:
            self.current_evaluation = next_eval
            self.progress = 0

        try:
            try:
                results, score = self._evaluate_pending(next_eval)
                self.db_manager.update_evaluation_status(
                    next_eval['id'],
                    'completed',
                    results=results,
                    score=score
                )
            except Exception as e:
                print(f"Evaluation error: {e}")
                # Update status to failed with error message
                error_results = {"error": str(e)}
                self.db_manager.update_evaluation_status(
                    next_eval['id'],
                    'failed',
                    results=error_results
                )
        finally:
            # Always clear the current evaluation, even if a status update raised
            with self.progress_lock:
                self.current_evaluation = None
                self.progress = 0

    def _evaluate_pending(self, next_eval):
        """Look up, size-check and evaluate one queue entry.

        Returns:
            tuple: (results, score)

        Raises:
            Exception: If the model or benchmark is missing, the model is
                too large, or the evaluation reported an error.
        """
        model_info = self.db_manager.get_model(next_eval['model_id'])
        benchmark_info = self.db_manager.get_benchmark(next_eval['benchmark_id'])

        if not (model_info and benchmark_info):
            raise Exception("Model or benchmark not found")

        will_fit, message = self.check_model_size(model_info['hf_model_id'])
        if not will_fit:
            raise Exception(f"Model too large for evaluation: {message}")

        results = self._run_evaluation(
            model_info['hf_model_id'],
            benchmark_info['dataset_id']
        )

        # _run_evaluation reports failures as an {"error": ...} dict; those
        # must end up as 'failed', not as a 'completed' run with score 0.
        if "error" in results:
            raise Exception(results["error"])

        return results, self._calculate_overall_score(results)

    def _run_evaluation(self, model_id, dataset_id):
        """Run an evaluation for a model on a benchmark.

        Args:
            model_id: HuggingFace model ID
            dataset_id: HuggingFace dataset ID (with optional config,
                written as ``dataset:config``)

        Returns:
            dict: Evaluation results, or ``{"error": message}`` when the
            dataset, the model or the evaluation itself failed.
        """
        self._set_progress(5)  # Starting evaluation

        # Parse dataset ID and config
        if ":" in dataset_id:
            dataset_id, config = dataset_id.split(":", 1)
        else:
            config = None

        self._set_progress(10)  # Loading dataset

        try:
            if config:
                dataset = load_dataset(dataset_id, config, split="test")
            else:
                dataset = load_dataset(dataset_id, split="test")
        except Exception as e:
            return {"error": f"Failed to load dataset: {str(e)}"}

        self._set_progress(20)  # Loading model

        try:
            # Load the model with memory optimization settings
            device = "cpu"
            model = AutoModelForCausalLM.from_pretrained(
                model_id,
                device_map=device,
                torch_dtype=torch.float32,  # Use float32 for CPU
                low_cpu_mem_usage=True,  # Enable memory optimization
                offload_folder="offload",  # Enable offloading if needed
                offload_state_dict=True,  # Offload state dict for memory saving
                # Keyed by "cpu": an integer key would refer to a GPU index
                max_memory={"cpu": f"{self.memory_limit_gb}GB"}
            )
            tokenizer = AutoTokenizer.from_pretrained(model_id)
        except Exception as e:
            print(f"Model loading error: {e}")
            return {"error": f"Failed to load model: {str(e)}"}

        self._set_progress(30)  # Determining task type

        task_type = self._determine_task_type(dataset)

        self._set_progress(40)  # Starting evaluation

        try:
            # Run appropriate evaluation based on task type
            if task_type == "text-generation":
                results = self._evaluate_text_generation(model, tokenizer, dataset)
            elif task_type == "question-answering":
                results = self._evaluate_question_answering(model, tokenizer, dataset)
            elif task_type == "classification":
                results = self._evaluate_classification(model, tokenizer, dataset)
            elif task_type == "code-generation":
                results = self._evaluate_code_generation(model, tokenizer, dataset)
            else:
                # Default to general evaluation
                results = self._evaluate_general(model, tokenizer, dataset)
        except Exception as e:
            print(f"Evaluation task error: {e}")
            return {"error": f"Evaluation failed: {str(e)}"}

        self._set_progress(95)  # Cleaning up

        # Clean up to free memory
        del model
        del tokenizer
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        self._set_progress(100)  # Completed

        return results

    def get_current_progress(self):
        """Get the current evaluation progress.

        Returns:
            tuple: (current_evaluation, progress_percentage)
        """
        with self.progress_lock:
            return self.current_evaluation, self.progress

    def _determine_task_type(self, dataset):
        """Determine the task type based on dataset features.

        Args:
            dataset: HuggingFace dataset

        Returns:
            str: Task type ("question-answering", "code-generation",
            "classification", "text-generation" or "general")
        """
        features = dataset.features

        # Checks are ordered; the first matching pattern wins
        if "question" in features and "answer" in features:
            return "question-answering"
        if "code" in features or "solution" in features:
            return "code-generation"
        if "label" in features or "class" in features:
            return "classification"
        if "input" in features and "output" in features:
            return "text-generation"
        return "general"

    def _evaluate_text_generation(self, model, tokenizer, dataset):
        """Evaluate a model on text generation tasks.

        Args:
            model: HuggingFace model
            tokenizer: HuggingFace tokenizer
            dataset: HuggingFace dataset

        Returns:
            dict: Evaluation results with ``accuracy``,
            ``samples_evaluated`` and up to five ``generated_samples``.
        """
        generator = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            device="cpu"
        )

        # Sample a subset for evaluation (to keep runtime reasonable)
        dataset = self._take_subset(dataset, 100)

        correct = 0
        total = 0
        generated_texts = []

        for i, example in enumerate(dataset):
            self._report_loop_progress(i, len(dataset))

            input_text = example.get("input", example.get("prompt", ""))
            expected_output = example.get("output", example.get("target", ""))

            if not input_text or not expected_output:
                continue

            generated = generator(
                input_text,
                max_length=100,
                num_return_sequences=1
            )

            generated_text = generated[0]["generated_text"]
            generated_texts.append(generated_text)

            # Containment check on the completion only, so an answer that
            # merely appears in the echoed prompt is not counted as correct.
            completion = self._strip_prompt(input_text, generated_text)
            if expected_output.strip() in completion:
                correct += 1

            total += 1

        accuracy = correct / total if total > 0 else 0

        return {
            "accuracy": accuracy,
            "samples_evaluated": total,
            "generated_samples": generated_texts[:5]  # Include a few samples
        }

    def _evaluate_question_answering(self, model, tokenizer, dataset):
        """Evaluate a model on question answering tasks.

        Args:
            model: HuggingFace model
            tokenizer: HuggingFace tokenizer
            dataset: HuggingFace dataset

        Returns:
            dict: Evaluation results with ``exact_match``, ``f1`` and
            ``samples_evaluated``.
        """
        qa_pipeline = pipeline(
            "question-answering",
            model=model,
            tokenizer=tokenizer,
            device="cpu"
        )

        # Sample a subset for evaluation
        dataset = self._take_subset(dataset, 100)

        exact_matches = 0
        f1_scores = []
        total = 0

        for i, example in enumerate(dataset):
            self._report_loop_progress(i, len(dataset))

            question = example.get("question", "")
            context = example.get("context", "")
            answer = example.get("answer", "")

            if not question or not answer:
                continue

            # If no context provided, use the question as context
            result = qa_pipeline(question=question, context=context if context else question)
            predicted_answer = result["answer"]

            if predicted_answer.strip() == answer.strip():
                exact_matches += 1

            f1_scores.append(self._calculate_f1(answer, predicted_answer))

            total += 1

        exact_match_accuracy = exact_matches / total if total > 0 else 0
        avg_f1 = sum(f1_scores) / len(f1_scores) if f1_scores else 0

        return {
            "exact_match": exact_match_accuracy,
            "f1": avg_f1,
            "samples_evaluated": total
        }

    def _evaluate_classification(self, model, tokenizer, dataset):
        """Evaluate a model on classification tasks.

        Args:
            model: HuggingFace model
            tokenizer: HuggingFace tokenizer
            dataset: HuggingFace dataset

        Returns:
            dict: Evaluation results with ``accuracy`` and
            ``samples_evaluated``.
        """
        classifier = pipeline(
            "text-classification",
            model=model,
            tokenizer=tokenizer,
            device="cpu"
        )

        # Sample a subset for evaluation
        dataset = self._take_subset(dataset, 100)

        correct = 0
        total = 0

        for i, example in enumerate(dataset):
            self._report_loop_progress(i, len(dataset))

            text = example.get("text", example.get("sentence", ""))
            # Labels are compared as strings
            label = str(example.get("label", example.get("class", "")))

            if not text or not label:
                continue

            result = classifier(text)
            predicted_label = result[0]["label"]

            if str(predicted_label) == label:
                correct += 1

            total += 1

        accuracy = correct / total if total > 0 else 0

        return {
            "accuracy": accuracy,
            "samples_evaluated": total
        }

    def _evaluate_code_generation(self, model, tokenizer, dataset):
        """Evaluate a model on code generation tasks.

        Generated code is never executed. Anything short of an exact match
        earns at most partial credit from a keyword heuristic.

        Args:
            model: HuggingFace model
            tokenizer: HuggingFace tokenizer
            dataset: HuggingFace dataset

        Returns:
            dict: Evaluation results with ``exact_match``,
            ``functional_correctness`` and ``samples_evaluated``.
        """
        generator = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            device="cpu"
        )

        # Smaller sample for code tasks
        dataset = self._take_subset(dataset, 50)

        exact_matches = 0
        functional_matches = 0
        total = 0

        for i, example in enumerate(dataset):
            self._report_loop_progress(i, len(dataset))

            prompt = example.get("prompt", example.get("input", ""))
            solution = example.get("solution", example.get("output", ""))

            if not prompt or not solution:
                continue

            generated = generator(
                prompt,
                max_length=200,
                num_return_sequences=1
            )

            # Remove the echoed prompt only when it really is a prefix
            generated_code = self._strip_prompt(prompt, generated[0]["generated_text"]).strip()

            if generated_code == solution.strip():
                exact_matches += 1
                functional_matches += 1
            else:
                # We would ideally check functional correctness here
                # but that requires executing code which is complex and potentially unsafe
                # For now, we'll use a simple heuristic
                if len(generated_code) > 0 and any(keyword in generated_code for keyword in ["def ", "function", "return", "class"]):
                    functional_matches += 0.5  # Partial credit

            total += 1

        exact_match_rate = exact_matches / total if total > 0 else 0
        functional_correctness = functional_matches / total if total > 0 else 0

        return {
            "exact_match": exact_match_rate,
            "functional_correctness": functional_correctness,
            "samples_evaluated": total
        }

    def _evaluate_general(self, model, tokenizer, dataset):
        """General evaluation for any dataset type.

        Generates text for each example and returns samples. No accuracy
        metric is computed.

        Args:
            model: HuggingFace model
            tokenizer: HuggingFace tokenizer
            dataset: HuggingFace dataset

        Returns:
            dict: Evaluation results with ``samples_evaluated`` and up to
            five ``generated_samples``.
        """
        generator = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            device="cpu"
        )

        # Sample a subset for evaluation
        dataset = self._take_subset(dataset, 50)

        # Find input and output fields by conventional column names
        features = dataset.features
        input_field = None
        output_field = None

        for field in features:
            if field.lower() in ["input", "prompt", "question", "text"]:
                input_field = field
            elif field.lower() in ["output", "target", "answer", "response"]:
                output_field = field

        if not input_field:
            # Just use the first string field as input. Dataset feature
            # objects describe their type via ``dtype``; they are not str
            # instances themselves.
            for field in features:
                feature = features[field]
                if isinstance(feature, (str, list)) or getattr(feature, "dtype", None) in ("string", "large_string"):
                    input_field = field
                    break

        total = 0
        generated_texts = []

        for i, example in enumerate(dataset):
            self._report_loop_progress(i, len(dataset))

            if input_field and input_field in example:
                input_text = str(example[input_field])

                generated = generator(
                    input_text,
                    max_length=100,
                    num_return_sequences=1
                )

                generated_text = generated[0]["generated_text"]
                generated_texts.append({
                    "input": input_text,
                    "output": generated_text,
                    "expected": str(example[output_field]) if output_field and output_field in example else "N/A"
                })

                total += 1

        return {
            "samples_evaluated": total,
            "generated_samples": generated_texts[:5]  # Include a few samples
        }

    def _calculate_f1(self, answer, prediction):
        """Calculate token-level F1 score between answer and prediction.

        Tokens are lower-cased, whitespace-separated words. The overlap is
        counted as a multiset, so repeated tokens are matched one-for-one
        and identical strings always score 1.0.

        Args:
            answer: Ground truth answer
            prediction: Model prediction

        Returns:
            float: F1 score
        """
        from collections import Counter

        answer_tokens = answer.lower().split()
        prediction_tokens = prediction.lower().split()

        overlap = sum((Counter(answer_tokens) & Counter(prediction_tokens)).values())
        if overlap == 0:
            return 0.0

        precision = overlap / len(prediction_tokens)
        recall = overlap / len(answer_tokens)

        return 2 * precision * recall / (precision + recall)

    def _calculate_overall_score(self, results):
        """Calculate an overall score from evaluation results.

        Args:
            results: Evaluation results dictionary

        Returns:
            float: Overall score between 0 and 100. This is the mean of the
            known metrics present (each scaled by 100), 0.0 when the results
            carry an error, and 50.0 when no known metric is present.
        """
        if "error" in results:
            return 0.0

        present = [metric for metric in self._SCORE_METRICS if metric in results]
        if not present:
            # Default score if no metrics available
            return 50.0

        return sum(results[metric] * 100 for metric in present) / len(present)

    def submit_evaluation(self, model_id, benchmark_id, user_id, priority=0):
        """Submit a model for evaluation on a benchmark.

        Args:
            model_id: Model ID in the database
            benchmark_id: Benchmark ID in the database
            user_id: User ID submitting the evaluation
            priority: Queue priority (higher = higher priority)

        Returns:
            tuple: (evaluation_id, message); evaluation_id is None when the
            submission was rejected or failed.
        """
        # Check if user can submit today
        if not self.auth_manager.can_submit_benchmark(user_id):
            return None, "Daily submission limit reached. Try again tomorrow."

        try:
            # Get model HuggingFace ID to check size
            model_info = self.db_manager.get_model(model_id)
            if not model_info:
                return None, "Model not found in database."

            will_fit, message = self.check_model_size(model_info['hf_model_id'])
            if not will_fit:
                return None, message

            evaluation_id = self.db_manager.add_evaluation(
                model_id=model_id,
                benchmark_id=benchmark_id,
                priority=priority
            )

            # Update user's last submission date
            self.auth_manager.update_submission_date(user_id)

            # Make sure worker is running
            self.start_worker()

            return evaluation_id, f"Evaluation submitted successfully. {message}"
        except Exception as e:
            print(f"Submit evaluation error: {e}")
            return None, f"Failed to submit evaluation: {str(e)}"

    def get_queue_status(self):
        """Get the current status of the evaluation queue.

        Returns:
            dict: Counts per status, worker state, the current evaluation
            with its progress, and the memory limit. On failure the counts
            are zero and an ``error`` key carries the message.
        """
        try:
            counts = {
                status: len(self.db_manager.get_evaluation_results(status=status))
                for status in ("pending", "running", "completed", "failed")
            }

            current_eval, progress = self.get_current_progress()

            return {
                "pending": counts["pending"],
                "running": counts["running"],
                "completed": counts["completed"],
                "failed": counts["failed"],
                "is_processing": self.is_processing,
                "current_evaluation": current_eval,
                "progress": progress,
                "memory_limit_gb": self.memory_limit_gb
            }
        except Exception as e:
            print(f"Queue status error: {e}")
            return {
                "pending": 0,
                "running": 0,
                "completed": 0,
                "failed": 0,
                "is_processing": self.is_processing,
                "current_evaluation": None,
                "progress": 0,
                "memory_limit_gb": self.memory_limit_gb,
                "error": str(e)
            }
798
+
799
+ # Model submission UI components
800
  def create_model_submission_ui(evaluation_queue, auth_manager, db_manager):
801
  """Create the model submission UI components.
802
 
 
901
 
902
  def refresh_benchmarks_handler():
903
  benchmarks = db_manager.get_benchmarks()
904
+
905
  # Format for dropdown - properly formatted to display names
906
  choices = []
907
  for b in benchmarks: