Quazim0t0 committed on
Commit
3582217
verified
1 Parent(s): c73c631

Update evaluation_queue.py

Files changed (1)
  1. evaluation_queue.py +17 -799
evaluation_queue.py CHANGED
@@ -1,802 +1,8 @@
1
  """
2
- Model evaluation queue system for Dynamic Highscores.
3
-
4
- This module handles the evaluation queue, CPU-only processing,
5
- and enforces daily submission limits for users.
6
  """
7
 
8
- import os
9
- import json
10
- import time
11
- import threading
12
- import queue as queue_module
13
- from datetime import datetime, timedelta
14
- import gradio as gr
15
- from huggingface_hub import HfApi, hf_hub_download, snapshot_download, model_info
16
- from datasets import load_dataset
17
- import torch
18
- from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
19
- import sqlite3
20
-
21
- class EvaluationQueue:
22
- """Manages the evaluation queue for model benchmarking."""
23
-
24
- def __init__(self, db_manager, auth_manager):
25
- """Initialize the evaluation queue manager.
26
-
27
- Args:
28
- db_manager: Database manager instance
29
- auth_manager: Authentication manager instance
30
- """
31
- self.db_manager = db_manager
32
- self.auth_manager = auth_manager
33
- self.hf_api = HfApi()
34
- self.queue = queue_module.Queue()
35
- self.is_processing = False
36
- self.worker_thread = None
37
- self.model_tags = ["Merge", "Agent", "Reasoning", "Coding", "General", "Specialized", "Instruction", "Chat"]
38
- self.current_evaluation = None
39
- self.progress = 0
40
- self.progress_lock = threading.Lock()
41
- # Memory limit for models in GB (leave 2GB for system)
42
- self.memory_limit_gb = 14.0
43
-
44
- def start_worker(self):
45
- """Start the worker thread for processing the evaluation queue."""
46
- if self.worker_thread is None or not self.worker_thread.is_alive():
47
- self.is_processing = True
48
- self.worker_thread = threading.Thread(target=self._process_queue)
49
- self.worker_thread.daemon = True
50
- self.worker_thread.start()
51
-
52
- def stop_worker(self):
53
- """Stop the worker thread."""
54
- self.is_processing = False
55
- if self.worker_thread and self.worker_thread.is_alive():
56
- self.worker_thread.join(timeout=1.0)
57
-
58
- def check_model_size(self, model_id):
59
- """Check if a model will fit within RAM limitations.
60
-
61
- Args:
62
- model_id: HuggingFace model ID
63
-
64
- Returns:
65
- tuple: (will_fit, message)
66
- """
67
- try:
68
- # Query model info from the HuggingFace API
69
- model_info_obj = self.hf_api.model_info(model_id)
70
-
71
- # Check if model size information is available
72
- if hasattr(model_info_obj, 'safetensors') and model_info_obj.safetensors:
73
- # Calculate size in GB (divided by 1024^3)
74
- total_size_gb = sum(
75
- file.size for file in model_info_obj.safetensors.values()
76
- ) / (1024 * 1024 * 1024)
77
- elif hasattr(model_info_obj, 'siblings'):
78
- # Legacy method - calculate from file siblings
79
- total_size_gb = sum(
80
- sibling.size for sibling in model_info_obj.siblings
81
- if sibling.rfilename.endswith(('.bin', '.safetensors', '.pt'))
82
- ) / (1024 * 1024 * 1024)
83
- else:
84
- # Can't determine size
85
- return False, "Unable to determine model size. Please ensure model is under 14GB."
86
-
87
- # Account for memory overhead (tokenizer, processing, etc.)
88
- estimated_ram_needed = total_size_gb * 1.3 # 30% overhead
89
-
90
- # Check against limit
91
- if estimated_ram_needed > self.memory_limit_gb:
92
- return False, f"Model is too large (approximately {total_size_gb:.1f}GB, needs {estimated_ram_needed:.1f}GB RAM). Maximum allowed is {self.memory_limit_gb}GB."
93
-
94
- return True, f"Model size check passed ({total_size_gb:.1f}GB, estimated {estimated_ram_needed:.1f}GB RAM usage)"
95
-
96
- except Exception as e:
97
- print(f"Model size check error: {e}")
98
- # If we can't check, be cautious
99
- return False, f"Error checking model size: {str(e)}. Please ensure your model is under {self.memory_limit_gb}GB."
100
-
101
- def _process_queue(self):
102
- """Process the evaluation queue in a separate thread."""
103
- while self.is_processing:
104
- try:
105
- # Get the next evaluation from the database
106
- pending_evals = self.db_manager.get_evaluation_results(status="pending")
107
-
108
- if pending_evals:
109
- # Sort by priority and added_at
110
- next_eval = pending_evals[0]
111
-
112
- # Update status to running
113
- self.db_manager.update_evaluation_status(next_eval['id'], 'running')
114
-
115
- # Set current evaluation and reset progress
116
- with self.progress_lock:
117
- self.current_evaluation = next_eval
118
- self.progress = 0
119
-
120
- try:
121
- # Get model and benchmark details
122
- model_info = self.db_manager.get_model(next_eval['model_id'])
123
- benchmark_info = self.db_manager.get_benchmark(next_eval['benchmark_id'])
124
-
125
- if model_info and benchmark_info:
126
- # Check if model will fit in memory
127
- will_fit, message = self.check_model_size(model_info['hf_model_id'])
128
-
129
- if not will_fit:
130
- raise Exception(f"Model too large for evaluation: {message}")
131
-
132
- # Run the evaluation
133
- results = self._run_evaluation(
134
- model_info['hf_model_id'],
135
- benchmark_info['dataset_id']
136
- )
137
-
138
- # Calculate overall score
139
- score = self._calculate_overall_score(results)
140
-
141
- # Update status to completed with results
142
- self.db_manager.update_evaluation_status(
143
- next_eval['id'],
144
- 'completed',
145
- results=results,
146
- score=score
147
- )
148
- else:
149
- raise Exception("Model or benchmark not found")
150
- except Exception as e:
151
- print(f"Evaluation error: {e}")
152
- # Update status to failed with error message
153
- error_results = {"error": str(e)}
154
- self.db_manager.update_evaluation_status(
155
- next_eval['id'],
156
- 'failed',
157
- results=error_results
158
- )
159
-
160
- # Clear current evaluation
161
- with self.progress_lock:
162
- self.current_evaluation = None
163
- self.progress = 0
164
- else:
165
- # No evaluations in queue, sleep for a bit
166
- time.sleep(5)
167
- except Exception as e:
168
- print(f"Queue processing error: {e}")
169
- time.sleep(5)
170
-
171
- def _run_evaluation(self, model_id, dataset_id):
172
- """Run an evaluation for a model on a benchmark.
173
-
174
- Args:
175
- model_id: HuggingFace model ID
176
- dataset_id: HuggingFace dataset ID (with optional config)
177
-
178
- Returns:
179
- dict: Evaluation results
180
- """
181
- # Update progress
182
- with self.progress_lock:
183
- self.progress = 5 # Starting evaluation
184
-
185
- # Parse dataset ID and config
186
- if ":" in dataset_id:
187
- dataset_id, config = dataset_id.split(":", 1)
188
- else:
189
- config = None
190
-
191
- # Update progress
192
- with self.progress_lock:
193
- self.progress = 10 # Loading dataset
194
-
195
- # Load the dataset
196
- try:
197
- if config:
198
- dataset = load_dataset(dataset_id, config, split="test")
199
- else:
200
- dataset = load_dataset(dataset_id, split="test")
201
- except Exception as e:
202
- return {"error": f"Failed to load dataset: {str(e)}"}
203
-
204
- # Update progress
205
- with self.progress_lock:
206
- self.progress = 20 # Loading model
207
-
208
- try:
209
- # Load the model with memory optimization settings
210
- device = "cpu"
211
- model = AutoModelForCausalLM.from_pretrained(
212
- model_id,
213
- device_map=device,
214
- torch_dtype=torch.float32, # Use float32 for CPU
215
- low_cpu_mem_usage=True, # Enable memory optimization
216
- offload_folder="offload", # Enable offloading if needed
217
- offload_state_dict=True, # Offload state dict for memory saving
218
- max_memory={0: f"{self.memory_limit_gb}GB"} # Limit memory usage
219
- )
220
- tokenizer = AutoTokenizer.from_pretrained(model_id)
221
- except Exception as e:
222
- print(f"Model loading error: {e}")
223
- return {"error": f"Failed to load model: {str(e)}"}
224
-
225
- # Update progress
226
- with self.progress_lock:
227
- self.progress = 30 # Determining task type
228
-
229
- # Determine task type based on dataset features
230
- task_type = self._determine_task_type(dataset)
231
-
232
- # Update progress
233
- with self.progress_lock:
234
- self.progress = 40 # Starting evaluation
235
-
236
- try:
237
- # Run appropriate evaluation based on task type
238
- if task_type == "text-generation":
239
- results = self._evaluate_text_generation(model, tokenizer, dataset)
240
- elif task_type == "question-answering":
241
- results = self._evaluate_question_answering(model, tokenizer, dataset)
242
- elif task_type == "classification":
243
- results = self._evaluate_classification(model, tokenizer, dataset)
244
- elif task_type == "code-generation":
245
- results = self._evaluate_code_generation(model, tokenizer, dataset)
246
- else:
247
- # Default to general evaluation
248
- results = self._evaluate_general(model, tokenizer, dataset)
249
- except Exception as e:
250
- print(f"Evaluation task error: {e}")
251
- return {"error": f"Evaluation failed: {str(e)}"}
252
-
253
- # Update progress
254
- with self.progress_lock:
255
- self.progress = 95 # Cleaning up
256
-
257
- # Clean up to free memory
258
- del model
259
- del tokenizer
260
- if torch.cuda.is_available():
261
- torch.cuda.empty_cache()
262
-
263
- # Update progress
264
- with self.progress_lock:
265
- self.progress = 100 # Completed
266
-
267
- return results
268
-
269
- def get_current_progress(self):
270
- """Get the current evaluation progress.
271
-
272
- Returns:
273
- tuple: (current_evaluation, progress_percentage)
274
- """
275
- with self.progress_lock:
276
- return self.current_evaluation, self.progress
277
-
278
- def _determine_task_type(self, dataset):
279
- """Determine the task type based on dataset features.
280
-
281
- Args:
282
- dataset: HuggingFace dataset
283
-
284
- Returns:
285
- str: Task type
286
- """
287
- features = dataset.features
288
-
289
- # Check for common feature patterns
290
- if "question" in features and "answer" in features:
291
- return "question-answering"
292
- elif "code" in features or "solution" in features:
293
- return "code-generation"
294
- elif "label" in features or "class" in features:
295
- return "classification"
296
- elif "input" in features and "output" in features:
297
- return "text-generation"
298
- else:
299
- return "general"
300
-
301
- def _evaluate_text_generation(self, model, tokenizer, dataset):
302
- """Evaluate a model on text generation tasks.
303
-
304
- Args:
305
- model: HuggingFace model
306
- tokenizer: HuggingFace tokenizer
307
- dataset: HuggingFace dataset
308
-
309
- Returns:
310
- dict: Evaluation results
311
- """
312
- # Set up generation pipeline
313
- generator = pipeline(
314
- "text-generation",
315
- model=model,
316
- tokenizer=tokenizer,
317
- device="cpu"
318
- )
319
-
320
- # Sample a subset for evaluation (to keep runtime reasonable)
321
- if len(dataset) > 100:
322
- dataset = dataset.select(range(100))
323
-
324
- # Track metrics
325
- correct = 0
326
- total = 0
327
- generated_texts = []
328
-
329
- # Process each example
330
- for i, example in enumerate(dataset):
331
- # Update progress based on completion percentage
332
- with self.progress_lock:
333
- self.progress = 40 + int((i / len(dataset)) * 50)
334
-
335
- input_text = example.get("input", example.get("prompt", ""))
336
- expected_output = example.get("output", example.get("target", ""))
337
-
338
- if not input_text or not expected_output:
339
- continue
340
-
341
- # Generate text
342
- generated = generator(
343
- input_text,
344
- max_length=100,
345
- num_return_sequences=1
346
- )
347
-
348
- generated_text = generated[0]["generated_text"]
349
- generated_texts.append(generated_text)
350
-
351
- # Simple exact match check
352
- if expected_output.strip() in generated_text:
353
- correct += 1
354
-
355
- total += 1
356
-
357
- # Calculate metrics
358
- accuracy = correct / total if total > 0 else 0
359
-
360
- return {
361
- "accuracy": accuracy,
362
- "samples_evaluated": total,
363
- "generated_samples": generated_texts[:5] # Include a few samples
364
- }
365
-
366
- def _evaluate_question_answering(self, model, tokenizer, dataset):
367
- """Evaluate a model on question answering tasks.
368
-
369
- Args:
370
- model: HuggingFace model
371
- tokenizer: HuggingFace tokenizer
372
- dataset: HuggingFace dataset
373
-
374
- Returns:
375
- dict: Evaluation results
376
- """
377
- # Set up QA pipeline
378
- qa_pipeline = pipeline(
379
- "question-answering",
380
- model=model,
381
- tokenizer=tokenizer,
382
- device="cpu"
383
- )
384
-
385
- # Sample a subset for evaluation
386
- if len(dataset) > 100:
387
- dataset = dataset.select(range(100))
388
-
389
- # Track metrics
390
- exact_matches = 0
391
- f1_scores = []
392
- total = 0
393
-
394
- # Process each example
395
- for i, example in enumerate(dataset):
396
- # Update progress based on completion percentage
397
- with self.progress_lock:
398
- self.progress = 40 + int((i / len(dataset)) * 50)
399
-
400
- question = example.get("question", "")
401
- context = example.get("context", "")
402
- answer = example.get("answer", "")
403
-
404
- if not question or not answer:
405
- continue
406
-
407
- # Get model prediction
408
- if context:
409
- result = qa_pipeline(question=question, context=context)
410
- else:
411
- # If no context provided, use the question as context
412
- result = qa_pipeline(question=question, context=question)
413
-
414
- predicted_answer = result["answer"]
415
-
416
- # Calculate exact match
417
- if predicted_answer.strip() == answer.strip():
418
- exact_matches += 1
419
-
420
- # Calculate F1 score
421
- f1 = self._calculate_f1(answer, predicted_answer)
422
- f1_scores.append(f1)
423
-
424
- total += 1
425
-
426
- # Calculate metrics
427
- exact_match_accuracy = exact_matches / total if total > 0 else 0
428
- avg_f1 = sum(f1_scores) / len(f1_scores) if f1_scores else 0
429
-
430
- return {
431
- "exact_match": exact_match_accuracy,
432
- "f1": avg_f1,
433
- "samples_evaluated": total
434
- }
435
-
436
- def _evaluate_classification(self, model, tokenizer, dataset):
437
- """Evaluate a model on classification tasks.
438
-
439
- Args:
440
- model: HuggingFace model
441
- tokenizer: HuggingFace tokenizer
442
- dataset: HuggingFace dataset
443
-
444
- Returns:
445
- dict: Evaluation results
446
- """
447
- # Set up classification pipeline
448
- classifier = pipeline(
449
- "text-classification",
450
- model=model,
451
- tokenizer=tokenizer,
452
- device="cpu"
453
- )
454
-
455
- # Sample a subset for evaluation
456
- if len(dataset) > 100:
457
- dataset = dataset.select(range(100))
458
-
459
- # Track metrics
460
- correct = 0
461
- total = 0
462
-
463
- # Process each example
464
- for i, example in enumerate(dataset):
465
- # Update progress based on completion percentage
466
- with self.progress_lock:
467
- self.progress = 40 + int((i / len(dataset)) * 50)
468
-
469
- text = example.get("text", example.get("sentence", ""))
470
- label = str(example.get("label", example.get("class", "")))
471
-
472
- if not text or not label:
473
- continue
474
-
475
- # Get model prediction
476
- result = classifier(text)
477
- predicted_label = result[0]["label"]
478
-
479
- # Check if correct
480
- if str(predicted_label) == label:
481
- correct += 1
482
-
483
- total += 1
484
-
485
- # Calculate metrics
486
- accuracy = correct / total if total > 0 else 0
487
-
488
- return {
489
- "accuracy": accuracy,
490
- "samples_evaluated": total
491
- }
492
-
493
- def _evaluate_code_generation(self, model, tokenizer, dataset):
494
- """Evaluate a model on code generation tasks.
495
-
496
- Args:
497
- model: HuggingFace model
498
- tokenizer: HuggingFace tokenizer
499
- dataset: HuggingFace dataset
500
-
501
- Returns:
502
- dict: Evaluation results
503
- """
504
- # Set up generation pipeline
505
- generator = pipeline(
506
- "text-generation",
507
- model=model,
508
- tokenizer=tokenizer,
509
- device="cpu"
510
- )
511
-
512
- # Sample a subset for evaluation
513
- if len(dataset) > 50: # Smaller sample for code tasks
514
- dataset = dataset.select(range(50))
515
-
516
- # Track metrics
517
- exact_matches = 0
518
- functional_matches = 0
519
- total = 0
520
-
521
- # Process each example
522
- for i, example in enumerate(dataset):
523
- # Update progress based on completion percentage
524
- with self.progress_lock:
525
- self.progress = 40 + int((i / len(dataset)) * 50)
526
-
527
- prompt = example.get("prompt", example.get("input", ""))
528
- solution = example.get("solution", example.get("output", ""))
529
-
530
- if not prompt or not solution:
531
- continue
532
-
533
- # Generate code
534
- generated = generator(
535
- prompt,
536
- max_length=200,
537
- num_return_sequences=1
538
- )
539
-
540
- generated_code = generated[0]["generated_text"]
541
-
542
- # Extract code from generated text (remove prompt)
543
- if prompt in generated_code:
544
- generated_code = generated_code[len(prompt):].strip()
545
-
546
- # Check exact match
547
- if generated_code.strip() == solution.strip():
548
- exact_matches += 1
549
- functional_matches += 1
550
- else:
551
- # We would ideally check functional correctness here
552
- # but that requires executing code which is complex and potentially unsafe
553
- # For now, we'll use a simple heuristic
554
- if len(generated_code) > 0 and any(keyword in generated_code for keyword in ["def ", "function", "return", "class"]):
555
- functional_matches += 0.5 # Partial credit
556
-
557
- total += 1
558
-
559
- # Calculate metrics
560
- exact_match_rate = exact_matches / total if total > 0 else 0
561
- functional_correctness = functional_matches / total if total > 0 else 0
562
-
563
- return {
564
- "exact_match": exact_match_rate,
565
- "functional_correctness": functional_correctness,
566
- "samples_evaluated": total
567
- }
568
-
569
- def _evaluate_general(self, model, tokenizer, dataset):
570
- """General evaluation for any dataset type.
571
-
572
- Args:
573
- model: HuggingFace model
574
- tokenizer: HuggingFace tokenizer
575
- dataset: HuggingFace dataset
576
-
577
- Returns:
578
- dict: Evaluation results
579
- """
580
- # Set up generation pipeline
581
- generator = pipeline(
582
- "text-generation",
583
- model=model,
584
- tokenizer=tokenizer,
585
- device="cpu"
586
- )
587
-
588
- # Sample a subset for evaluation
589
- if len(dataset) > 50:
590
- dataset = dataset.select(range(50))
591
-
592
- # Find input and output fields
593
- features = dataset.features
594
- input_field = None
595
- output_field = None
596
-
597
- for field in features:
598
- if field.lower() in ["input", "prompt", "question", "text"]:
599
- input_field = field
600
- elif field.lower() in ["output", "target", "answer", "response"]:
601
- output_field = field
602
-
603
- if not input_field:
604
- # Just use the first string field as input
605
- for field in features:
606
- if isinstance(features[field], (str, list)):
607
- input_field = field
608
- break
609
-
610
- # Track metrics
611
- total = 0
612
- generated_texts = []
613
-
614
- # Process each example
615
- for i, example in enumerate(dataset):
616
- # Update progress based on completion percentage
617
- with self.progress_lock:
618
- self.progress = 40 + int((i / len(dataset)) * 50)
619
-
620
- if input_field and input_field in example:
621
- input_text = str(example[input_field])
622
-
623
- # Generate text
624
- generated = generator(
625
- input_text,
626
- max_length=100,
627
- num_return_sequences=1
628
- )
629
-
630
- generated_text = generated[0]["generated_text"]
631
- generated_texts.append({
632
- "input": input_text,
633
- "output": generated_text,
634
- "expected": str(example[output_field]) if output_field and output_field in example else "N/A"
635
- })
636
-
637
- total += 1
638
-
639
- return {
640
- "samples_evaluated": total,
641
- "generated_samples": generated_texts[:5] # Include a few samples
642
- }
643
-
644
- def _calculate_f1(self, answer, prediction):
645
- """Calculate F1 score between answer and prediction.
646
-
647
- Args:
648
- answer: Ground truth answer
649
- prediction: Model prediction
650
-
651
- Returns:
652
- float: F1 score
653
- """
654
- # Tokenize
655
- answer_tokens = answer.lower().split()
656
- prediction_tokens = prediction.lower().split()
657
-
658
- # Calculate precision and recall
659
- common_tokens = set(answer_tokens) & set(prediction_tokens)
660
-
661
- if not common_tokens:
662
- return 0.0
663
-
664
- precision = len(common_tokens) / len(prediction_tokens)
665
- recall = len(common_tokens) / len(answer_tokens)
666
-
667
- # Calculate F1
668
- if precision + recall == 0:
669
- return 0.0
670
-
671
- f1 = 2 * precision * recall / (precision + recall)
672
- return f1
673
-
674
- def _calculate_overall_score(self, results):
675
- """Calculate an overall score from evaluation results.
676
-
677
- Args:
678
- results: Evaluation results dictionary
679
-
680
- Returns:
681
- float: Overall score between 0 and 100
682
- """
683
- # If there was an error, return a low score
684
- if "error" in results:
685
- return 0.0
686
-
687
- score = 0.0
688
-
689
- # Check for common metrics and weight them
690
- if "accuracy" in results:
691
- score += results["accuracy"] * 100
692
-
693
- if "exact_match" in results:
694
- score += results["exact_match"] * 100
695
-
696
- if "f1" in results:
697
- score += results["f1"] * 100
698
-
699
- if "functional_correctness" in results:
700
- score += results["functional_correctness"] * 100
701
-
702
- # If multiple metrics were found, average them
703
- num_metrics = sum(1 for metric in ["accuracy", "exact_match", "f1", "functional_correctness"] if metric in results)
704
-
705
- if num_metrics > 0:
706
- score /= num_metrics
707
- else:
708
- # Default score if no metrics available
709
- score = 50.0
710
-
711
- return score
712
-
713
- def submit_evaluation(self, model_id, benchmark_id, user_id, priority=0):
714
- """Submit a model for evaluation on a benchmark.
715
-
716
- Args:
717
- model_id: Model ID in the database
718
- benchmark_id: Benchmark ID in the database
719
- user_id: User ID submitting the evaluation
720
- priority: Queue priority (higher = higher priority)
721
-
722
- Returns:
723
- tuple: (evaluation_id, message)
724
- """
725
- # Check if user can submit today
726
- if not self.auth_manager.can_submit_benchmark(user_id):
727
- return None, "Daily submission limit reached. Try again tomorrow."
728
-
729
- try:
730
- # Get model HuggingFace ID to check size
731
- model_info = self.db_manager.get_model(model_id)
732
- if not model_info:
733
- return None, "Model not found in database."
734
-
735
- # Check if model will fit in memory
736
- will_fit, message = self.check_model_size(model_info['hf_model_id'])
737
-
738
- if not will_fit:
739
- return None, message
740
-
741
- # Add evaluation to database and queue
742
- evaluation_id = self.db_manager.add_evaluation(
743
- model_id=model_id,
744
- benchmark_id=benchmark_id,
745
- priority=priority
746
- )
747
-
748
- # Update user's last submission date
749
- self.auth_manager.update_submission_date(user_id)
750
-
751
- # Make sure worker is running
752
- self.start_worker()
753
-
754
- return evaluation_id, f"Evaluation submitted successfully. {message}"
755
- except Exception as e:
756
- print(f"Submit evaluation error: {e}")
757
- return None, f"Failed to submit evaluation: {str(e)}"
758
-
759
- def get_queue_status(self):
760
- """Get the current status of the evaluation queue.
761
-
762
- Returns:
763
- dict: Queue status information
764
- """
765
- try:
766
- # Get evaluations from database
767
- pending_evals = self.db_manager.get_evaluation_results(status="pending")
768
- running_evals = self.db_manager.get_evaluation_results(status="running")
769
- completed_evals = self.db_manager.get_evaluation_results(status="completed")
770
- failed_evals = self.db_manager.get_evaluation_results(status="failed")
771
-
772
- # Get current evaluation progress
773
- current_eval, progress = self.get_current_progress()
774
-
775
- return {
776
- "pending": len(pending_evals),
777
- "running": len(running_evals),
778
- "completed": len(completed_evals),
779
- "failed": len(failed_evals),
780
- "is_processing": self.is_processing,
781
- "current_evaluation": current_eval,
782
- "progress": progress,
783
- "memory_limit_gb": self.memory_limit_gb
784
- }
785
- except Exception as e:
786
- print(f"Queue status error: {e}")
787
- return {
788
- "pending": 0,
789
- "running": 0,
790
- "completed": 0,
791
- "failed": 0,
792
- "is_processing": self.is_processing,
793
- "current_evaluation": None,
794
- "progress": 0,
795
- "memory_limit_gb": self.memory_limit_gb,
796
- "error": str(e)
797
- }
798
-
799
- # Model submission UI components
800
  def create_model_submission_ui(evaluation_queue, auth_manager, db_manager):
801
  """Create the model submission UI components.
802
 
@@ -850,9 +56,12 @@ def create_model_submission_ui(evaluation_queue, auth_manager, db_manager):
850
  info="Select one category that best describes your model"
851
  )
852
 
  benchmark_dropdown = gr.Dropdown(
854
  label="Benchmark",
855
- info="Select a benchmark to evaluate your model on"
856
  )
857
 
858
  refresh_benchmarks_button = gr.Button("Refresh Benchmarks")
@@ -899,8 +108,14 @@ def create_model_submission_ui(evaluation_queue, auth_manager, db_manager):
899
  def refresh_benchmarks_handler():
900
  benchmarks = db_manager.get_benchmarks()
901
 
902
- # Format for dropdown
903
- choices = [(str(b["id"]), b["name"]) for b in benchmarks]
904
 
905
  return gr.update(choices=choices)
906
 
@@ -914,6 +129,9 @@ def create_model_submission_ui(evaluation_queue, auth_manager, db_manager):
914
  if not model_id or not model_name or not model_tag or not benchmark_id:
915
  return "Please fill in all required fields."
916
 
917
  try:
918
  # Check if model will fit in RAM
919
  will_fit, size_message = evaluation_queue.check_model_size(model_id)
 
1
  """
2
+ Updated create_model_submission_ui function that properly displays benchmark names in dropdown.
3
+ Replace this function in your evaluation_queue.py file.
4
  """
5
 
6
  def create_model_submission_ui(evaluation_queue, auth_manager, db_manager):
7
  """Create the model submission UI components.
8
 
 
56
  info="Select one category that best describes your model"
57
  )
58
 
59
+ # Fixed benchmark dropdown to properly show names
60
  benchmark_dropdown = gr.Dropdown(
61
  label="Benchmark",
62
+ info="Select a benchmark to evaluate your model on",
63
+ choices=[("none", "Loading benchmarks...")],
64
+ value=None
65
  )
66
 
67
  refresh_benchmarks_button = gr.Button("Refresh Benchmarks")
 
108
  def refresh_benchmarks_handler():
109
  benchmarks = db_manager.get_benchmarks()
110
 
111
+ # Format for dropdown - properly formatted to display names
112
+ choices = []
113
+ for b in benchmarks:
114
+ # Add as tuple of (id, name) to ensure proper display
115
+ choices.append((str(b["id"]), b["name"]))
116
+
117
+ if not choices:
118
+ choices = [("none", "No benchmarks available - add some first")]
119
 
120
  return gr.update(choices=choices)
121
 
 
129
  if not model_id or not model_name or not model_tag or not benchmark_id:
130
  return "Please fill in all required fields."
131
 
132
+ if benchmark_id == "none":
133
+ return "Please select a valid benchmark."
134
+
135
  try:
136
  # Check if model will fit in RAM
137
  will_fit, size_message = evaluation_queue.check_model_size(model_id)
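
For context, here is a minimal stand-alone sketch of the dropdown-refresh pattern this commit installs, pulled out of the diff so it can be run on its own. It is not part of the commit: SAMPLE_BENCHMARKS is an invented stand-in for db_manager.get_benchmarks(), and the sketch assumes a Gradio release that accepts tuple entries in Dropdown choices and still supports gr.update.

import gradio as gr

# Hypothetical stand-in for db_manager.get_benchmarks()
SAMPLE_BENCHMARKS = [
    {"id": 1, "name": "Example Benchmark A"},
    {"id": 2, "name": "Example Benchmark B"},
]

def refresh_benchmarks_handler():
    benchmarks = SAMPLE_BENCHMARKS
    # Pair each benchmark id with its name, exactly as the committed handler does
    choices = [(str(b["id"]), b["name"]) for b in benchmarks]
    if not choices:
        choices = [("none", "No benchmarks available - add some first")]
    return gr.update(choices=choices)

with gr.Blocks() as demo:
    benchmark_dropdown = gr.Dropdown(
        label="Benchmark",
        info="Select a benchmark to evaluate your model on",
        choices=[("none", "Loading benchmarks...")],
        value=None,
    )
    refresh_button = gr.Button("Refresh Benchmarks")
    refresh_button.click(refresh_benchmarks_handler, outputs=benchmark_dropdown)

if __name__ == "__main__":
    demo.launch()

Swapping SAMPLE_BENCHMARKS for the real db_manager.get_benchmarks() call reproduces the wiring that create_model_submission_ui sets up in the full Space.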