Quazim0t0 committed
Commit 7c42b46 · verified · 1 parent: b9e27dc

Delete evaluation_queue.py

Files changed (1)
  1. evaluation_queue.py +0 -964
evaluation_queue.py DELETED
@@ -1,964 +0,0 @@
- """
- Model evaluation queue system for Dynamic Highscores.
-
- This module handles the evaluation queue, CPU-only processing,
- and enforces daily submission limits for users.
- """
-
- import os
- import json
- import time
- import threading
- import queue
- from datetime import datetime, timedelta
- import gradio as gr
- from huggingface_hub import HfApi, hf_hub_download, snapshot_download
- from datasets import load_dataset
- import torch
- from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
- import sqlite3
-
- class EvaluationQueue:
-     """Manages the evaluation queue for model benchmarking."""
-
-     def __init__(self, db_manager, auth_manager):
-         """Initialize the evaluation queue manager.
-
-         Args:
-             db_manager: Database manager instance
-             auth_manager: Authentication manager instance
-         """
-         self.db_manager = db_manager
-         self.auth_manager = auth_manager
-         self.hf_api = HfApi()
-         self.queue = queue.Queue()
-         self.is_processing = False
-         self.worker_thread = None
-         self.model_tags = ["Merge", "Agent", "Reasoning", "Coding", "General", "Specialized", "Instruction", "Chat"]
-         self.current_evaluation = None
-         self.progress = 0
-         self.progress_lock = threading.Lock()
-         self.db_path = db_manager.db_path # Store the path to create new connections in worker thread
-
-     def start_worker(self):
-         """Start the worker thread for processing the evaluation queue."""
-         if self.worker_thread is None or not self.worker_thread.is_alive():
-             self.is_processing = True
-             self.worker_thread = threading.Thread(target=self._process_queue)
-             self.worker_thread.daemon = True
-             self.worker_thread.start()
-
-     def stop_worker(self):
-         """Stop the worker thread."""
-         self.is_processing = False
-         if self.worker_thread and self.worker_thread.is_alive():
-             self.worker_thread.join(timeout=1.0)
-
-     def _process_queue(self):
-         """Process the evaluation queue in a separate thread."""
-         # Create a new database connection for this thread
-         thread_db = sqlite3.connect(self.db_path)
-         thread_db.row_factory = sqlite3.Row
-
-         while self.is_processing:
-             try:
-                 # Get the next evaluation from the database using thread-local connection
-                 cursor = thread_db.cursor()
-                 cursor.execute("""
-                     SELECT e.id as evaluation_id, e.model_id, e.benchmark_id, m.hf_model_id, b.dataset_id
-                     FROM queue q
-                     JOIN evaluations e ON q.evaluation_id = e.id
-                     JOIN models m ON e.model_id = m.id
-                     JOIN benchmarks b ON e.benchmark_id = b.id
-                     WHERE e.status = 'pending'
-                     ORDER BY q.priority DESC, q.created_at ASC
-                     LIMIT 1
-                 """)
-                 row = cursor.fetchone()
-
-                 if row:
-                     next_eval = dict(row)
-
-                     # Update status to running
-                     cursor.execute("""
-                         UPDATE evaluations
-                         SET status = 'running', started_at = datetime('now')
-                         WHERE id = ?
-                     """, (next_eval['evaluation_id'],))
-                     thread_db.commit()
-
-                     # Set current evaluation and reset progress
-                     with self.progress_lock:
-                         self.current_evaluation = next_eval
-                         self.progress = 0
-
-                     try:
-                         # Run the evaluation
-                         results = self._run_evaluation(
-                             next_eval['hf_model_id'],
-                             next_eval['dataset_id']
-                         )
-
-                         # Calculate overall score
-                         score = self._calculate_overall_score(results)
-
-                         # Update status to completed with results
-                         cursor.execute("""
-                             UPDATE evaluations
-                             SET status = 'completed',
-                                 completed_at = datetime('now'),
-                                 results = ?,
-                                 score = ?
-                             WHERE id = ?
-                         """, (json.dumps(results), score, next_eval['evaluation_id']))
-                         thread_db.commit()
-                     except Exception as e:
-                         print(f"Evaluation error: {e}")
-                         # Update status to failed
-                         cursor.execute("""
-                             UPDATE evaluations
-                             SET status = 'failed', completed_at = datetime('now')
-                             WHERE id = ?
-                         """, (next_eval['evaluation_id'],))
-                         thread_db.commit()
-
-                     # Clear current evaluation
-                     with self.progress_lock:
-                         self.current_evaluation = None
-                         self.progress = 0
-                 else:
-                     # No evaluations in queue, sleep for a bit
-                     time.sleep(5)
-             except Exception as e:
-                 print(f"Queue processing error: {e}")
-                 time.sleep(5)
-
-         # Close the thread-local database connection
-         thread_db.close()
-
-     def _run_evaluation(self, model_id, dataset_id):
-         """Run an evaluation for a model on a benchmark.
-
-         Args:
-             model_id: HuggingFace model ID
-             dataset_id: HuggingFace dataset ID (with optional config)
-
-         Returns:
-             dict: Evaluation results
-         """
-         # Update progress
-         with self.progress_lock:
-             self.progress = 5 # Starting evaluation
-
-         # Parse dataset ID and config
-         if ":" in dataset_id:
-             dataset_id, config = dataset_id.split(":", 1)
-         else:
-             config = None
-
-         # Update progress
-         with self.progress_lock:
-             self.progress = 10 # Loading dataset
-
-         # Load the dataset
-         if config:
-             dataset = load_dataset(dataset_id, config, split="test")
-         else:
-             dataset = load_dataset(dataset_id, split="test")
-
-         # Update progress
-         with self.progress_lock:
-             self.progress = 20 # Loading model
-
-         # Load the model (CPU only)
-         device = "cpu"
-         model = AutoModelForCausalLM.from_pretrained(
-             model_id,
-             device_map=device,
-             torch_dtype=torch.float32, # Use float32 for CPU
-             low_cpu_mem_usage=True
-         )
-         tokenizer = AutoTokenizer.from_pretrained(model_id)
-
-         # Update progress
-         with self.progress_lock:
-             self.progress = 30 # Determining task type
-
-         # Determine task type based on dataset features
-         task_type = self._determine_task_type(dataset)
-
-         # Update progress
-         with self.progress_lock:
-             self.progress = 40 # Starting evaluation
-
-         # Run appropriate evaluation based on task type
-         if task_type == "text-generation":
-             results = self._evaluate_text_generation(model, tokenizer, dataset)
-         elif task_type == "question-answering":
-             results = self._evaluate_question_answering(model, tokenizer, dataset)
-         elif task_type == "classification":
-             results = self._evaluate_classification(model, tokenizer, dataset)
-         elif task_type == "code-generation":
-             results = self._evaluate_code_generation(model, tokenizer, dataset)
-         else:
-             # Default to general evaluation
-             results = self._evaluate_general(model, tokenizer, dataset)
-
-         # Update progress
-         with self.progress_lock:
-             self.progress = 95 # Cleaning up
-
-         # Clean up to free memory
-         del model
-         del tokenizer
-         torch.cuda.empty_cache()
-
-         # Update progress
-         with self.progress_lock:
-             self.progress = 100 # Completed
-
-         return results
-
-     def get_current_progress(self):
-         """Get the current evaluation progress.
-
-         Returns:
-             tuple: (current_evaluation, progress_percentage)
-         """
-         with self.progress_lock:
-             return self.current_evaluation, self.progress
-
-     def _determine_task_type(self, dataset):
-         """Determine the task type based on dataset features.
-
-         Args:
-             dataset: HuggingFace dataset
-
-         Returns:
-             str: Task type
-         """
-         features = dataset.features
-
-         # Check for common feature patterns
-         if "question" in features and "answer" in features:
-             return "question-answering"
-         elif "code" in features or "solution" in features:
-             return "code-generation"
-         elif "label" in features or "class" in features:
-             return "classification"
-         elif "input" in features and "output" in features:
-             return "text-generation"
-         else:
-             return "general"
-
-     def _evaluate_text_generation(self, model, tokenizer, dataset):
-         """Evaluate a model on text generation tasks.
-
-         Args:
-             model: HuggingFace model
-             tokenizer: HuggingFace tokenizer
-             dataset: HuggingFace dataset
-
-         Returns:
-             dict: Evaluation results
-         """
-         # Set up generation pipeline
-         generator = pipeline(
-             "text-generation",
-             model=model,
-             tokenizer=tokenizer,
-             device="cpu"
-         )
-
-         # Sample a subset for evaluation (to keep runtime reasonable)
-         if len(dataset) > 100:
-             dataset = dataset.select(range(100))
-
-         # Track metrics
-         correct = 0
-         total = 0
-         generated_texts = []
-
-         # Process each example
-         for i, example in enumerate(dataset):
-             # Update progress based on completion percentage
-             with self.progress_lock:
-                 self.progress = 40 + int((i / len(dataset)) * 50)
-
-             input_text = example.get("input", example.get("prompt", ""))
-             expected_output = example.get("output", example.get("target", ""))
-
-             if not input_text or not expected_output:
-                 continue
-
-             # Generate text
-             generated = generator(
-                 input_text,
-                 max_length=100,
-                 num_return_sequences=1
-             )
-
-             generated_text = generated[0]["generated_text"]
-             generated_texts.append(generated_text)
-
-             # Simple exact match check
-             if expected_output.strip() in generated_text:
-                 correct += 1
-
-             total += 1
-
-         # Calculate metrics
-         accuracy = correct / total if total > 0 else 0
-
-         return {
-             "accuracy": accuracy,
-             "samples_evaluated": total,
-             "generated_samples": generated_texts[:5] # Include a few samples
-         }
-
-     def _evaluate_question_answering(self, model, tokenizer, dataset):
-         """Evaluate a model on question answering tasks.
-
-         Args:
-             model: HuggingFace model
-             tokenizer: HuggingFace tokenizer
-             dataset: HuggingFace dataset
-
-         Returns:
-             dict: Evaluation results
-         """
-         # Set up QA pipeline
-         qa_pipeline = pipeline(
-             "question-answering",
-             model=model,
-             tokenizer=tokenizer,
-             device="cpu"
-         )
-
-         # Sample a subset for evaluation
-         if len(dataset) > 100:
-             dataset = dataset.select(range(100))
-
-         # Track metrics
-         exact_matches = 0
-         f1_scores = []
-         total = 0
-
-         # Process each example
-         for i, example in enumerate(dataset):
-             # Update progress based on completion percentage
-             with self.progress_lock:
-                 self.progress = 40 + int((i / len(dataset)) * 50)
-
-             question = example.get("question", "")
-             context = example.get("context", "")
-             answer = example.get("answer", "")
-
-             if not question or not answer:
-                 continue
-
-             # Get model prediction
-             if context:
-                 result = qa_pipeline(question=question, context=context)
-             else:
-                 # If no context provided, use the question as context
-                 result = qa_pipeline(question=question, context=question)
-
-             predicted_answer = result["answer"]
-
-             # Calculate exact match
-             if predicted_answer.strip() == answer.strip():
-                 exact_matches += 1
-
-             # Calculate F1 score
-             f1 = self._calculate_f1(answer, predicted_answer)
-             f1_scores.append(f1)
-
-             total += 1
-
-         # Calculate metrics
-         exact_match_accuracy = exact_matches / total if total > 0 else 0
-         avg_f1 = sum(f1_scores) / len(f1_scores) if f1_scores else 0
-
-         return {
-             "exact_match": exact_match_accuracy,
-             "f1": avg_f1,
-             "samples_evaluated": total
-         }
-
-     def _evaluate_classification(self, model, tokenizer, dataset):
-         """Evaluate a model on classification tasks.
-
-         Args:
-             model: HuggingFace model
-             tokenizer: HuggingFace tokenizer
-             dataset: HuggingFace dataset
-
-         Returns:
-             dict: Evaluation results
-         """
-         # Set up classification pipeline
-         classifier = pipeline(
-             "text-classification",
-             model=model,
-             tokenizer=tokenizer,
-             device="cpu"
-         )
-
-         # Sample a subset for evaluation
-         if len(dataset) > 100:
-             dataset = dataset.select(range(100))
-
-         # Track metrics
-         correct = 0
-         total = 0
-
-         # Process each example
-         for i, example in enumerate(dataset):
-             # Update progress based on completion percentage
-             with self.progress_lock:
-                 self.progress = 40 + int((i / len(dataset)) * 50)
-
-             text = example.get("text", example.get("sentence", ""))
-             label = str(example.get("label", example.get("class", "")))
-
-             if not text or not label:
-                 continue
-
-             # Get model prediction
-             result = classifier(text)
-             predicted_label = result[0]["label"]
-
-             # Check if correct
-             if str(predicted_label) == label:
-                 correct += 1
-
-             total += 1
-
-         # Calculate metrics
-         accuracy = correct / total if total > 0 else 0
-
-         return {
-             "accuracy": accuracy,
-             "samples_evaluated": total
-         }
-
-     def _evaluate_code_generation(self, model, tokenizer, dataset):
-         """Evaluate a model on code generation tasks.
-
-         Args:
-             model: HuggingFace model
-             tokenizer: HuggingFace tokenizer
-             dataset: HuggingFace dataset
-
-         Returns:
-             dict: Evaluation results
-         """
-         # Set up generation pipeline
-         generator = pipeline(
-             "text-generation",
-             model=model,
-             tokenizer=tokenizer,
-             device="cpu"
-         )
-
-         # Sample a subset for evaluation
-         if len(dataset) > 50: # Smaller sample for code tasks
-             dataset = dataset.select(range(50))
-
-         # Track metrics
-         exact_matches = 0
-         functional_matches = 0
-         total = 0
-
-         # Process each example
-         for i, example in enumerate(dataset):
-             # Update progress based on completion percentage
-             with self.progress_lock:
-                 self.progress = 40 + int((i / len(dataset)) * 50)
-
-             prompt = example.get("prompt", example.get("input", ""))
-             solution = example.get("solution", example.get("output", ""))
-
-             if not prompt or not solution:
-                 continue
-
-             # Generate code
-             generated = generator(
-                 prompt,
-                 max_length=200,
-                 num_return_sequences=1
-             )
-
-             generated_code = generated[0]["generated_text"]
-
-             # Extract code from generated text (remove prompt)
-             if prompt in generated_code:
-                 generated_code = generated_code[len(prompt):].strip()
-
-             # Check exact match
-             if generated_code.strip() == solution.strip():
-                 exact_matches += 1
-                 functional_matches += 1
-             else:
-                 # We would ideally check functional correctness here
-                 # but that requires executing code which is complex and potentially unsafe
-                 # For now, we'll use a simple heuristic
-                 if len(generated_code) > 0 and any(keyword in generated_code for keyword in ["def ", "function", "return", "class"]):
-                     functional_matches += 0.5 # Partial credit
-
-             total += 1
-
-         # Calculate metrics
-         exact_match_rate = exact_matches / total if total > 0 else 0
-         functional_correctness = functional_matches / total if total > 0 else 0
-
-         return {
-             "exact_match": exact_match_rate,
-             "functional_correctness": functional_correctness,
-             "samples_evaluated": total
-         }
-
-     def _evaluate_general(self, model, tokenizer, dataset):
-         """General evaluation for any dataset type.
-
-         Args:
-             model: HuggingFace model
-             tokenizer: HuggingFace tokenizer
-             dataset: HuggingFace dataset
-
-         Returns:
-             dict: Evaluation results
-         """
-         # Set up generation pipeline
-         generator = pipeline(
-             "text-generation",
-             model=model,
-             tokenizer=tokenizer,
-             device="cpu"
-         )
-
-         # Sample a subset for evaluation
-         if len(dataset) > 50:
-             dataset = dataset.select(range(50))
-
-         # Find input and output fields
-         features = dataset.features
-         input_field = None
-         output_field = None
-
-         for field in features:
-             if field.lower() in ["input", "prompt", "question", "text"]:
-                 input_field = field
-             elif field.lower() in ["output", "target", "answer", "response"]:
-                 output_field = field
-
-         if not input_field:
-             # Just use the first string field as input
-             for field in features:
-                 if isinstance(features[field], (str, list)):
-                     input_field = field
-                     break
-
-         # Track metrics
-         total = 0
-         generated_texts = []
-
-         # Process each example
-         for i, example in enumerate(dataset):
-             # Update progress based on completion percentage
-             with self.progress_lock:
-                 self.progress = 40 + int((i / len(dataset)) * 50)
-
-             if input_field and input_field in example:
-                 input_text = str(example[input_field])
-
-                 # Generate text
-                 generated = generator(
-                     input_text,
-                     max_length=100,
-                     num_return_sequences=1
-                 )
-
-                 generated_text = generated[0]["generated_text"]
-                 generated_texts.append({
-                     "input": input_text,
-                     "output": generated_text,
-                     "expected": str(example[output_field]) if output_field and output_field in example else "N/A"
-                 })
-
-                 total += 1
-
-         return {
-             "samples_evaluated": total,
-             "generated_samples": generated_texts[:5] # Include a few samples
-         }
-
-     def _calculate_f1(self, answer, prediction):
-         """Calculate F1 score between answer and prediction.
-
-         Args:
-             answer: Ground truth answer
-             prediction: Model prediction
-
-         Returns:
-             float: F1 score
-         """
-         # Tokenize
-         answer_tokens = answer.lower().split()
-         prediction_tokens = prediction.lower().split()
-
-         # Calculate precision and recall
-         common_tokens = set(answer_tokens) & set(prediction_tokens)
-
-         if not common_tokens:
-             return 0.0
-
-         precision = len(common_tokens) / len(prediction_tokens)
-         recall = len(common_tokens) / len(answer_tokens)
-
-         # Calculate F1
-         if precision + recall == 0:
-             return 0.0
-
-         f1 = 2 * precision * recall / (precision + recall)
-         return f1
-
-     def _calculate_overall_score(self, results):
-         """Calculate an overall score from evaluation results.
-
-         Args:
-             results: Evaluation results dictionary
-
-         Returns:
-             float: Overall score between 0 and 100
-         """
-         score = 0.0
-
-         # Check for common metrics and weight them
-         if "accuracy" in results:
-             score += results["accuracy"] * 100
-
-         if "exact_match" in results:
-             score += results["exact_match"] * 100
-
-         if "f1" in results:
-             score += results["f1"] * 100
-
-         if "functional_correctness" in results:
-             score += results["functional_correctness"] * 100
-
-         # If multiple metrics were found, average them
-         num_metrics = sum(1 for metric in ["accuracy", "exact_match", "f1", "functional_correctness"] if metric in results)
-
-         if num_metrics > 0:
-             score /= num_metrics
-         else:
-             # Default score if no metrics available
-             score = 50.0
-
-         return score
-
-     def submit_evaluation(self, model_id, benchmark_id, user_id, priority=0):
-         """Submit a model for evaluation on a benchmark.
-
-         Args:
-             model_id: Model ID in the database
-             benchmark_id: Benchmark ID in the database
-             user_id: User ID submitting the evaluation
-             priority: Queue priority (higher = higher priority)
-
-         Returns:
-             int: Evaluation ID if successful, None otherwise
-         """
-         # Check if user can submit today
-         if not self.auth_manager.can_submit_benchmark(user_id):
-             return None, "Daily submission limit reached. Try again tomorrow."
-
-         try:
-             # Add evaluation to database and queue
-             evaluation_id = self.db_manager.add_evaluation(
-                 model_id=model_id,
-                 benchmark_id=benchmark_id,
-                 priority=priority
-             )
-
-             # Update user's last submission date
-             self.auth_manager.update_submission_date(user_id)
-
-             # Make sure worker is running
-             self.start_worker()
-
-             return evaluation_id, "Evaluation submitted successfully."
-         except Exception as e:
-             print(f"Submit evaluation error: {e}")
-             return None, f"Failed to submit evaluation: {str(e)}"
-
-     def get_queue_status(self):
-         """Get the current status of the evaluation queue.
-
-         Returns:
-             dict: Queue status information
-         """
-         try:
-             # Get evaluations from database
-             pending_evals = self.db_manager.get_evaluation_results(status="pending")
-             running_evals = self.db_manager.get_evaluation_results(status="running")
-             completed_evals = self.db_manager.get_evaluation_results(status="completed")
-             failed_evals = self.db_manager.get_evaluation_results(status="failed")
-
-             # Get current evaluation progress
-             current_eval, progress = self.get_current_progress()
-
-             return {
-                 "pending": len(pending_evals),
-                 "running": len(running_evals),
-                 "completed": len(completed_evals),
-                 "failed": len(failed_evals),
-                 "is_processing": self.is_processing,
-                 "current_evaluation": current_eval,
-                 "progress": progress
-             }
-         except Exception as e:
-             print(f"Queue status error: {e}")
-             return {
-                 "pending": 0,
-                 "running": 0,
-                 "completed": 0,
-                 "failed": 0,
-                 "is_processing": self.is_processing,
-                 "current_evaluation": None,
-                 "progress": 0,
-                 "error": str(e)
-             }
-
- # Model submission UI components
- def create_model_submission_ui(evaluation_queue, auth_manager, db_manager):
-     """Create the model submission UI components.
-
-     Args:
-         evaluation_queue: Evaluation queue instance
-         auth_manager: Authentication manager instance
-         db_manager: Database manager instance
-
-     Returns:
-         gr.Blocks: Gradio Blocks component with model submission UI
-     """
-     with gr.Blocks() as submission_ui:
-         with gr.Tab("Submit Model"):
-             with gr.Row():
-                 with gr.Column(scale=2):
-                     model_id_input = gr.Textbox(
-                         placeholder="HuggingFace model ID (e.g., 'gpt2', 'facebook/opt-350m')",
-                         label="Model ID"
-                     )
-
-                     model_name_input = gr.Textbox(
-                         placeholder="Display name for your model",
-                         label="Model Name"
-                     )
-
-                     model_description_input = gr.Textbox(
-                         placeholder="Brief description of your model",
-                         label="Description",
-                         lines=3
-                     )
-
-                     model_parameters_input = gr.Number(
-                         label="Number of Parameters (billions)",
-                         precision=2
-                     )
-
-                 with gr.Column(scale=1):
-                     model_tag_input = gr.Dropdown(
-                         choices=evaluation_queue.model_tags,
-                         label="Model Tag",
-                         info="Select one category that best describes your model"
-                     )
-
-                     benchmark_dropdown = gr.Dropdown(
-                         label="Benchmark",
-                         info="Select a benchmark to evaluate your model on"
-                     )
-
-                     refresh_benchmarks_button = gr.Button("Refresh Benchmarks")
-
-             submit_model_button = gr.Button("Submit for Evaluation")
-             submission_status = gr.Markdown("")
-
-         with gr.Tab("Evaluation Queue"):
-             refresh_queue_button = gr.Button("Refresh Queue")
-
-             with gr.Row():
-                 with gr.Column(scale=1):
-                     queue_stats = gr.JSON(
-                         label="Queue Statistics"
-                     )
-
-                 with gr.Column(scale=2):
-                     queue_status = gr.Dataframe(
-                         headers=["ID", "Model", "Benchmark", "Status", "Submitted"],
-                         label="Recent Evaluations"
-                     )
-
-             with gr.Row(visible=True) as progress_container:
-                 with gr.Column():
-                     current_eval_info = gr.Markdown("No evaluation currently running")
-                     # Use a simple text display for progress instead of Progress component
-                     progress_display = gr.Markdown("Progress: 0%")
-
-         # Function to update progress display
-         def update_progress_display():
-             current_eval, progress = evaluation_queue.get_current_progress()
-
-             if current_eval:
-                 model_info = db_manager.get_model(current_eval['model_id'])
-                 benchmark_info = db_manager.get_benchmark(current_eval['benchmark_id'])
-
-                 if model_info and benchmark_info:
-                     eval_info = f"**Currently Evaluating:** {model_info['name']} on {benchmark_info['name']}"
-                     progress_text = f"Progress: {progress}%"
-                     return eval_info, progress_text
-
-             return "No evaluation currently running", "Progress: 0%"
-
-         # Event handlers
-         def refresh_benchmarks_handler():
-             benchmarks = db_manager.get_benchmarks()
-
-             # Format for dropdown
-             choices = [(b["id"], b["name"]) for b in benchmarks]
-
-             return gr.update(choices=choices)
-
-         def submit_model_handler(model_id, model_name, model_description, model_parameters, model_tag, benchmark_id, request: gr.Request):
-             # Check if user is logged in
-             user = auth_manager.check_login(request)
-
-             if not user:
-                 return "Please log in to submit a model."
-
-             if not model_id or not model_name or not model_tag or not benchmark_id:
-                 return "Please fill in all required fields."
-
-             try:
-                 # Add model to database
-                 model_db_id = db_manager.add_model(
-                     name=model_name,
-                     hf_model_id=model_id,
-                     user_id=user["id"],
-                     tag=model_tag,
-                     parameters=str(model_parameters) if model_parameters else None,
-                     description=model_description
-                 )
-
-                 if not model_db_id:
-                     return "Failed to add model to database."
-
-                 # Submit for evaluation
-                 eval_id, message = evaluation_queue.submit_evaluation(
-                     model_id=model_db_id,
-                     benchmark_id=benchmark_id,
-                     user_id=user["id"]
-                 )
-
-                 if eval_id:
-                     return f"Model submitted successfully. Evaluation ID: {eval_id}"
-                 else:
-                     return message
-             except Exception as e:
-                 return f"Error submitting model: {str(e)}"
-
-         def refresh_queue_handler():
-             # Get queue statistics
-             stats = evaluation_queue.get_queue_status()
-
-             # Get recent evaluations
-             evals = db_manager.get_evaluation_results(limit=20)
-
-             # Format for dataframe
-             eval_data = []
-             for eval in evals:
-                 eval_data.append([
-                     eval["id"],
-                     eval["model_name"],
-                     eval["benchmark_name"],
-                     eval["status"],
-                     eval["submitted_at"]
-                 ])
-
-             # Also update progress display
-             current_eval, progress = evaluation_queue.get_current_progress()
-             if current_eval:
-                 model_info = db_manager.get_model(current_eval['model_id'])
-                 benchmark_info = db_manager.get_benchmark(current_eval['benchmark_id'])
-
-                 if model_info and benchmark_info:
-                     eval_info = f"**Currently Evaluating:** {model_info['name']} on {benchmark_info['name']}"
-                     progress_text = f"Progress: {progress}%"
-                     return stats, eval_data, eval_info, progress_text
-
-             return stats, eval_data, "No evaluation currently running", "Progress: 0%"
-
-         # Connect event handlers
-         refresh_benchmarks_button.click(
-             fn=refresh_benchmarks_handler,
-             inputs=[],
-             outputs=[benchmark_dropdown]
-         )
-
-         submit_model_button.click(
-             fn=submit_model_handler,
-             inputs=[
-                 model_id_input,
-                 model_name_input,
-                 model_description_input,
-                 model_parameters_input,
-                 model_tag_input,
-                 benchmark_dropdown
-             ],
-             outputs=[submission_status]
-         )
-
-         refresh_queue_button.click(
-             fn=refresh_queue_handler,
-             inputs=[],
-             outputs=[queue_stats, queue_status, current_eval_info, progress_display]
-         )
-
-         # Initialize on load
-         submission_ui.load(
-             fn=refresh_benchmarks_handler,
-             inputs=[],
-             outputs=[benchmark_dropdown]
-         )
-
-         submission_ui.load(
-             fn=refresh_queue_handler,
-             inputs=[],
-             outputs=[queue_stats, queue_status, current_eval_info, progress_display]
-         )
-
-         # Set up auto-refresh for queue status
-         refresh_interval = 5 # seconds
-
-         # Create JavaScript for auto-refresh
-         js = f"""
-         function setupAutoRefresh() {{
-             setInterval(function() {{
-                 document.getElementById("{refresh_queue_button.elem_id}").click();
-             }}, {refresh_interval * 1000});
-         }}
-
-         if (window.setup_done) {{
-             // Do nothing if already set up
-         }} else {{
-             setupAutoRefresh();
-             window.setup_done = true;
-         }}
-         """
-
-         # Add JavaScript to page
-         submission_ui.load(None, None, None, _js=js)
-
-     return submission_ui
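
For reference, a minimal sketch of how this deleted module was typically wired into the app: an EvaluationQueue built from the project's database and auth managers, a background worker started, and the Gradio UI returned by create_model_submission_ui launched. The DatabaseManager/AuthManager names, their modules, and their constructors are assumptions standing in for whatever manager classes the rest of the Space provides (they only need to expose the db_path attribute and the methods called above, such as add_model, add_evaluation, get_benchmarks, and can_submit_benchmark).

# Wiring sketch, not the Space's actual entry point; manager classes are assumed stand-ins.
from database import DatabaseManager      # assumption: project-local database manager
from auth import AuthManager              # assumption: project-local auth manager
from evaluation_queue import EvaluationQueue, create_model_submission_ui

db_manager = DatabaseManager("dynamic_highscores.db")   # assumed constructor signature
auth_manager = AuthManager(db_manager)                   # assumed constructor signature

# Queue polls the SQLite-backed evaluations table on a daemon thread (CPU-only).
eval_queue = EvaluationQueue(db_manager, auth_manager)
eval_queue.start_worker()

# Build and serve the submission/queue UI defined in this module.
demo = create_model_submission_ui(eval_queue, auth_manager, db_manager)
demo.launch()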