Update evaluation_queue.py
evaluation_queue.py (+39, -87)
```diff
@@ -9,7 +9,7 @@ import os
 import json
 import time
 import threading
-import queue
+import queue as queue_module
 from datetime import datetime, timedelta
 import gradio as gr
 from huggingface_hub import HfApi, hf_hub_download, snapshot_download
```
```diff
@@ -31,14 +31,13 @@ class EvaluationQueue:
         self.db_manager = db_manager
         self.auth_manager = auth_manager
         self.hf_api = HfApi()
-        self.queue = queue.Queue()
+        self.queue = queue_module.Queue()
         self.is_processing = False
         self.worker_thread = None
         self.model_tags = ["Merge", "Agent", "Reasoning", "Coding", "General", "Specialized", "Instruction", "Chat"]
         self.current_evaluation = None
         self.progress = 0
         self.progress_lock = threading.Lock()
-        self.db_path = db_manager.db_path # Store the path to create new connections in worker thread
 
     def start_worker(self):
         """Start the worker thread for processing the evaluation queue."""
```
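The stdlib `queue` module is now imported under the alias `queue_module`, presumably to keep the bare name `queue` free for queue-related variables and the `queue` table referenced by the old SQL below. The object stored on `self.queue` is still the ordinary thread-safe FIFO; a minimal sketch of the aliased API, with a hypothetical payload:

```python
import queue as queue_module

# Same thread-safe FIFO as before, just referenced through the alias.
q = queue_module.Queue()
q.put({"evaluation_id": 42})   # producer side (hypothetical payload)
item = q.get()                 # blocks until an item is available
print(item)
q.task_done()
```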
```diff
@@ -56,36 +55,17 @@ class EvaluationQueue:
 
     def _process_queue(self):
         """Process the evaluation queue in a separate thread."""
-        # Create a new database connection for this thread
-        thread_db = sqlite3.connect(self.db_path)
-        thread_db.row_factory = sqlite3.Row
-
         while self.is_processing:
             try:
-                # Get the next evaluation from the database
-                cursor = thread_db.cursor()
-                cursor.execute("""
-                    SELECT e.id as evaluation_id, e.model_id, e.benchmark_id, m.hf_model_id, b.dataset_id
-                    FROM queue q
-                    JOIN evaluations e ON q.evaluation_id = e.id
-                    JOIN models m ON e.model_id = m.id
-                    JOIN benchmarks b ON e.benchmark_id = b.id
-                    WHERE e.status = 'pending'
-                    ORDER BY q.priority DESC, q.added_at ASC
-                    LIMIT 1
-                """)
-                row = cursor.fetchone()
+                # Get the next evaluation from the database
+                pending_evals = self.db_manager.get_evaluation_results(status="pending")
 
-                if row:
-                    next_eval = dict(row)
+                if pending_evals:
+                    # Sort by priority and added_at
+                    next_eval = pending_evals[0]
 
                     # Update status to running
-                    cursor.execute("""
-                        UPDATE evaluations
-                        SET status = 'running', started_at = datetime('now')
-                        WHERE id = ?
-                    """, (next_eval['evaluation_id'],))
-                    thread_db.commit()
+                    self.db_manager.update_evaluation_status(next_eval['id'], 'running')
 
                     # Set current evaluation and reset progress
                     with self.progress_lock:
```
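The deleted `thread_db` setup existed because a `sqlite3` connection may, by default, only be used on the thread that created it, so the worker had to open its own connection from `self.db_path`. With every query now routed through `db_manager`, that plumbing (and the matching `thread_db.close()` removed further down) goes away. A short, self-contained illustration of the constraint the old code was working around; the database filename is hypothetical:

```python
import sqlite3
import threading

conn = sqlite3.connect("leaderboard.db")  # hypothetical path, opened in the main thread

def worker():
    try:
        conn.execute("SELECT 1")  # default check_same_thread=True forbids cross-thread use
    except sqlite3.ProgrammingError as err:
        print(f"Worker thread rejected: {err}")

t = threading.Thread(target=worker)
t.start()
t.join()
```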
```diff
@@ -93,34 +73,33 @@ class EvaluationQueue:
                         self.progress = 0
 
                     try:
-                        # Run the evaluation
-                        results = self._run_evaluation(
-                            next_eval['hf_model_id'],
-                            next_eval['dataset_id']
-                        )
-
-                        # Calculate overall score
-                        score = self._calculate_overall_score(results)
+                        # Get model and benchmark details
+                        model_info = self.db_manager.get_model(next_eval['model_id'])
+                        benchmark_info = self.db_manager.get_benchmark(next_eval['benchmark_id'])
 
-                        # [ten removed lines: the old direct-SQL update that stored the completed status, results, and score; not recoverable from this view]
+                        if model_info and benchmark_info:
+                            # Run the evaluation
+                            results = self._run_evaluation(
+                                model_info['hf_model_id'],
+                                benchmark_info['dataset_id']
+                            )
+
+                            # Calculate overall score
+                            score = self._calculate_overall_score(results)
+
+                            # Update status to completed with results
+                            self.db_manager.update_evaluation_status(
+                                next_eval['id'],
+                                'completed',
+                                results=results,
+                                score=score
+                            )
+                        else:
+                            raise Exception("Model or benchmark not found")
                     except Exception as e:
                         print(f"Evaluation error: {e}")
                         # Update status to failed
-                        cursor.execute("""
-                            UPDATE evaluations
-                            SET status = 'failed', completed_at = datetime('now')
-                            WHERE id = ?
-                        """, (next_eval['evaluation_id'],))
-                        thread_db.commit()
+                        self.db_manager.update_evaluation_status(next_eval['id'], 'failed')
 
                         # Clear current evaluation
                         with self.progress_lock:
```
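All of the worker's persistence now goes through a handful of `db_manager` calls whose signatures can be read off the call sites above: `get_evaluation_results(status=...)`, `get_model(...)`, `get_benchmark(...)`, and `update_evaluation_status(id, status, results=None, score=None)`. The new loop also takes `pending_evals[0]` directly, so it implicitly assumes the manager already returns pending rows ordered by priority and submission time. The manager itself is not part of this diff; the sketch below is hypothetical, with only the method names and keyword arguments taken from the call sites, while the table and column names and the bodies are illustrative:

```python
import json
import sqlite3

class DatabaseManager:
    """Hypothetical sketch of the surface the worker relies on."""

    def __init__(self, db_path):
        self.db_path = db_path

    def _connect(self):
        # A fresh connection per call keeps the manager safe to use from the worker thread.
        conn = sqlite3.connect(self.db_path)
        conn.row_factory = sqlite3.Row
        return conn

    def get_evaluation_results(self, status=None, limit=None):
        # Assumed to return rows already ordered by priority, then submission time.
        query = "SELECT * FROM evaluations"
        params = []
        if status is not None:
            query += " WHERE status = ?"
            params.append(status)
        query += " ORDER BY priority DESC, submitted_at ASC"
        if limit is not None:
            query += " LIMIT ?"
            params.append(limit)
        conn = self._connect()
        try:
            return [dict(row) for row in conn.execute(query, params)]
        finally:
            conn.close()

    def update_evaluation_status(self, evaluation_id, status, results=None, score=None):
        conn = self._connect()
        try:
            conn.execute(
                "UPDATE evaluations SET status = ?, results = ?, score = ? WHERE id = ?",
                (status, json.dumps(results) if results is not None else None, score, evaluation_id),
            )
            conn.commit()
        finally:
            conn.close()
```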
```diff
@@ -132,9 +111,6 @@ class EvaluationQueue:
             except Exception as e:
                 print(f"Queue processing error: {e}")
                 time.sleep(5)
-
-        # Close the thread-local database connection
-        thread_db.close()
 
     def _run_evaluation(self, model_id, dataset_id):
         """Run an evaluation for a model on a benchmark.
```
```diff
@@ -211,7 +187,8 @@ class EvaluationQueue:
         # Clean up to free memory
         del model
         del tokenizer
-        torch.cuda.empty_cache()
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
 
         # Update progress
         with self.progress_lock:
```
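Guarding the cache flush behind `torch.cuda.is_available()` keeps the cleanup path from touching the CUDA runtime on CPU-only hosts. A self-contained sketch of the pattern; the model and tokenizer here are stand-ins, and the `gc.collect()` call is an extra illustration rather than part of the patch:

```python
import gc
import torch

model = torch.nn.Linear(8, 8)   # stand-in for the evaluated model
tokenizer = object()            # stand-in for the tokenizer

# Clean up to free memory (mirrors the patched block)
del model
del tokenizer
gc.collect()                    # illustrative extra step: reclaim host memory promptly

if torch.cuda.is_available():   # only touch the CUDA allocator when a GPU is present
    torch.cuda.empty_cache()    # return cached blocks to the driver
```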
```diff
@@ -418,7 +395,8 @@ class EvaluationQueue:
                 # Update progress based on completion percentage
                 with self.progress_lock:
                     self.progress = 40 + int((i / len(dataset)) * 50)
-
+
+
                 text = example.get("text", example.get("sentence", ""))
                 label = str(example.get("label", example.get("class", "")))
 
```
```diff
@@ -669,7 +647,7 @@ class EvaluationQueue:
             priority: Queue priority (higher = higher priority)
 
         Returns:
-
+            tuple: (evaluation_id, message)
         """
         # Check if user can submit today
         if not self.auth_manager.can_submit_benchmark(user_id):
```
```diff
@@ -806,28 +784,13 @@ def create_model_submission_ui(evaluation_queue, auth_manager, db_manager):
         current_eval_info = gr.Markdown("No evaluation currently running")
         # Use a simple text display for progress instead of Progress component
         progress_display = gr.Markdown("Progress: 0%")
-
-        # Function to update progress display
-        def update_progress_display():
-            current_eval, progress = evaluation_queue.get_current_progress()
-
-            if current_eval:
-                model_info = db_manager.get_model(current_eval['model_id'])
-                benchmark_info = db_manager.get_benchmark(current_eval['benchmark_id'])
-
-                if model_info and benchmark_info:
-                    eval_info = f"**Currently Evaluating:** {model_info['name']} on {benchmark_info['name']}"
-                    progress_text = f"Progress: {progress}%"
-                    return eval_info, progress_text
-
-            return "No evaluation currently running", "Progress: 0%"
 
         # Event handlers
         def refresh_benchmarks_handler():
             benchmarks = db_manager.get_benchmarks()
 
             # Format for dropdown
-            choices = [(b["id"], b["name"]) for b in benchmarks]
+            choices = [(str(b["id"]), b["name"]) for b in benchmarks]
 
             return gr.update(choices=choices)
 
```
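The only functional change in the handler is casting the benchmark id to a string before building the dropdown choices. In recent Gradio releases a choice tuple is interpreted as (displayed name, value), so as written the dropdown displays the id and passes the name to whatever consumes the selection; whether that is intended depends on the submit handler, which is outside this hunk. A hypothetical, self-contained version of the refresh pattern, keeping the existing `gr.update(...)` style and using stand-in benchmark rows:

```python
import gradio as gr

# Stand-in for db_manager.get_benchmarks(); field names follow the handler above.
benchmarks = [
    {"id": 1, "name": "Example benchmark A"},
    {"id": 2, "name": "Example benchmark B"},
]

def refresh_benchmarks():
    # Cast the id to str so the component sees consistent value types across refreshes.
    choices = [(str(b["id"]), b["name"]) for b in benchmarks]
    return gr.update(choices=choices)

with gr.Blocks() as demo:
    dropdown = gr.Dropdown(label="Benchmark", choices=[])
    refresh = gr.Button("Refresh Benchmarks")
    refresh.click(fn=refresh_benchmarks, inputs=[], outputs=[dropdown])
```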
```diff
@@ -873,7 +836,7 @@ def create_model_submission_ui(evaluation_queue, auth_manager, db_manager):
         # Get queue statistics
         stats = evaluation_queue.get_queue_status()
 
-        # Get recent evaluations
+        # Get recent evaluations (all statuses, limited to 20)
        evals = db_manager.get_evaluation_results(limit=20)
 
         # Format for dataframe
```
```diff
@@ -932,16 +895,5 @@ def create_model_submission_ui(evaluation_queue, auth_manager, db_manager):
             inputs=[],
             outputs=[benchmark_dropdown]
         )
-
-        submission_ui.load(
-            fn=refresh_queue_handler,
-            inputs=[],
-            outputs=[queue_stats, queue_status, current_eval_info, progress_display]
-        )
-
-        # Manual refresh button with instructions
-        gr.Markdown("""
-        **Note:** Click the 'Refresh Queue' button periodically to update the progress display.
-        """)
 
-    return submission_ui
+    return submission_ui
```