Update evaluation_queue.py

evaluation_queue.py  (CHANGED)  +161 -33
@@ -12,7 +12,7 @@ import threading
 import queue as queue_module
 from datetime import datetime, timedelta
 import gradio as gr
-from huggingface_hub import HfApi, hf_hub_download, snapshot_download
+from huggingface_hub import HfApi, hf_hub_download, snapshot_download, model_info
 from datasets import load_dataset
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
@@ -38,6 +38,8 @@ class EvaluationQueue:
         self.current_evaluation = None
         self.progress = 0
         self.progress_lock = threading.Lock()
+        # Memory limit for models in GB (leave 2GB for system)
+        self.memory_limit_gb = 14.0

     def start_worker(self):
         """Start the worker thread for processing the evaluation queue."""
@@ -53,6 +55,49 @@ class EvaluationQueue:
         if self.worker_thread and self.worker_thread.is_alive():
             self.worker_thread.join(timeout=1.0)

+    def check_model_size(self, model_id):
+        """Check if a model will fit within RAM limitations.
+
+        Args:
+            model_id: HuggingFace model ID
+
+        Returns:
+            tuple: (will_fit, message)
+        """
+        try:
+            # Query model info from the HuggingFace API
+            model_info_obj = self.hf_api.model_info(model_id)
+
+            # Check if model size information is available
+            if hasattr(model_info_obj, 'safetensors') and model_info_obj.safetensors:
+                # Calculate size in GB (divided by 1024^3)
+                total_size_gb = sum(
+                    file.size for file in model_info_obj.safetensors.values()
+                ) / (1024 * 1024 * 1024)
+            elif hasattr(model_info_obj, 'siblings'):
+                # Legacy method - calculate from file siblings
+                total_size_gb = sum(
+                    sibling.size for sibling in model_info_obj.siblings
+                    if sibling.rfilename.endswith(('.bin', '.safetensors', '.pt'))
+                ) / (1024 * 1024 * 1024)
+            else:
+                # Can't determine size
+                return False, "Unable to determine model size. Please ensure model is under 14GB."
+
+            # Account for memory overhead (tokenizer, processing, etc.)
+            estimated_ram_needed = total_size_gb * 1.3  # 30% overhead
+
+            # Check against limit
+            if estimated_ram_needed > self.memory_limit_gb:
+                return False, f"Model is too large (approximately {total_size_gb:.1f}GB, needs {estimated_ram_needed:.1f}GB RAM). Maximum allowed is {self.memory_limit_gb}GB."
+
+            return True, f"Model size check passed ({total_size_gb:.1f}GB, estimated {estimated_ram_needed:.1f}GB RAM usage)"
+
+        except Exception as e:
+            print(f"Model size check error: {e}")
+            # If we can't check, be cautious
+            return False, f"Error checking model size: {str(e)}. Please ensure your model is under {self.memory_limit_gb}GB."
+
     def _process_queue(self):
         """Process the evaluation queue in a separate thread."""
         while self.is_processing:
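Note (not part of the diff): in current huggingface_hub releases, ModelInfo.safetensors is a SafeTensorsInfo object that carries per-dtype parameter counts rather than a mapping of files with a .size attribute, and sibling file sizes are only filled in when model_info() is called with files_metadata=True, so both size branches above can fall through to the exception handler at runtime. The diff also adds a top-level model_info import that this method does not use, since it goes through self.hf_api. Below is a minimal sketch of the same size estimate under those assumptions; estimate_model_size_gb and BYTES_PER_PARAM are hypothetical names introduced only for illustration.

# Sketch only, not part of the commit: estimating model weight size from the Hub API.
from huggingface_hub import HfApi

BYTES_PER_PARAM = {"F64": 8, "F32": 4, "F16": 2, "BF16": 2, "I8": 1, "U8": 1}

def estimate_model_size_gb(model_id: str) -> float:
    # files_metadata=True asks the Hub to include per-file sizes for siblings.
    info = HfApi().model_info(model_id, files_metadata=True)
    if info.safetensors and info.safetensors.parameters:
        # SafeTensorsInfo.parameters maps dtype name -> parameter count.
        total_bytes = sum(
            count * BYTES_PER_PARAM.get(dtype, 4)
            for dtype, count in info.safetensors.parameters.items()
        )
    else:
        # Fall back to summing the reported sizes of weight files.
        total_bytes = sum(
            (sibling.size or 0)
            for sibling in info.siblings
            if sibling.rfilename.endswith((".bin", ".safetensors", ".pt"))
        )
    return total_bytes / (1024 ** 3)

For example, a ~124M-parameter fp32 model such as gpt2 comes out at roughly 0.5 GB before the 1.3x overhead factor used above.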
@@ -78,6 +123,12 @@ class EvaluationQueue:
                     benchmark_info = self.db_manager.get_benchmark(next_eval['benchmark_id'])

                     if model_info and benchmark_info:
+                        # Check if model will fit in memory
+                        will_fit, message = self.check_model_size(model_info['hf_model_id'])
+
+                        if not will_fit:
+                            raise Exception(f"Model too large for evaluation: {message}")
+
                         # Run the evaluation
                         results = self._run_evaluation(
                             model_info['hf_model_id'],
@@ -98,8 +149,13 @@ class EvaluationQueue:
                     raise Exception("Model or benchmark not found")
             except Exception as e:
                 print(f"Evaluation error: {e}")
-                # Update status to failed
+                # Update status to failed with error message
+                error_results = {"error": str(e)}
+                self.db_manager.update_evaluation_status(
+                    next_eval['id'],
+                    'failed',
+                    results=error_results
+                )

                 # Clear current evaluation
                 with self.progress_lock:
@@ -137,24 +193,34 @@ class EvaluationQueue:
             self.progress = 10  # Loading dataset

         # Load the dataset
+        try:
+            if config:
+                dataset = load_dataset(dataset_id, config, split="test")
+            else:
+                dataset = load_dataset(dataset_id, split="test")
+        except Exception as e:
+            return {"error": f"Failed to load dataset: {str(e)}"}

         # Update progress
         with self.progress_lock:
             self.progress = 20  # Loading model

+        try:
+            # Load the model with memory optimization settings
+            device = "cpu"
+            model = AutoModelForCausalLM.from_pretrained(
+                model_id,
+                device_map=device,
+                torch_dtype=torch.float32,  # Use float32 for CPU
+                low_cpu_mem_usage=True,  # Enable memory optimization
+                offload_folder="offload",  # Enable offloading if needed
+                offload_state_dict=True,  # Offload state dict for memory saving
+                max_memory={0: f"{self.memory_limit_gb}GB"}  # Limit memory usage
+            )
+            tokenizer = AutoTokenizer.from_pretrained(model_id)
+        except Exception as e:
+            print(f"Model loading error: {e}")
+            return {"error": f"Failed to load model: {str(e)}"}

         # Update progress
         with self.progress_lock:
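Note (not part of the diff): in the accelerate-style max_memory dict, integer keys refer to GPU indices and the CPU budget is keyed "cpu", and max_memory only takes effect when a device map is being inferred (e.g. device_map="auto"), so max_memory={0: ...} combined with device_map="cpu" likely does not cap RAM here. A minimal sketch of CPU-only loading under a RAM budget follows, under those assumptions and for a host with no GPU; load_for_cpu_eval is a hypothetical helper name.

# Sketch only, not part of the commit: CPU-only loading under a RAM budget.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

def load_for_cpu_eval(model_id: str, memory_limit_gb: float = 14.0):
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        device_map="auto",                            # let accelerate place modules
        max_memory={"cpu": f"{memory_limit_gb}GiB"},  # RAM budget keyed by "cpu", not a GPU index
        offload_folder="offload",                     # anything over the budget spills to disk
        torch_dtype=torch.float32,                    # full precision on CPU
        low_cpu_mem_usage=True,                       # avoid a second in-memory copy while loading
    )
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    return model, tokenizer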
@@ -167,18 +233,22 @@ class EvaluationQueue:
         with self.progress_lock:
             self.progress = 40  # Starting evaluation

+        try:
+            # Run appropriate evaluation based on task type
+            if task_type == "text-generation":
+                results = self._evaluate_text_generation(model, tokenizer, dataset)
+            elif task_type == "question-answering":
+                results = self._evaluate_question_answering(model, tokenizer, dataset)
+            elif task_type == "classification":
+                results = self._evaluate_classification(model, tokenizer, dataset)
+            elif task_type == "code-generation":
+                results = self._evaluate_code_generation(model, tokenizer, dataset)
+            else:
+                # Default to general evaluation
+                results = self._evaluate_general(model, tokenizer, dataset)
+        except Exception as e:
+            print(f"Evaluation task error: {e}")
+            return {"error": f"Evaluation failed: {str(e)}"}

         # Update progress
         with self.progress_lock:
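Note (not part of the diff): the task-type branch above could equally be written as a lookup table; a short sketch reusing the evaluator method names from the hunk, meant to slot into the same spot in the method.

# Sketch only, not part of the commit: table-driven dispatch over task_type.
evaluators = {
    "text-generation": self._evaluate_text_generation,
    "question-answering": self._evaluate_question_answering,
    "classification": self._evaluate_classification,
    "code-generation": self._evaluate_code_generation,
}
# Unknown task types fall back to the general evaluator.
evaluate = evaluators.get(task_type, self._evaluate_general)
results = evaluate(model, tokenizer, dataset)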
@@ -395,8 +465,7 @@ class EvaluationQueue:
             # Update progress based on completion percentage
             with self.progress_lock:
                 self.progress = 40 + int((i / len(dataset)) * 50)
+
             text = example.get("text", example.get("sentence", ""))
             label = str(example.get("label", example.get("class", "")))

@@ -611,6 +680,10 @@ class EvaluationQueue:
         Returns:
             float: Overall score between 0 and 100
         """
+        # If there was an error, return a low score
+        if "error" in results:
+            return 0.0
+
         score = 0.0

         # Check for common metrics and weight them
@@ -654,6 +727,17 @@ class EvaluationQueue:
             return None, "Daily submission limit reached. Try again tomorrow."

         try:
+            # Get model HuggingFace ID to check size
+            model_info = self.db_manager.get_model(model_id)
+            if not model_info:
+                return None, "Model not found in database."
+
+            # Check if model will fit in memory
+            will_fit, message = self.check_model_size(model_info['hf_model_id'])
+
+            if not will_fit:
+                return None, message
+
             # Add evaluation to database and queue
             evaluation_id = self.db_manager.add_evaluation(
                 model_id=model_id,
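Note (not part of the diff): the local model_info above shadows the model_info function imported from huggingface_hub at the top of the file. It is harmless here because check_model_size goes through self.hf_api, but a non-shadowing name keeps the two apart; model_record below is a hypothetical name, and every call shown is taken from the hunk itself.

# Sketch only, not part of the commit: the same lookup with a non-shadowing local name.
model_record = self.db_manager.get_model(model_id)
if not model_record:
    return None, "Model not found in database."

will_fit, message = self.check_model_size(model_record['hf_model_id'])
if not will_fit:
    return None, message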
@@ -667,7 +751,7 @@ class EvaluationQueue:
             # Make sure worker is running
             self.start_worker()

-            return evaluation_id, "Evaluation submitted successfully."
+            return evaluation_id, f"Evaluation submitted successfully. {message}"
         except Exception as e:
             print(f"Submit evaluation error: {e}")
             return None, f"Failed to submit evaluation: {str(e)}"
@@ -695,7 +779,8 @@ class EvaluationQueue:
                 "failed": len(failed_evals),
                 "is_processing": self.is_processing,
                 "current_evaluation": current_eval,
-                "progress": progress
+                "progress": progress,
+                "memory_limit_gb": self.memory_limit_gb
             }
         except Exception as e:
             print(f"Queue status error: {e}")
@@ -707,6 +792,7 @@ class EvaluationQueue:
                 "is_processing": self.is_processing,
                 "current_evaluation": None,
                 "progress": 0,
+                "memory_limit_gb": self.memory_limit_gb,
                 "error": str(e)
             }

@@ -724,6 +810,13 @@ def create_model_submission_ui(evaluation_queue, auth_manager, db_manager):
     """
     with gr.Blocks() as submission_ui:
         with gr.Tab("Submit Model"):
+            gr.Markdown(f"""
+            ### Model Size Restrictions
+
+            Models must fit within {evaluation_queue.memory_limit_gb}GB of RAM for evaluation.
+            Large models will be rejected to ensure all evaluations can complete successfully.
+            """, elem_classes=["info-text"])
+
             with gr.Row():
                 with gr.Column(scale=2):
                     model_id_input = gr.Textbox(
@@ -731,6 +824,9 @@ def create_model_submission_ui(evaluation_queue, auth_manager, db_manager):
                         label="Model ID"
                     )

+                    check_size_button = gr.Button("Check Model Size")
+                    size_check_result = gr.Markdown("")
+
                     model_name_input = gr.Textbox(
                         placeholder="Display name for your model",
                         label="Model Name"
@@ -786,6 +882,20 @@ def create_model_submission_ui(evaluation_queue, auth_manager, db_manager):
             progress_display = gr.Markdown("Progress: 0%")

         # Event handlers
+        def check_model_size_handler(model_id):
+            if not model_id:
+                return "Please enter a HuggingFace model ID."
+
+            try:
+                will_fit, message = evaluation_queue.check_model_size(model_id)
+
+                if will_fit:
+                    return f"✅ {message}"
+                else:
+                    return f"❌ {message}"
+            except Exception as e:
+                return f"Error checking model size: {str(e)}"
+
         def refresh_benchmarks_handler():
             benchmarks = db_manager.get_benchmarks()

@@ -805,6 +915,12 @@ def create_model_submission_ui(evaluation_queue, auth_manager, db_manager):
                 return "Please fill in all required fields."

             try:
+                # Check if model will fit in RAM
+                will_fit, size_message = evaluation_queue.check_model_size(model_id)
+
+                if not will_fit:
+                    return f"❌ {size_message}"
+
                 # Add model to database
                 model_db_id = db_manager.add_model(
                     name=model_name,
)
|
943 |
|
944 |
if eval_id:
|
945 |
+
return f"✅ Model submitted successfully. {size_message}\nEvaluation ID: {eval_id}"
|
946 |
else:
|
947 |
return message
|
948 |
except Exception as e:
|
|
|
@@ -864,6 +980,12 @@ def create_model_submission_ui(evaluation_queue, auth_manager, db_manager):
             return stats, eval_data, "No evaluation currently running", "Progress: 0%"

         # Connect event handlers
+        check_size_button.click(
+            fn=check_model_size_handler,
+            inputs=[model_id_input],
+            outputs=[size_check_result]
+        )
+
         refresh_benchmarks_button.click(
             fn=refresh_benchmarks_handler,
             inputs=[],
@@ -895,5 +1017,11 @@ def create_model_submission_ui(evaluation_queue, auth_manager, db_manager):
             inputs=[],
             outputs=[benchmark_dropdown]
         )
+
+        submission_ui.load(
+            fn=refresh_queue_handler,
+            inputs=[],
+            outputs=[queue_stats, queue_status, current_eval_info, progress_display]
+        )

     return submission_ui