Quazim0t0 committed
Commit c23f2ac · verified
1 Parent(s): d8a1516

Update evaluation_queue.py

Files changed (1)
  1. evaluation_queue.py +161 -33
evaluation_queue.py CHANGED
@@ -12,7 +12,7 @@ import threading
 import queue as queue_module
 from datetime import datetime, timedelta
 import gradio as gr
-from huggingface_hub import HfApi, hf_hub_download, snapshot_download
+from huggingface_hub import HfApi, hf_hub_download, snapshot_download, model_info
 from datasets import load_dataset
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
@@ -38,6 +38,8 @@ class EvaluationQueue:
         self.current_evaluation = None
         self.progress = 0
         self.progress_lock = threading.Lock()
+        # Memory limit for models in GB (leave 2GB for system)
+        self.memory_limit_gb = 14.0
 
     def start_worker(self):
         """Start the worker thread for processing the evaluation queue."""
@@ -53,6 +55,49 @@
         if self.worker_thread and self.worker_thread.is_alive():
             self.worker_thread.join(timeout=1.0)
 
+    def check_model_size(self, model_id):
+        """Check if a model will fit within RAM limitations.
+
+        Args:
+            model_id: HuggingFace model ID
+
+        Returns:
+            tuple: (will_fit, message)
+        """
+        try:
+            # Query model info from the HuggingFace API
+            model_info_obj = self.hf_api.model_info(model_id)
+
+            # Check if model size information is available
+            if hasattr(model_info_obj, 'safetensors') and model_info_obj.safetensors:
+                # Calculate size in GB (divided by 1024^3)
+                total_size_gb = sum(
+                    file.size for file in model_info_obj.safetensors.values()
+                ) / (1024 * 1024 * 1024)
+            elif hasattr(model_info_obj, 'siblings'):
+                # Legacy method - calculate from file siblings
+                total_size_gb = sum(
+                    sibling.size for sibling in model_info_obj.siblings
+                    if sibling.rfilename.endswith(('.bin', '.safetensors', '.pt'))
+                ) / (1024 * 1024 * 1024)
+            else:
+                # Can't determine size
+                return False, "Unable to determine model size. Please ensure model is under 14GB."
+
+            # Account for memory overhead (tokenizer, processing, etc.)
+            estimated_ram_needed = total_size_gb * 1.3  # 30% overhead
+
+            # Check against limit
+            if estimated_ram_needed > self.memory_limit_gb:
+                return False, f"Model is too large (approximately {total_size_gb:.1f}GB, needs {estimated_ram_needed:.1f}GB RAM). Maximum allowed is {self.memory_limit_gb}GB."
+
+            return True, f"Model size check passed ({total_size_gb:.1f}GB, estimated {estimated_ram_needed:.1f}GB RAM usage)"
+
+        except Exception as e:
+            print(f"Model size check error: {e}")
+            # If we can't check, be cautious
+            return False, f"Error checking model size: {str(e)}. Please ensure your model is under {self.memory_limit_gb}GB."
+
     def _process_queue(self):
         """Process the evaluation queue in a separate thread."""
         while self.is_processing:
@@ -78,6 +123,12 @@
                 benchmark_info = self.db_manager.get_benchmark(next_eval['benchmark_id'])
 
                 if model_info and benchmark_info:
+                    # Check if model will fit in memory
+                    will_fit, message = self.check_model_size(model_info['hf_model_id'])
+
+                    if not will_fit:
+                        raise Exception(f"Model too large for evaluation: {message}")
+
                     # Run the evaluation
                     results = self._run_evaluation(
                         model_info['hf_model_id'],
@@ -98,8 +149,13 @@
                     raise Exception("Model or benchmark not found")
             except Exception as e:
                 print(f"Evaluation error: {e}")
-                # Update status to failed
-                self.db_manager.update_evaluation_status(next_eval['id'], 'failed')
+                # Update status to failed with error message
+                error_results = {"error": str(e)}
+                self.db_manager.update_evaluation_status(
+                    next_eval['id'],
+                    'failed',
+                    results=error_results
+                )
 
                 # Clear current evaluation
                 with self.progress_lock:
@@ -137,24 +193,34 @@
             self.progress = 10  # Loading dataset
 
         # Load the dataset
-        if config:
-            dataset = load_dataset(dataset_id, config, split="test")
-        else:
-            dataset = load_dataset(dataset_id, split="test")
+        try:
+            if config:
+                dataset = load_dataset(dataset_id, config, split="test")
+            else:
+                dataset = load_dataset(dataset_id, split="test")
+        except Exception as e:
+            return {"error": f"Failed to load dataset: {str(e)}"}
 
         # Update progress
         with self.progress_lock:
             self.progress = 20  # Loading model
 
-        # Load the model (CPU only)
-        device = "cpu"
-        model = AutoModelForCausalLM.from_pretrained(
-            model_id,
-            device_map=device,
-            torch_dtype=torch.float32,  # Use float32 for CPU
-            low_cpu_mem_usage=True
-        )
-        tokenizer = AutoTokenizer.from_pretrained(model_id)
+        try:
+            # Load the model with memory optimization settings
+            device = "cpu"
+            model = AutoModelForCausalLM.from_pretrained(
+                model_id,
+                device_map=device,
+                torch_dtype=torch.float32,  # Use float32 for CPU
+                low_cpu_mem_usage=True,  # Enable memory optimization
+                offload_folder="offload",  # Enable offloading if needed
+                offload_state_dict=True,  # Offload state dict for memory saving
+                max_memory={0: f"{self.memory_limit_gb}GB"}  # Limit memory usage
+            )
+            tokenizer = AutoTokenizer.from_pretrained(model_id)
+        except Exception as e:
+            print(f"Model loading error: {e}")
+            return {"error": f"Failed to load model: {str(e)}"}
 
         # Update progress
         with self.progress_lock:
@@ -167,18 +233,22 @@
         with self.progress_lock:
             self.progress = 40  # Starting evaluation
 
-        # Run appropriate evaluation based on task type
-        if task_type == "text-generation":
-            results = self._evaluate_text_generation(model, tokenizer, dataset)
-        elif task_type == "question-answering":
-            results = self._evaluate_question_answering(model, tokenizer, dataset)
-        elif task_type == "classification":
-            results = self._evaluate_classification(model, tokenizer, dataset)
-        elif task_type == "code-generation":
-            results = self._evaluate_code_generation(model, tokenizer, dataset)
-        else:
-            # Default to general evaluation
-            results = self._evaluate_general(model, tokenizer, dataset)
+        try:
+            # Run appropriate evaluation based on task type
+            if task_type == "text-generation":
+                results = self._evaluate_text_generation(model, tokenizer, dataset)
+            elif task_type == "question-answering":
+                results = self._evaluate_question_answering(model, tokenizer, dataset)
+            elif task_type == "classification":
+                results = self._evaluate_classification(model, tokenizer, dataset)
+            elif task_type == "code-generation":
+                results = self._evaluate_code_generation(model, tokenizer, dataset)
+            else:
+                # Default to general evaluation
+                results = self._evaluate_general(model, tokenizer, dataset)
+        except Exception as e:
+            print(f"Evaluation task error: {e}")
+            return {"error": f"Evaluation failed: {str(e)}"}
 
         # Update progress
         with self.progress_lock:
@@ -395,8 +465,7 @@
             # Update progress based on completion percentage
             with self.progress_lock:
                 self.progress = 40 + int((i / len(dataset)) * 50)
-
-
+
             text = example.get("text", example.get("sentence", ""))
             label = str(example.get("label", example.get("class", "")))
 
@@ -611,6 +680,10 @@
        Returns:
            float: Overall score between 0 and 100
        """
+        # If there was an error, return a low score
+        if "error" in results:
+            return 0.0
+
        score = 0.0
 
        # Check for common metrics and weight them
@@ -654,6 +727,17 @@
            return None, "Daily submission limit reached. Try again tomorrow."
 
        try:
+            # Get model HuggingFace ID to check size
+            model_info = self.db_manager.get_model(model_id)
+            if not model_info:
+                return None, "Model not found in database."
+
+            # Check if model will fit in memory
+            will_fit, message = self.check_model_size(model_info['hf_model_id'])
+
+            if not will_fit:
+                return None, message
+
            # Add evaluation to database and queue
            evaluation_id = self.db_manager.add_evaluation(
                model_id=model_id,
@@ -667,7 +751,7 @@
            # Make sure worker is running
            self.start_worker()
 
-            return evaluation_id, "Evaluation submitted successfully."
+            return evaluation_id, f"Evaluation submitted successfully. {message}"
        except Exception as e:
            print(f"Submit evaluation error: {e}")
            return None, f"Failed to submit evaluation: {str(e)}"
@@ -695,7 +779,8 @@
                "failed": len(failed_evals),
                "is_processing": self.is_processing,
                "current_evaluation": current_eval,
-                "progress": progress
+                "progress": progress,
+                "memory_limit_gb": self.memory_limit_gb
            }
        except Exception as e:
            print(f"Queue status error: {e}")
@@ -707,6 +792,7 @@
                "is_processing": self.is_processing,
                "current_evaluation": None,
                "progress": 0,
+                "memory_limit_gb": self.memory_limit_gb,
                "error": str(e)
            }
 
@@ -724,6 +810,13 @@ def create_model_submission_ui(evaluation_queue, auth_manager, db_manager):
    """
    with gr.Blocks() as submission_ui:
        with gr.Tab("Submit Model"):
+            gr.Markdown(f"""
+            ### Model Size Restrictions
+
+            Models must fit within {evaluation_queue.memory_limit_gb}GB of RAM for evaluation.
+            Large models will be rejected to ensure all evaluations can complete successfully.
+            """, elem_classes=["info-text"])
+
            with gr.Row():
                with gr.Column(scale=2):
                    model_id_input = gr.Textbox(
@@ -731,6 +824,9 @@ def create_model_submission_ui(evaluation_queue, auth_manager, db_manager):
                        label="Model ID"
                    )
 
+                    check_size_button = gr.Button("Check Model Size")
+                    size_check_result = gr.Markdown("")
+
                    model_name_input = gr.Textbox(
                        placeholder="Display name for your model",
                        label="Model Name"
@@ -786,6 +882,20 @@ def create_model_submission_ui(evaluation_queue, auth_manager, db_manager):
                    progress_display = gr.Markdown("Progress: 0%")
 
        # Event handlers
+        def check_model_size_handler(model_id):
+            if not model_id:
+                return "Please enter a HuggingFace model ID."
+
+            try:
+                will_fit, message = evaluation_queue.check_model_size(model_id)
+
+                if will_fit:
+                    return f"✅ {message}"
+                else:
+                    return f"❌ {message}"
+            except Exception as e:
+                return f"Error checking model size: {str(e)}"
+
        def refresh_benchmarks_handler():
            benchmarks = db_manager.get_benchmarks()
 
@@ -805,6 +915,12 @@
                return "Please fill in all required fields."
 
            try:
+                # Check if model will fit in RAM
+                will_fit, size_message = evaluation_queue.check_model_size(model_id)
+
+                if not will_fit:
+                    return f"❌ {size_message}"
+
                # Add model to database
                model_db_id = db_manager.add_model(
                    name=model_name,
@@ -826,7 +942,7 @@
                )
 
                if eval_id:
-                    return f"Model submitted successfully. Evaluation ID: {eval_id}"
+                    return f"Model submitted successfully. {size_message}\nEvaluation ID: {eval_id}"
                else:
                    return message
            except Exception as e:
@@ -864,6 +980,12 @@ def create_model_submission_ui(evaluation_queue, auth_manager, db_manager):
            return stats, eval_data, "No evaluation currently running", "Progress: 0%"
 
        # Connect event handlers
+        check_size_button.click(
+            fn=check_model_size_handler,
+            inputs=[model_id_input],
+            outputs=[size_check_result]
+        )
+
        refresh_benchmarks_button.click(
            fn=refresh_benchmarks_handler,
            inputs=[],
@@ -895,5 +1017,11 @@ def create_model_submission_ui(evaluation_queue, auth_manager, db_manager):
            inputs=[],
            outputs=[benchmark_dropdown]
        )
+
+        submission_ui.load(
+            fn=refresh_queue_handler,
+            inputs=[],
+            outputs=[queue_stats, queue_status, current_eval_info, progress_display]
+        )
 
    return submission_ui
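
For reference, a minimal standalone sketch of the size-estimation idea behind the new check_model_size method, assuming only that huggingface_hub is installed. The function name estimate_model_ram_gb, the hard-coded 14.0 GB limit, and the use of HfApi.model_info(..., files_metadata=True) to obtain per-file sizes are illustrative assumptions, not part of this commit:

# Illustrative sketch only (not from the commit): approximate the RAM needed to
# load a Hub model by summing its weight-file sizes and adding ~30% overhead,
# mirroring the heuristic used by check_model_size in the diff above.
from huggingface_hub import HfApi

MEMORY_LIMIT_GB = 14.0  # assumed to mirror self.memory_limit_gb in the commit

def estimate_model_ram_gb(model_id: str) -> float:
    """Return a rough RAM estimate (in GB) for loading the given Hub model."""
    api = HfApi()
    # files_metadata=True asks the Hub to include per-file sizes on the siblings list
    info = api.model_info(model_id, files_metadata=True)
    weight_bytes = sum(
        (sibling.size or 0)
        for sibling in info.siblings
        if sibling.rfilename.endswith((".bin", ".safetensors", ".pt"))
    )
    return (weight_bytes / 1024**3) * 1.3  # 30% overhead, as in the commit

if __name__ == "__main__":
    needed = estimate_model_ram_gb("gpt2")
    print(f"Estimated RAM: {needed:.2f} GB; fits under {MEMORY_LIMIT_GB} GB: {needed <= MEMORY_LIMIT_GB}")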