Update evaluation_queue.py

evaluation_queue.py  (CHANGED)  +161 -33
@@ -12,7 +12,7 @@ import threading
 import queue as queue_module
 from datetime import datetime, timedelta
 import gradio as gr
-from huggingface_hub import HfApi, hf_hub_download, snapshot_download
+from huggingface_hub import HfApi, hf_hub_download, snapshot_download, model_info
 from datasets import load_dataset
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
@@ -38,6 +38,8 @@ class EvaluationQueue:
         self.current_evaluation = None
         self.progress = 0
         self.progress_lock = threading.Lock()
+        # Memory limit for models in GB (leave 2GB for system)
+        self.memory_limit_gb = 14.0

     def start_worker(self):
         """Start the worker thread for processing the evaluation queue."""
@@ -53,6 +55,49 @@ class EvaluationQueue:
         if self.worker_thread and self.worker_thread.is_alive():
             self.worker_thread.join(timeout=1.0)

+    def check_model_size(self, model_id):
+        """Check if a model will fit within RAM limitations.
+
+        Args:
+            model_id: HuggingFace model ID
+
+        Returns:
+            tuple: (will_fit, message)
+        """
+        try:
+            # Query model info from the HuggingFace API
+            model_info_obj = self.hf_api.model_info(model_id)
+
+            # Check if model size information is available
+            if hasattr(model_info_obj, 'safetensors') and model_info_obj.safetensors:
+                # Calculate size in GB (divided by 1024^3)
+                total_size_gb = sum(
+                    file.size for file in model_info_obj.safetensors.values()
+                ) / (1024 * 1024 * 1024)
+            elif hasattr(model_info_obj, 'siblings'):
+                # Legacy method - calculate from file siblings
+                total_size_gb = sum(
+                    sibling.size for sibling in model_info_obj.siblings
+                    if sibling.rfilename.endswith(('.bin', '.safetensors', '.pt'))
+                ) / (1024 * 1024 * 1024)
+            else:
+                # Can't determine size
+                return False, "Unable to determine model size. Please ensure model is under 14GB."
+
+            # Account for memory overhead (tokenizer, processing, etc.)
+            estimated_ram_needed = total_size_gb * 1.3  # 30% overhead
+
+            # Check against limit
+            if estimated_ram_needed > self.memory_limit_gb:
+                return False, f"Model is too large (approximately {total_size_gb:.1f}GB, needs {estimated_ram_needed:.1f}GB RAM). Maximum allowed is {self.memory_limit_gb}GB."
+
+            return True, f"Model size check passed ({total_size_gb:.1f}GB, estimated {estimated_ram_needed:.1f}GB RAM usage)"
+
+        except Exception as e:
+            print(f"Model size check error: {e}")
+            # If we can't check, be cautious
+            return False, f"Error checking model size: {str(e)}. Please ensure your model is under {self.memory_limit_gb}GB."
+
     def _process_queue(self):
         """Process the evaluation queue in a separate thread."""
         while self.is_processing:
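Note (not part of the diff): in current huggingface_hub releases, ModelInfo.safetensors is a SafeTensorsInfo object that carries per-dtype parameter counts rather than a mapping of files with a .size attribute, and sibling file sizes are only filled in when model_info() is called with files_metadata=True, so both size branches above can fall through to the exception handler at runtime. The diff also adds a top-level model_info import that this method does not use, since it goes through self.hf_api. Below is a minimal sketch of the same size estimate under those assumptions; estimate_model_size_gb and BYTES_PER_PARAM are hypothetical names introduced only for illustration.

# Sketch only, not part of the commit: estimating model weight size from the Hub API.
from huggingface_hub import HfApi

BYTES_PER_PARAM = {"F64": 8, "F32": 4, "F16": 2, "BF16": 2, "I8": 1, "U8": 1}

def estimate_model_size_gb(model_id: str) -> float:
    # files_metadata=True asks the Hub to include per-file sizes for siblings.
    info = HfApi().model_info(model_id, files_metadata=True)
    if info.safetensors and info.safetensors.parameters:
        # SafeTensorsInfo.parameters maps dtype name -> parameter count.
        total_bytes = sum(
            count * BYTES_PER_PARAM.get(dtype, 4)
            for dtype, count in info.safetensors.parameters.items()
        )
    else:
        # Fall back to summing the reported sizes of weight files.
        total_bytes = sum(
            (sibling.size or 0)
            for sibling in info.siblings
            if sibling.rfilename.endswith((".bin", ".safetensors", ".pt"))
        )
    return total_bytes / (1024 ** 3)

For example, a ~124M-parameter fp32 model such as gpt2 comes out at roughly 0.5 GB before the 1.3x overhead factor used above.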
@@ -78,6 +123,12 @@ class EvaluationQueue:
                     benchmark_info = self.db_manager.get_benchmark(next_eval['benchmark_id'])

                     if model_info and benchmark_info:
+                        # Check if model will fit in memory
+                        will_fit, message = self.check_model_size(model_info['hf_model_id'])
+
+                        if not will_fit:
+                            raise Exception(f"Model too large for evaluation: {message}")
+
                         # Run the evaluation
                         results = self._run_evaluation(
                             model_info['hf_model_id'],
@@ -98,8 +149,13 @@ class EvaluationQueue:
                     raise Exception("Model or benchmark not found")
             except Exception as e:
                 print(f"Evaluation error: {e}")
-                # Update status to failed
+                # Update status to failed with error message
+                error_results = {"error": str(e)}
+                self.db_manager.update_evaluation_status(
+                    next_eval['id'],
+                    'failed',
+                    results=error_results
+                )

                 # Clear current evaluation
                 with self.progress_lock:
@@ -137,24 +193,34 @@ class EvaluationQueue:
             self.progress = 10  # Loading dataset

         # Load the dataset
+        try:
+            if config:
+                dataset = load_dataset(dataset_id, config, split="test")
+            else:
+                dataset = load_dataset(dataset_id, split="test")
+        except Exception as e:
+            return {"error": f"Failed to load dataset: {str(e)}"}

         # Update progress
         with self.progress_lock:
             self.progress = 20  # Loading model

+        try:
+            # Load the model with memory optimization settings
+            device = "cpu"
+            model = AutoModelForCausalLM.from_pretrained(
+                model_id,
+                device_map=device,
+                torch_dtype=torch.float32,  # Use float32 for CPU
+                low_cpu_mem_usage=True,  # Enable memory optimization
+                offload_folder="offload",  # Enable offloading if needed
+                offload_state_dict=True,  # Offload state dict for memory saving
+                max_memory={0: f"{self.memory_limit_gb}GB"}  # Limit memory usage
+            )
+            tokenizer = AutoTokenizer.from_pretrained(model_id)
+        except Exception as e:
+            print(f"Model loading error: {e}")
+            return {"error": f"Failed to load model: {str(e)}"}

         # Update progress
         with self.progress_lock:
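Note (not part of the diff): in the accelerate-style max_memory dict, integer keys refer to GPU indices and the CPU budget is keyed "cpu", and max_memory only takes effect when a device map is being inferred (e.g. device_map="auto"), so max_memory={0: ...} combined with device_map="cpu" likely does not cap RAM here. A minimal sketch of CPU-only loading under a RAM budget follows, under those assumptions and for a host with no GPU; load_for_cpu_eval is a hypothetical helper name.

# Sketch only, not part of the commit: CPU-only loading under a RAM budget.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

def load_for_cpu_eval(model_id: str, memory_limit_gb: float = 14.0):
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        device_map="auto",                            # let accelerate place modules
        max_memory={"cpu": f"{memory_limit_gb}GiB"},  # RAM budget keyed by "cpu", not a GPU index
        offload_folder="offload",                     # anything over the budget spills to disk
        torch_dtype=torch.float32,                    # full precision on CPU
        low_cpu_mem_usage=True,                       # avoid a second in-memory copy while loading
    )
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    return model, tokenizer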
@@ -167,18 +233,22 @@ class EvaluationQueue:
         with self.progress_lock:
             self.progress = 40  # Starting evaluation

+        try:
+            # Run appropriate evaluation based on task type
+            if task_type == "text-generation":
+                results = self._evaluate_text_generation(model, tokenizer, dataset)
+            elif task_type == "question-answering":
+                results = self._evaluate_question_answering(model, tokenizer, dataset)
+            elif task_type == "classification":
+                results = self._evaluate_classification(model, tokenizer, dataset)
+            elif task_type == "code-generation":
+                results = self._evaluate_code_generation(model, tokenizer, dataset)
+            else:
+                # Default to general evaluation
+                results = self._evaluate_general(model, tokenizer, dataset)
+        except Exception as e:
+            print(f"Evaluation task error: {e}")
+            return {"error": f"Evaluation failed: {str(e)}"}

         # Update progress
         with self.progress_lock:
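Note (not part of the diff): the task-type branch above could equally be written as a lookup table; a short sketch reusing the evaluator method names from the hunk, meant to slot into the same spot in the method.

# Sketch only, not part of the commit: table-driven dispatch over task_type.
evaluators = {
    "text-generation": self._evaluate_text_generation,
    "question-answering": self._evaluate_question_answering,
    "classification": self._evaluate_classification,
    "code-generation": self._evaluate_code_generation,
}
# Unknown task types fall back to the general evaluator.
evaluate = evaluators.get(task_type, self._evaluate_general)
results = evaluate(model, tokenizer, dataset)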
@@ -395,8 +465,7 @@ class EvaluationQueue:
             # Update progress based on completion percentage
             with self.progress_lock:
                 self.progress = 40 + int((i / len(dataset)) * 50)
+
             text = example.get("text", example.get("sentence", ""))
             label = str(example.get("label", example.get("class", "")))

@@ -611,6 +680,10 @@ class EvaluationQueue:
         Returns:
             float: Overall score between 0 and 100
         """
+        # If there was an error, return a low score
+        if "error" in results:
+            return 0.0
+
         score = 0.0

         # Check for common metrics and weight them
@@ -654,6 +727,17 @@ class EvaluationQueue:
             return None, "Daily submission limit reached. Try again tomorrow."

         try:
+            # Get model HuggingFace ID to check size
+            model_info = self.db_manager.get_model(model_id)
+            if not model_info:
+                return None, "Model not found in database."
+
+            # Check if model will fit in memory
+            will_fit, message = self.check_model_size(model_info['hf_model_id'])
+
+            if not will_fit:
+                return None, message
+
             # Add evaluation to database and queue
             evaluation_id = self.db_manager.add_evaluation(
                 model_id=model_id,
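Note (not part of the diff): the local model_info above shadows the model_info function imported from huggingface_hub at the top of the file. It is harmless here because check_model_size goes through self.hf_api, but a non-shadowing name keeps the two apart; model_record below is a hypothetical name, and every call shown is taken from the hunk itself.

# Sketch only, not part of the commit: the same lookup with a non-shadowing local name.
model_record = self.db_manager.get_model(model_id)
if not model_record:
    return None, "Model not found in database."

will_fit, message = self.check_model_size(model_record['hf_model_id'])
if not will_fit:
    return None, message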
@@ -667,7 +751,7 @@ class EvaluationQueue:
             # Make sure worker is running
             self.start_worker()

-            return evaluation_id, "Evaluation submitted successfully."
+            return evaluation_id, f"Evaluation submitted successfully. {message}"
         except Exception as e:
             print(f"Submit evaluation error: {e}")
             return None, f"Failed to submit evaluation: {str(e)}"
@@ -695,7 +779,8 @@ class EvaluationQueue:
                 "failed": len(failed_evals),
                 "is_processing": self.is_processing,
                 "current_evaluation": current_eval,
-                "progress": progress
+                "progress": progress,
+                "memory_limit_gb": self.memory_limit_gb
             }
         except Exception as e:
             print(f"Queue status error: {e}")
@@ -707,6 +792,7 @@ class EvaluationQueue:
                 "is_processing": self.is_processing,
                 "current_evaluation": None,
                 "progress": 0,
+                "memory_limit_gb": self.memory_limit_gb,
                 "error": str(e)
             }

@@ -724,6 +810,13 @@ def create_model_submission_ui(evaluation_queue, auth_manager, db_manager):
     """
     with gr.Blocks() as submission_ui:
         with gr.Tab("Submit Model"):
+            gr.Markdown(f"""
+            ### Model Size Restrictions
+
+            Models must fit within {evaluation_queue.memory_limit_gb}GB of RAM for evaluation.
+            Large models will be rejected to ensure all evaluations can complete successfully.
+            """, elem_classes=["info-text"])
+
             with gr.Row():
                 with gr.Column(scale=2):
                     model_id_input = gr.Textbox(
@@ -731,6 +824,9 @@ def create_model_submission_ui(evaluation_queue, auth_manager, db_manager):
                         label="Model ID"
                     )

+                    check_size_button = gr.Button("Check Model Size")
+                    size_check_result = gr.Markdown("")
+
                     model_name_input = gr.Textbox(
                         placeholder="Display name for your model",
                         label="Model Name"
@@ -786,6 +882,20 @@ def create_model_submission_ui(evaluation_queue, auth_manager, db_manager):
             progress_display = gr.Markdown("Progress: 0%")

         # Event handlers
+        def check_model_size_handler(model_id):
+            if not model_id:
+                return "Please enter a HuggingFace model ID."
+
+            try:
+                will_fit, message = evaluation_queue.check_model_size(model_id)
+
+                if will_fit:
+                    return f"✅ {message}"
+                else:
+                    return f"❌ {message}"
+            except Exception as e:
+                return f"Error checking model size: {str(e)}"
+
         def refresh_benchmarks_handler():
             benchmarks = db_manager.get_benchmarks()

@@ -805,6 +915,12 @@ def create_model_submission_ui(evaluation_queue, auth_manager, db_manager):
                 return "Please fill in all required fields."

             try:
+                # Check if model will fit in RAM
+                will_fit, size_message = evaluation_queue.check_model_size(model_id)
+
+                if not will_fit:
+                    return f"❌ {size_message}"
+
                 # Add model to database
                 model_db_id = db_manager.add_model(
                     name=model_name,
)
|
943 |
|
944 |
if eval_id:
|
945 |
+
return f"✅ Model submitted successfully. {size_message}\nEvaluation ID: {eval_id}"
|
946 |
else:
|
947 |
return message
|
948 |
except Exception as e:
|
|
|
@@ -864,6 +980,12 @@ def create_model_submission_ui(evaluation_queue, auth_manager, db_manager):
             return stats, eval_data, "No evaluation currently running", "Progress: 0%"

         # Connect event handlers
+        check_size_button.click(
+            fn=check_model_size_handler,
+            inputs=[model_id_input],
+            outputs=[size_check_result]
+        )
+
         refresh_benchmarks_button.click(
             fn=refresh_benchmarks_handler,
             inputs=[],
@@ -895,5 +1017,11 @@ def create_model_submission_ui(evaluation_queue, auth_manager, db_manager):
             inputs=[],
             outputs=[benchmark_dropdown]
         )
+
+        submission_ui.load(
+            fn=refresh_queue_handler,
+            inputs=[],
+            outputs=[queue_stats, queue_status, current_eval_info, progress_display]
+        )

     return submission_ui