qLeaderboard-aBase4Community

Running

App Files Files Community

Quazim0t0 committed on Mar 21

Commit

d1078a3

verified ·

1 Parent(s): e14f110

Upload 3 files

Browse files

Files changed (3) hide show

database_schema.py +393 -0
evaluation_queue.py +947 -0
leaderboard.py +381 -0

database_schema.py ADDED Viewed

	@@ -0,0 +1,393 @@

+"""
+Database schema for Dynamic Highscores system.
+This module defines the SQLite database schema for the Dynamic Highscores system,
+which integrates benchmark selection, model evaluation, and leaderboard functionality.
+"""
+import sqlite3
+import os
+import json
+from datetime import datetime, timedelta
+import pandas as pd
+class DynamicHighscoresDB:
+    """Database manager for the Dynamic Highscores system."""
+    def __init__(self, db_path="dynamic_highscores.db"):
+        """Initialize the database connection and create tables if they don't exist."""
+        self.db_path = db_path
+        self.conn = None
+        self.cursor = None
+        self.connect()
+        self.create_tables()
+    def connect(self):
+        """Connect to the SQLite database."""
+        self.conn = sqlite3.connect(self.db_path)
+        self.conn.row_factory = sqlite3.Row
+        self.cursor = self.conn.cursor()
+    def close(self):
+        """Close the database connection."""
+        if self.conn:
+            self.conn.close()
+    def create_tables(self):
+        """Create all necessary tables if they don't exist."""
+        # Users table - stores user information
+        self.cursor.execute('''
+        CREATE TABLE IF NOT EXISTS users (
+            id INTEGER PRIMARY KEY AUTOINCREMENT,
+            username TEXT UNIQUE NOT NULL,
+            hf_user_id TEXT UNIQUE NOT NULL,
+            is_admin BOOLEAN DEFAULT 0,
+            last_submission_date TEXT,
+            created_at TEXT DEFAULT CURRENT_TIMESTAMP
+        )
+        ''')
+        # Benchmarks table - stores information about available benchmarks
+        self.cursor.execute('''
+        CREATE TABLE IF NOT EXISTS benchmarks (
+            id INTEGER PRIMARY KEY AUTOINCREMENT,
+            name TEXT NOT NULL,
+            dataset_id TEXT NOT NULL,
+            description TEXT,
+            metrics TEXT,  -- JSON string of metrics
+            created_at TEXT DEFAULT CURRENT_TIMESTAMP
+        )
+        ''')
+        # Models table - stores information about submitted models
+        self.cursor.execute('''
+        CREATE TABLE IF NOT EXISTS models (
+            id INTEGER PRIMARY KEY AUTOINCREMENT,
+            name TEXT NOT NULL,
+            hf_model_id TEXT NOT NULL,
+            user_id INTEGER NOT NULL,
+            tag TEXT NOT NULL,  -- One of: Merge, Agent, Reasoning, Coding, etc.
+            parameters TEXT,  -- Number of parameters (can be NULL)
+            description TEXT,
+            created_at TEXT DEFAULT CURRENT_TIMESTAMP,
+            FOREIGN KEY (user_id) REFERENCES users (id),
+            UNIQUE (hf_model_id, user_id)
+        )
+        ''')
+        # Evaluations table - stores evaluation results
+        self.cursor.execute('''
+        CREATE TABLE IF NOT EXISTS evaluations (
+            id INTEGER PRIMARY KEY AUTOINCREMENT,
+            model_id INTEGER NOT NULL,
+            benchmark_id INTEGER NOT NULL,
+            status TEXT NOT NULL,  -- pending, running, completed, failed
+            results TEXT,  -- JSON string of results
+            score REAL,  -- Overall score (can be NULL)
+            submitted_at TEXT DEFAULT CURRENT_TIMESTAMP,
+            completed_at TEXT,
+            FOREIGN KEY (model_id) REFERENCES models (id),
+            FOREIGN KEY (benchmark_id) REFERENCES benchmarks (id)
+        )
+        ''')
+        # Queue table - stores evaluation queue
+        self.cursor.execute('''
+        CREATE TABLE IF NOT EXISTS queue (
+            id INTEGER PRIMARY KEY AUTOINCREMENT,
+            evaluation_id INTEGER NOT NULL,
+            priority INTEGER DEFAULT 0,  -- Higher number = higher priority
+            added_at TEXT DEFAULT CURRENT_TIMESTAMP,
+            FOREIGN KEY (evaluation_id) REFERENCES evaluations (id)
+        )
+        ''')
+        self.conn.commit()
+    # User management methods
+    def add_user(self, username, hf_user_id, is_admin=False):
+        """Add a new user to the database."""
+        try:
+            self.cursor.execute(
+                "INSERT INTO users (username, hf_user_id, is_admin) VALUES (?, ?, ?)",
+                (username, hf_user_id, is_admin)
+            )
+            self.conn.commit()
+            return self.cursor.lastrowid
+        except sqlite3.IntegrityError:
+            # User already exists
+            self.cursor.execute(
+                "SELECT id FROM users WHERE hf_user_id = ?",
+                (hf_user_id,)
+            )
+            return self.cursor.fetchone()[0]
+    def get_user(self, hf_user_id):
+        """Get user information by HuggingFace user ID."""
+        self.cursor.execute(
+            "SELECT * FROM users WHERE hf_user_id = ?",
+            (hf_user_id,)
+        )
+        return dict(self.cursor.fetchone()) if self.cursor.fetchone() else None
+    def can_submit_today(self, user_id):
+        """Check if a user can submit a benchmark evaluation today."""
+        self.cursor.execute(
+            "SELECT is_admin, last_submission_date FROM users WHERE id = ?",
+            (user_id,)
+        )
+        result = self.cursor.fetchone()
+        if not result:
+            return False
+        user_data = dict(result)
+        # Admin can always submit
+        if user_data['is_admin']:
+            return True
+        # If no previous submission, user can submit
+        if not user_data['last_submission_date']:
+            return True
+        # Check if last submission was before today
+        last_date = datetime.fromisoformat(user_data['last_submission_date'])
+        today = datetime.now().replace(hour=0, minute=0, second=0, microsecond=0)
+        return last_date < today
+    def update_submission_date(self, user_id):
+        """Update the last submission date for a user."""
+        current_time = datetime.now().isoformat()
+        self.cursor.execute(
+            "UPDATE users SET last_submission_date = ? WHERE id = ?",
+            (current_time, user_id)
+        )
+        self.conn.commit()
+    # Benchmark management methods
+    def add_benchmark(self, name, dataset_id, description="", metrics=None):
+        """Add a new benchmark to the database."""
+        if metrics is None:
+            metrics = {}
+        metrics_json = json.dumps(metrics)
+        try:
+            self.cursor.execute(
+                "INSERT INTO benchmarks (name, dataset_id, description, metrics) VALUES (?, ?, ?, ?)",
+                (name, dataset_id, description, metrics_json)
+            )
+            self.conn.commit()
+            return self.cursor.lastrowid
+        except sqlite3.IntegrityError:
+            # Benchmark already exists with this dataset_id
+            self.cursor.execute(
+                "SELECT id FROM benchmarks WHERE dataset_id = ?",
+                (dataset_id,)
+            )
+            return self.cursor.fetchone()[0]
+    def get_benchmarks(self):
+        """Get all available benchmarks."""
+        self.cursor.execute("SELECT * FROM benchmarks")
+        benchmarks = [dict(row) for row in self.cursor.fetchall()]
+        # Parse metrics JSON
+        for benchmark in benchmarks:
+            benchmark['metrics'] = json.loads(benchmark['metrics'])
+        return benchmarks
+    def get_benchmark(self, benchmark_id):
+        """Get benchmark information by ID."""
+        self.cursor.execute(
+            "SELECT * FROM benchmarks WHERE id = ?",
+            (benchmark_id,)
+        )
+        benchmark = dict(self.cursor.fetchone()) if self.cursor.fetchone() else None
+        if benchmark:
+            benchmark['metrics'] = json.loads(benchmark['metrics'])
+        return benchmark
+    # Model management methods
+    def add_model(self, name, hf_model_id, user_id, tag, parameters=None, description=""):
+        """Add a new model to the database."""
+        try:
+            self.cursor.execute(
+                "INSERT INTO models (name, hf_model_id, user_id, tag, parameters, description) VALUES (?, ?, ?, ?, ?, ?)",
+                (name, hf_model_id, user_id, tag, parameters, description)
+            )
+            self.conn.commit()
+            return self.cursor.lastrowid
+        except sqlite3.IntegrityError:
+            # Model already exists for this user
+            self.cursor.execute(
+                "SELECT id FROM models WHERE hf_model_id = ? AND user_id = ?",
+                (hf_model_id, user_id)
+            )
+            return self.cursor.fetchone()[0]
+    def get_models(self, tag=None):
+        """Get all models, optionally filtered by tag."""
+        if tag:
+            self.cursor.execute(
+                "SELECT * FROM models WHERE tag = ?",
+                (tag,)
+            )
+        else:
+            self.cursor.execute("SELECT * FROM models")
+        return [dict(row) for row in self.cursor.fetchall()]
+    def get_model(self, model_id):
+        """Get model information by ID."""
+        self.cursor.execute(
+            "SELECT * FROM models WHERE id = ?",
+            (model_id,)
+        )
+        return dict(self.cursor.fetchone()) if self.cursor.fetchone() else None
+    # Evaluation management methods
+    def add_evaluation(self, model_id, benchmark_id, priority=0):
+        """Add a new evaluation to the database and queue."""
+        # First, add the evaluation
+        self.cursor.execute(
+            "INSERT INTO evaluations (model_id, benchmark_id, status) VALUES (?, ?, 'pending')",
+            (model_id, benchmark_id)
+        )
+        evaluation_id = self.cursor.lastrowid
+        # Then, add it to the queue
+        self.cursor.execute(
+            "INSERT INTO queue (evaluation_id, priority) VALUES (?, ?)",
+            (evaluation_id, priority)
+        )
+        self.conn.commit()
+        return evaluation_id
+    def update_evaluation_status(self, evaluation_id, status, results=None, score=None):
+        """Update the status of an evaluation."""
+        params = [status, evaluation_id]
+        sql = "UPDATE evaluations SET status = ?"
+        if results is not None:
+            sql += ", results = ?"
+            params.insert(1, json.dumps(results))
+        if score is not None:
+            sql += ", score = ?"
+            params.insert(1 if results is None else 2, score)
+        if status in ['completed', 'failed']:
+            sql += ", completed_at = ?"
+            params.insert(1 if results is None and score is None else (2 if results is None or score is None else 3),
+                         datetime.now().isoformat())
+        sql += " WHERE id = ?"
+        self.cursor.execute(sql, params)
+        self.conn.commit()
+        # If completed or failed, remove from queue
+        if status in ['completed', 'failed']:
+            self.cursor.execute(
+                "DELETE FROM queue WHERE evaluation_id = ?",
+                (evaluation_id,)
+            )
+            self.conn.commit()
+    def get_next_in_queue(self):
+        """Get the next evaluation in the queue."""
+        self.cursor.execute("""
+            SELECT q.id as queue_id, q.evaluation_id, e.model_id, e.benchmark_id, m.hf_model_id, b.dataset_id
+            FROM queue q
+            JOIN evaluations e ON q.evaluation_id = e.id
+            JOIN models m ON e.model_id = m.id
+            JOIN benchmarks b ON e.benchmark_id = b.id
+            WHERE e.status = 'pending'
+            ORDER BY q.priority DESC, q.added_at ASC
+            LIMIT 1
+        """)
+        result = self.cursor.fetchone()
+        return dict(result) if result else None
+    def get_evaluation_results(self, model_id=None, benchmark_id=None, tag=None):
+        """Get evaluation results, optionally filtered by model, benchmark, or tag."""
+        sql = """
+            SELECT e.id, e.model_id, e.benchmark_id, e.status, e.results, e.score,
+                   e.submitted_at, e.completed_at, m.name as model_name, m.tag,
+                   b.name as benchmark_name
+            FROM evaluations e
+            JOIN models m ON e.model_id = m.id
+            JOIN benchmarks b ON e.benchmark_id = b.id
+            WHERE e.status = 'completed'
+        """
+        params = []
+        if model_id:
+            sql += " AND e.model_id = ?"
+            params.append(model_id)
+        if benchmark_id:
+            sql += " AND e.benchmark_id = ?"
+            params.append(benchmark_id)
+        if tag:
+            sql += " AND m.tag = ?"
+            params.append(tag)
+        sql += " ORDER BY e.completed_at DESC"
+        self.cursor.execute(sql, params)
+        results = [dict(row) for row in self.cursor.fetchall()]
+        # Parse results JSON
+        for result in results:
+            if result['results']:
+                result['results'] = json.loads(result['results'])
+        return results
+    def get_leaderboard_df(self, tag=None):
+        """Get a pandas DataFrame of the leaderboard, optionally filtered by tag."""
+        results = self.get_evaluation_results(tag=tag)
+        if not results:
+            return pd.DataFrame()
+        # Create a list of dictionaries for the DataFrame
+        leaderboard_data = []
+        for result in results:
+            entry = {
+                'model_name': result['model_name'],
+                'model_id': result['model_id'],
+                'benchmark_name': result['benchmark_name'],
+                'benchmark_id': result['benchmark_id'],
+                'tag': result['tag'],
+                'score': result['score'],
+                'completed_at': result['completed_at']
+            }
+            # Add individual metrics from results
+            if result['results'] and isinstance(result['results'], dict):
+                for metric, value in result['results'].items():
+                    if isinstance(value, (int, float)):
+                        entry[f'metric_{metric}'] = value
+            leaderboard_data.append(entry)
+        return pd.DataFrame(leaderboard_data)
+# Initialize the database
+def init_db(db_path="dynamic_highscores.db"):
+    """Initialize the database and return the database manager."""
+    db = DynamicHighscoresDB(db_path)
+    return db

evaluation_queue.py ADDED Viewed

	@@ -0,0 +1,947 @@

+"""
+Model evaluation queue system for Dynamic Highscores.
+This module handles the evaluation queue, CPU-only processing,
+and enforces daily submission limits for users.
+"""
+import os
+import json
+import time
+import threading
+import queue
+from datetime import datetime, timedelta
+import gradio as gr
+from huggingface_hub import HfApi, hf_hub_download, snapshot_download
+from datasets import load_dataset
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
+import sqlite3
+class EvaluationQueue:
+    """Manages the evaluation queue for model benchmarking."""
+    def __init__(self, db_manager, auth_manager):
+        """Initialize the evaluation queue manager.
+        Args:
+            db_manager: Database manager instance
+            auth_manager: Authentication manager instance
+        """
+        self.db_manager = db_manager
+        self.auth_manager = auth_manager
+        self.hf_api = HfApi()
+        self.queue = queue.Queue()
+        self.is_processing = False
+        self.worker_thread = None
+        self.model_tags = ["Merge", "Agent", "Reasoning", "Coding", "General", "Specialized", "Instruction", "Chat"]
+        self.current_evaluation = None
+        self.progress = 0
+        self.progress_lock = threading.Lock()
+        self.db_path = db_manager.db_path  # Store the path to create new connections in worker thread
+    def start_worker(self):
+        """Start the worker thread for processing the evaluation queue."""
+        if self.worker_thread is None or not self.worker_thread.is_alive():
+            self.is_processing = True
+            self.worker_thread = threading.Thread(target=self._process_queue)
+            self.worker_thread.daemon = True
+            self.worker_thread.start()
+    def stop_worker(self):
+        """Stop the worker thread."""
+        self.is_processing = False
+        if self.worker_thread and self.worker_thread.is_alive():
+            self.worker_thread.join(timeout=1.0)
+    def _process_queue(self):
+        """Process the evaluation queue in a separate thread."""
+        # Create a new database connection for this thread
+        thread_db = sqlite3.connect(self.db_path)
+        thread_db.row_factory = sqlite3.Row
+        while self.is_processing:
+            try:
+                # Get the next evaluation from the database using thread-local connection
+                cursor = thread_db.cursor()
+                cursor.execute("""
+                    SELECT e.id as evaluation_id, e.model_id, e.benchmark_id, m.hf_model_id, b.dataset_id
+                    FROM queue q
+                    JOIN evaluations e ON q.evaluation_id = e.id
+                    JOIN models m ON e.model_id = m.id
+                    JOIN benchmarks b ON e.benchmark_id = b.id
+                    WHERE e.status = 'pending'
+                    ORDER BY q.priority DESC, q.added_at ASC
+                    LIMIT 1
+                """)
+                row = cursor.fetchone()
+                if row:
+                    next_eval = dict(row)
+                    # Update status to running
+                    cursor.execute("""
+                        UPDATE evaluations
+                        SET status = 'running', started_at = datetime('now')
+                        WHERE id = ?
+                    """, (next_eval['evaluation_id'],))
+                    thread_db.commit()
+                    # Set current evaluation and reset progress
+                    with self.progress_lock:
+                        self.current_evaluation = next_eval
+                        self.progress = 0
+                    try:
+                        # Run the evaluation
+                        results = self._run_evaluation(
+                            next_eval['hf_model_id'],
+                            next_eval['dataset_id']
+                        )
+                        # Calculate overall score
+                        score = self._calculate_overall_score(results)
+                        # Update status to completed with results
+                        cursor.execute("""
+                            UPDATE evaluations
+                            SET status = 'completed',
+                                completed_at = datetime('now'),
+                                results = ?,
+                                score = ?
+                            WHERE id = ?
+                        """, (json.dumps(results), score, next_eval['evaluation_id']))
+                        thread_db.commit()
+                    except Exception as e:
+                        print(f"Evaluation error: {e}")
+                        # Update status to failed
+                        cursor.execute("""
+                            UPDATE evaluations
+                            SET status = 'failed', completed_at = datetime('now')
+                            WHERE id = ?
+                        """, (next_eval['evaluation_id'],))
+                        thread_db.commit()
+                    # Clear current evaluation
+                    with self.progress_lock:
+                        self.current_evaluation = None
+                        self.progress = 0
+                else:
+                    # No evaluations in queue, sleep for a bit
+                    time.sleep(5)
+            except Exception as e:
+                print(f"Queue processing error: {e}")
+                time.sleep(5)
+        # Close the thread-local database connection
+        thread_db.close()
+    def _run_evaluation(self, model_id, dataset_id):
+        """Run an evaluation for a model on a benchmark.
+        Args:
+            model_id: HuggingFace model ID
+            dataset_id: HuggingFace dataset ID (with optional config)
+        Returns:
+            dict: Evaluation results
+        """
+        # Update progress
+        with self.progress_lock:
+            self.progress = 5  # Starting evaluation
+        # Parse dataset ID and config
+        if ":" in dataset_id:
+            dataset_id, config = dataset_id.split(":", 1)
+        else:
+            config = None
+        # Update progress
+        with self.progress_lock:
+            self.progress = 10  # Loading dataset
+        # Load the dataset
+        if config:
+            dataset = load_dataset(dataset_id, config, split="test")
+        else:
+            dataset = load_dataset(dataset_id, split="test")
+        # Update progress
+        with self.progress_lock:
+            self.progress = 20  # Loading model
+        # Load the model (CPU only)
+        device = "cpu"
+        model = AutoModelForCausalLM.from_pretrained(
+            model_id,
+            device_map=device,
+            torch_dtype=torch.float32,  # Use float32 for CPU
+            low_cpu_mem_usage=True
+        )
+        tokenizer = AutoTokenizer.from_pretrained(model_id)
+        # Update progress
+        with self.progress_lock:
+            self.progress = 30  # Determining task type
+        # Determine task type based on dataset features
+        task_type = self._determine_task_type(dataset)
+        # Update progress
+        with self.progress_lock:
+            self.progress = 40  # Starting evaluation
+        # Run appropriate evaluation based on task type
+        if task_type == "text-generation":
+            results = self._evaluate_text_generation(model, tokenizer, dataset)
+        elif task_type == "question-answering":
+            results = self._evaluate_question_answering(model, tokenizer, dataset)
+        elif task_type == "classification":
+            results = self._evaluate_classification(model, tokenizer, dataset)
+        elif task_type == "code-generation":
+            results = self._evaluate_code_generation(model, tokenizer, dataset)
+        else:
+            # Default to general evaluation
+            results = self._evaluate_general(model, tokenizer, dataset)
+        # Update progress
+        with self.progress_lock:
+            self.progress = 95  # Cleaning up
+        # Clean up to free memory
+        del model
+        del tokenizer
+        torch.cuda.empty_cache()
+        # Update progress
+        with self.progress_lock:
+            self.progress = 100  # Completed
+        return results
+    def get_current_progress(self):
+        """Get the current evaluation progress.
+        Returns:
+            tuple: (current_evaluation, progress_percentage)
+        """
+        with self.progress_lock:
+            return self.current_evaluation, self.progress
+    def _determine_task_type(self, dataset):
+        """Determine the task type based on dataset features.
+        Args:
+            dataset: HuggingFace dataset
+        Returns:
+            str: Task type
+        """
+        features = dataset.features
+        # Check for common feature patterns
+        if "question" in features and "answer" in features:
+            return "question-answering"
+        elif "code" in features or "solution" in features:
+            return "code-generation"
+        elif "label" in features or "class" in features:
+            return "classification"
+        elif "input" in features and "output" in features:
+            return "text-generation"
+        else:
+            return "general"
+    def _evaluate_text_generation(self, model, tokenizer, dataset):
+        """Evaluate a model on text generation tasks.
+        Args:
+            model: HuggingFace model
+            tokenizer: HuggingFace tokenizer
+            dataset: HuggingFace dataset
+        Returns:
+            dict: Evaluation results
+        """
+        # Set up generation pipeline
+        generator = pipeline(
+            "text-generation",
+            model=model,
+            tokenizer=tokenizer,
+            device="cpu"
+        )
+        # Sample a subset for evaluation (to keep runtime reasonable)
+        if len(dataset) > 100:
+            dataset = dataset.select(range(100))
+        # Track metrics
+        correct = 0
+        total = 0
+        generated_texts = []
+        # Process each example
+        for i, example in enumerate(dataset):
+            # Update progress based on completion percentage
+            with self.progress_lock:
+                self.progress = 40 + int((i / len(dataset)) * 50)
+            input_text = example.get("input", example.get("prompt", ""))
+            expected_output = example.get("output", example.get("target", ""))
+            if not input_text or not expected_output:
+                continue
+            # Generate text
+            generated = generator(
+                input_text,
+                max_length=100,
+                num_return_sequences=1
+            )
+            generated_text = generated[0]["generated_text"]
+            generated_texts.append(generated_text)
+            # Simple exact match check
+            if expected_output.strip() in generated_text:
+                correct += 1
+            total += 1
+        # Calculate metrics
+        accuracy = correct / total if total > 0 else 0
+        return {
+            "accuracy": accuracy,
+            "samples_evaluated": total,
+            "generated_samples": generated_texts[:5]  # Include a few samples
+        }
+    def _evaluate_question_answering(self, model, tokenizer, dataset):
+        """Evaluate a model on question answering tasks.
+        Args:
+            model: HuggingFace model
+            tokenizer: HuggingFace tokenizer
+            dataset: HuggingFace dataset
+        Returns:
+            dict: Evaluation results
+        """
+        # Set up QA pipeline
+        qa_pipeline = pipeline(
+            "question-answering",
+            model=model,
+            tokenizer=tokenizer,
+            device="cpu"
+        )
+        # Sample a subset for evaluation
+        if len(dataset) > 100:
+            dataset = dataset.select(range(100))
+        # Track metrics
+        exact_matches = 0
+        f1_scores = []
+        total = 0
+        # Process each example
+        for i, example in enumerate(dataset):
+            # Update progress based on completion percentage
+            with self.progress_lock:
+                self.progress = 40 + int((i / len(dataset)) * 50)
+            question = example.get("question", "")
+            context = example.get("context", "")
+            answer = example.get("answer", "")
+            if not question or not answer:
+                continue
+            # Get model prediction
+            if context:
+                result = qa_pipeline(question=question, context=context)
+            else:
+                # If no context provided, use the question as context
+                result = qa_pipeline(question=question, context=question)
+            predicted_answer = result["answer"]
+            # Calculate exact match
+            if predicted_answer.strip() == answer.strip():
+                exact_matches += 1
+            # Calculate F1 score
+            f1 = self._calculate_f1(answer, predicted_answer)
+            f1_scores.append(f1)
+            total += 1
+        # Calculate metrics
+        exact_match_accuracy = exact_matches / total if total > 0 else 0
+        avg_f1 = sum(f1_scores) / len(f1_scores) if f1_scores else 0
+        return {
+            "exact_match": exact_match_accuracy,
+            "f1": avg_f1,
+            "samples_evaluated": total
+        }
+    def _evaluate_classification(self, model, tokenizer, dataset):
+        """Evaluate a model on classification tasks.
+        Args:
+            model: HuggingFace model
+            tokenizer: HuggingFace tokenizer
+            dataset: HuggingFace dataset
+        Returns:
+            dict: Evaluation results
+        """
+        # Set up classification pipeline
+        classifier = pipeline(
+            "text-classification",
+            model=model,
+            tokenizer=tokenizer,
+            device="cpu"
+        )
+        # Sample a subset for evaluation
+        if len(dataset) > 100:
+            dataset = dataset.select(range(100))
+        # Track metrics
+        correct = 0
+        total = 0
+        # Process each example
+        for i, example in enumerate(dataset):
+            # Update progress based on completion percentage
+            with self.progress_lock:
+                self.progress = 40 + int((i / len(dataset)) * 50)
+            text = example.get("text", example.get("sentence", ""))
+            label = str(example.get("label", example.get("class", "")))
+            if not text or not label:
+                continue
+            # Get model prediction
+            result = classifier(text)
+            predicted_label = result[0]["label"]
+            # Check if correct
+            if str(predicted_label) == label:
+                correct += 1
+            total += 1
+        # Calculate metrics
+        accuracy = correct / total if total > 0 else 0
+        return {
+            "accuracy": accuracy,
+            "samples_evaluated": total
+        }
+    def _evaluate_code_generation(self, model, tokenizer, dataset):
+        """Evaluate a model on code generation tasks.
+        Args:
+            model: HuggingFace model
+            tokenizer: HuggingFace tokenizer
+            dataset: HuggingFace dataset
+        Returns:
+            dict: Evaluation results
+        """
+        # Set up generation pipeline
+        generator = pipeline(
+            "text-generation",
+            model=model,
+            tokenizer=tokenizer,
+            device="cpu"
+        )
+        # Sample a subset for evaluation
+        if len(dataset) > 50:  # Smaller sample for code tasks
+            dataset = dataset.select(range(50))
+        # Track metrics
+        exact_matches = 0
+        functional_matches = 0
+        total = 0
+        # Process each example
+        for i, example in enumerate(dataset):
+            # Update progress based on completion percentage
+            with self.progress_lock:
+                self.progress = 40 + int((i / len(dataset)) * 50)
+            prompt = example.get("prompt", example.get("input", ""))
+            solution = example.get("solution", example.get("output", ""))
+            if not prompt or not solution:
+                continue
+            # Generate code
+            generated = generator(
+                prompt,
+                max_length=200,
+                num_return_sequences=1
+            )
+            generated_code = generated[0]["generated_text"]
+            # Extract code from generated text (remove prompt)
+            if prompt in generated_code:
+                generated_code = generated_code[len(prompt):].strip()
+            # Check exact match
+            if generated_code.strip() == solution.strip():
+                exact_matches += 1
+                functional_matches += 1
+            else:
+                # We would ideally check functional correctness here
+                # but that requires executing code which is complex and potentially unsafe
+                # For now, we'll use a simple heuristic
+                if len(generated_code) > 0 and any(keyword in generated_code for keyword in ["def ", "function", "return", "class"]):
+                    functional_matches += 0.5  # Partial credit
+            total += 1
+        # Calculate metrics
+        exact_match_rate = exact_matches / total if total > 0 else 0
+        functional_correctness = functional_matches / total if total > 0 else 0
+        return {
+            "exact_match": exact_match_rate,
+            "functional_correctness": functional_correctness,
+            "samples_evaluated": total
+        }
+    def _evaluate_general(self, model, tokenizer, dataset):
+        """General evaluation for any dataset type.
+        Args:
+            model: HuggingFace model
+            tokenizer: HuggingFace tokenizer
+            dataset: HuggingFace dataset
+        Returns:
+            dict: Evaluation results
+        """
+        # Set up generation pipeline
+        generator = pipeline(
+            "text-generation",
+            model=model,
+            tokenizer=tokenizer,
+            device="cpu"
+        )
+        # Sample a subset for evaluation
+        if len(dataset) > 50:
+            dataset = dataset.select(range(50))
+        # Find input and output fields
+        features = dataset.features
+        input_field = None
+        output_field = None
+        for field in features:
+            if field.lower() in ["input", "prompt", "question", "text"]:
+                input_field = field
+            elif field.lower() in ["output", "target", "answer", "response"]:
+                output_field = field
+        if not input_field:
+            # Just use the first string field as input
+            for field in features:
+                if isinstance(features[field], (str, list)):
+                    input_field = field
+                    break
+        # Track metrics
+        total = 0
+        generated_texts = []
+        # Process each example
+        for i, example in enumerate(dataset):
+            # Update progress based on completion percentage
+            with self.progress_lock:
+                self.progress = 40 + int((i / len(dataset)) * 50)
+            if input_field and input_field in example:
+                input_text = str(example[input_field])
+                # Generate text
+                generated = generator(
+                    input_text,
+                    max_length=100,
+                    num_return_sequences=1
+                )
+                generated_text = generated[0]["generated_text"]
+                generated_texts.append({
+                    "input": input_text,
+                    "output": generated_text,
+                    "expected": str(example[output_field]) if output_field and output_field in example else "N/A"
+                })
+                total += 1
+        return {
+            "samples_evaluated": total,
+            "generated_samples": generated_texts[:5]  # Include a few samples
+        }
+    def _calculate_f1(self, answer, prediction):
+        """Calculate F1 score between answer and prediction.
+        Args:
+            answer: Ground truth answer
+            prediction: Model prediction
+        Returns:
+            float: F1 score
+        """
+        # Tokenize
+        answer_tokens = answer.lower().split()
+        prediction_tokens = prediction.lower().split()
+        # Calculate precision and recall
+        common_tokens = set(answer_tokens) & set(prediction_tokens)
+        if not common_tokens:
+            return 0.0
+        precision = len(common_tokens) / len(prediction_tokens)
+        recall = len(common_tokens) / len(answer_tokens)
+        # Calculate F1
+        if precision + recall == 0:
+            return 0.0
+        f1 = 2 * precision * recall / (precision + recall)
+        return f1
+    def _calculate_overall_score(self, results):
+        """Calculate an overall score from evaluation results.
+        Args:
+            results: Evaluation results dictionary
+        Returns:
+            float: Overall score between 0 and 100
+        """
+        score = 0.0
+        # Check for common metrics and weight them
+        if "accuracy" in results:
+            score += results["accuracy"] * 100
+        if "exact_match" in results:
+            score += results["exact_match"] * 100
+        if "f1" in results:
+            score += results["f1"] * 100
+        if "functional_correctness" in results:
+            score += results["functional_correctness"] * 100
+        # If multiple metrics were found, average them
+        num_metrics = sum(1 for metric in ["accuracy", "exact_match", "f1", "functional_correctness"] if metric in results)
+        if num_metrics > 0:
+            score /= num_metrics
+        else:
+            # Default score if no metrics available
+            score = 50.0
+        return score
+    def submit_evaluation(self, model_id, benchmark_id, user_id, priority=0):
+        """Submit a model for evaluation on a benchmark.
+        Args:
+            model_id: Model ID in the database
+            benchmark_id: Benchmark ID in the database
+            user_id: User ID submitting the evaluation
+            priority: Queue priority (higher = higher priority)
+        Returns:
+            int: Evaluation ID if successful, None otherwise
+        """
+        # Check if user can submit today
+        if not self.auth_manager.can_submit_benchmark(user_id):
+            return None, "Daily submission limit reached. Try again tomorrow."
+        try:
+            # Add evaluation to database and queue
+            evaluation_id = self.db_manager.add_evaluation(
+                model_id=model_id,
+                benchmark_id=benchmark_id,
+                priority=priority
+            )
+            # Update user's last submission date
+            self.auth_manager.update_submission_date(user_id)
+            # Make sure worker is running
+            self.start_worker()
+            return evaluation_id, "Evaluation submitted successfully."
+        except Exception as e:
+            print(f"Submit evaluation error: {e}")
+            return None, f"Failed to submit evaluation: {str(e)}"
+    def get_queue_status(self):
+        """Get the current status of the evaluation queue.
+        Returns:
+            dict: Queue status information
+        """
+        try:
+            # Get evaluations from database
+            pending_evals = self.db_manager.get_evaluation_results(status="pending")
+            running_evals = self.db_manager.get_evaluation_results(status="running")
+            completed_evals = self.db_manager.get_evaluation_results(status="completed")
+            failed_evals = self.db_manager.get_evaluation_results(status="failed")
+            # Get current evaluation progress
+            current_eval, progress = self.get_current_progress()
+            return {
+                "pending": len(pending_evals),
+                "running": len(running_evals),
+                "completed": len(completed_evals),
+                "failed": len(failed_evals),
+                "is_processing": self.is_processing,
+                "current_evaluation": current_eval,
+                "progress": progress
+            }
+        except Exception as e:
+            print(f"Queue status error: {e}")
+            return {
+                "pending": 0,
+                "running": 0,
+                "completed": 0,
+                "failed": 0,
+                "is_processing": self.is_processing,
+                "current_evaluation": None,
+                "progress": 0,
+                "error": str(e)
+            }
+# Model submission UI components
+def create_model_submission_ui(evaluation_queue, auth_manager, db_manager):
+    """Create the model submission UI components.
+    Args:
+        evaluation_queue: Evaluation queue instance
+        auth_manager: Authentication manager instance
+        db_manager: Database manager instance
+    Returns:
+        gr.Blocks: Gradio Blocks component with model submission UI
+    """
+    with gr.Blocks() as submission_ui:
+        with gr.Tab("Submit Model"):
+            with gr.Row():
+                with gr.Column(scale=2):
+                    model_id_input = gr.Textbox(
+                        placeholder="HuggingFace model ID (e.g., 'gpt2', 'facebook/opt-350m')",
+                        label="Model ID"
+                    )
+                    model_name_input = gr.Textbox(
+                        placeholder="Display name for your model",
+                        label="Model Name"
+                    )
+                    model_description_input = gr.Textbox(
+                        placeholder="Brief description of your model",
+                        label="Description",
+                        lines=3
+                    )
+                    model_parameters_input = gr.Number(
+                        label="Number of Parameters (billions)",
+                        precision=2
+                    )
+                with gr.Column(scale=1):
+                    model_tag_input = gr.Dropdown(
+                        choices=evaluation_queue.model_tags,
+                        label="Model Tag",
+                        info="Select one category that best describes your model"
+                    )
+                    benchmark_dropdown = gr.Dropdown(
+                        label="Benchmark",
+                        info="Select a benchmark to evaluate your model on"
+                    )
+                    refresh_benchmarks_button = gr.Button("Refresh Benchmarks")
+            submit_model_button = gr.Button("Submit for Evaluation")
+            submission_status = gr.Markdown("")
+        with gr.Tab("Evaluation Queue"):
+            refresh_queue_button = gr.Button("Refresh Queue")
+            with gr.Row():
+                with gr.Column(scale=1):
+                    queue_stats = gr.JSON(
+                        label="Queue Statistics"
+                    )
+                with gr.Column(scale=2):
+                    queue_status = gr.Dataframe(
+                        headers=["ID", "Model", "Benchmark", "Status", "Submitted"],
+                        label="Recent Evaluations"
+                    )
+            with gr.Row(visible=True) as progress_container:
+                with gr.Column():
+                    current_eval_info = gr.Markdown("No evaluation currently running")
+                    # Use a simple text display for progress instead of Progress component
+                    progress_display = gr.Markdown("Progress: 0%")
+            # Function to update progress display
+            def update_progress_display():
+                current_eval, progress = evaluation_queue.get_current_progress()
+                if current_eval:
+                    model_info = db_manager.get_model(current_eval['model_id'])
+                    benchmark_info = db_manager.get_benchmark(current_eval['benchmark_id'])
+                    if model_info and benchmark_info:
+                        eval_info = f"**Currently Evaluating:** {model_info['name']} on {benchmark_info['name']}"
+                        progress_text = f"Progress: {progress}%"
+                        return eval_info, progress_text
+                return "No evaluation currently running", "Progress: 0%"
+        # Event handlers
+        def refresh_benchmarks_handler():
+            benchmarks = db_manager.get_benchmarks()
+            # Format for dropdown
+            choices = [(b["id"], b["name"]) for b in benchmarks]
+            return gr.update(choices=choices)
+        def submit_model_handler(model_id, model_name, model_description, model_parameters, model_tag, benchmark_id, request: gr.Request):
+            # Check if user is logged in
+            user = auth_manager.check_login(request)
+            if not user:
+                return "Please log in to submit a model."
+            if not model_id or not model_name or not model_tag or not benchmark_id:
+                return "Please fill in all required fields."
+            try:
+                # Add model to database
+                model_db_id = db_manager.add_model(
+                    name=model_name,
+                    hf_model_id=model_id,
+                    user_id=user["id"],
+                    tag=model_tag,
+                    parameters=str(model_parameters) if model_parameters else None,
+                    description=model_description
+                )
+                if not model_db_id:
+                    return "Failed to add model to database."
+                # Submit for evaluation
+                eval_id, message = evaluation_queue.submit_evaluation(
+                    model_id=model_db_id,
+                    benchmark_id=benchmark_id,
+                    user_id=user["id"]
+                )
+                if eval_id:
+                    return f"Model submitted successfully. Evaluation ID: {eval_id}"
+                else:
+                    return message
+            except Exception as e:
+                return f"Error submitting model: {str(e)}"
+        def refresh_queue_handler():
+            # Get queue statistics
+            stats = evaluation_queue.get_queue_status()
+            # Get recent evaluations
+            evals = db_manager.get_evaluation_results(limit=20)
+            # Format for dataframe
+            eval_data = []
+            for eval in evals:
+                eval_data.append([
+                    eval["id"],
+                    eval["model_name"],
+                    eval["benchmark_name"],
+                    eval["status"],
+                    eval["submitted_at"]
+                ])
+            # Also update progress display
+            current_eval, progress = evaluation_queue.get_current_progress()
+            if current_eval:
+                model_info = db_manager.get_model(current_eval['model_id'])
+                benchmark_info = db_manager.get_benchmark(current_eval['benchmark_id'])
+                if model_info and benchmark_info:
+                    eval_info = f"**Currently Evaluating:** {model_info['name']} on {benchmark_info['name']}"
+                    progress_text = f"Progress: {progress}%"
+                    return stats, eval_data, eval_info, progress_text
+            return stats, eval_data, "No evaluation currently running", "Progress: 0%"
+        # Connect event handlers
+        refresh_benchmarks_button.click(
+            fn=refresh_benchmarks_handler,
+            inputs=[],
+            outputs=[benchmark_dropdown]
+        )
+        submit_model_button.click(
+            fn=submit_model_handler,
+            inputs=[
+                model_id_input,
+                model_name_input,
+                model_description_input,
+                model_parameters_input,
+                model_tag_input,
+                benchmark_dropdown
+            ],
+            outputs=[submission_status]
+        )
+        refresh_queue_button.click(
+            fn=refresh_queue_handler,
+            inputs=[],
+            outputs=[queue_stats, queue_status, current_eval_info, progress_display]
+        )
+        # Initialize on load
+        submission_ui.load(
+            fn=refresh_benchmarks_handler,
+            inputs=[],
+            outputs=[benchmark_dropdown]
+        )
+        submission_ui.load(
+            fn=refresh_queue_handler,
+            inputs=[],
+            outputs=[queue_stats, queue_status, current_eval_info, progress_display]
+        )
+        # Manual refresh button with instructions
+        gr.Markdown("""
+        **Note:** Click the 'Refresh Queue' button periodically to update the progress display.
+        """)
+    return submission_ui

leaderboard.py ADDED Viewed

	@@ -0,0 +1,381 @@

+"""
+Leaderboard module for Dynamic Highscores system.
+This module implements the unified leaderboard with tag-based filtering
+for displaying all evaluated models.
+"""
+import os
+import json
+import pandas as pd
+import gradio as gr
+import plotly.express as px
+import plotly.graph_objects as go
+class Leaderboard:
+    """Manages the unified leaderboard with filtering capabilities."""
+    def __init__(self, db_manager):
+        """Initialize the leaderboard manager.
+        Args:
+            db_manager: Database manager instance
+        """
+        self.db_manager = db_manager
+        self.model_tags = ["All", "Merge", "Agent", "Reasoning", "Coding", "General", "Specialized", "Instruction", "Chat"]
+        # Define color scheme for tags
+        self.tag_colors = {
+            "Merge": "#FF6B6B",
+            "Agent": "#4ECDC4",
+            "Reasoning": "#FFD166",
+            "Coding": "#6B5B95",
+            "General": "#88D8B0",
+            "Specialized": "#FF8C42",
+            "Instruction": "#5D9CEC",
+            "Chat": "#AC92EB"
+        }
+    def get_leaderboard_data(self, tag=None, benchmark_id=None):
+        """Get leaderboard data, optionally filtered by tag or benchmark.
+        Args:
+            tag: Model tag to filter by (None for all)
+            benchmark_id: Benchmark ID to filter by (None for all)
+        Returns:
+            pd.DataFrame: Leaderboard data
+        """
+        # Get evaluation results from database
+        if tag and tag != "All":
+            df = self.db_manager.get_leaderboard_df(tag=tag)
+        else:
+            df = self.db_manager.get_leaderboard_df()
+        # Filter by benchmark if specified
+        if benchmark_id and not df.empty:
+            df = df[df['benchmark_id'] == benchmark_id]
+        return df
+    def format_leaderboard_for_display(self, df):
+        """Format leaderboard data for display.
+        Args:
+            df: Leaderboard DataFrame
+        Returns:
+            pd.DataFrame: Formatted leaderboard for display
+        """
+        if df.empty:
+            return pd.DataFrame()
+        # Select and rename columns for display
+        display_df = df[['model_name', 'benchmark_name', 'tag', 'score', 'completed_at']].copy()
+        display_df.columns = ['Model', 'Benchmark', 'Tag', 'Score', 'Completed']
+        # Round score to 2 decimal places
+        display_df['Score'] = display_df['Score'].round(2)
+        # Sort by score (descending)
+        display_df = display_df.sort_values('Score', ascending=False)
+        return display_df
+    def create_performance_chart(self, df, chart_type="bar"):
+        """Create a performance chart from leaderboard data.
+        Args:
+            df: Leaderboard DataFrame
+            chart_type: Type of chart to create ("bar" or "scatter")
+        Returns:
+            plotly.graph_objects.Figure: Performance chart
+        """
+        if df.empty:
+            # Return empty figure
+            fig = go.Figure()
+            fig.update_layout(
+                title="No data available",
+                xaxis_title="Model",
+                yaxis_title="Score"
+            )
+            return fig
+        # Prepare data for visualization
+        plot_df = df[['model_name', 'benchmark_name', 'tag', 'score']].copy()
+        plot_df.columns = ['Model', 'Benchmark', 'Tag', 'Score']
+        # Create chart based on type
+        if chart_type == "scatter":
+            fig = px.scatter(
+                plot_df,
+                x="Model",
+                y="Score",
+                color="Tag",
+                symbol="Benchmark",
+                size="Score",
+                hover_data=["Model", "Benchmark", "Score"],
+                color_discrete_map=self.tag_colors
+            )
+        else:  # Default to bar chart
+            fig = px.bar(
+                plot_df,
+                x="Model",
+                y="Score",
+                color="Tag",
+                barmode="group",
+                hover_data=["Model", "Benchmark", "Score"],
+                color_discrete_map=self.tag_colors
+            )
+        # Customize layout
+        fig.update_layout(
+            title="Model Performance Comparison",
+            xaxis_title="Model",
+            yaxis_title="Score",
+            legend_title="Tag",
+            font=dict(size=12)
+        )
+        return fig
+    def create_tag_distribution_chart(self, df):
+        """Create a chart showing distribution of models by tag.
+        Args:
+            df: Leaderboard DataFrame
+        Returns:
+            plotly.graph_objects.Figure: Tag distribution chart
+        """
+        if df.empty:
+            # Return empty figure
+            fig = go.Figure()
+            fig.update_layout(
+                title="No data available",
+                xaxis_title="Tag",
+                yaxis_title="Count"
+            )
+            return fig
+        # Count models by tag
+        tag_counts = df['tag'].value_counts().reset_index()
+        tag_counts.columns = ['Tag', 'Count']
+        # Create pie chart
+        fig = px.pie(
+            tag_counts,
+            names='Tag',
+            values='Count',
+            title='Model Distribution by Tag',
+            color='Tag',
+            color_discrete_map=self.tag_colors
+        )
+        # Customize layout
+        fig.update_layout(
+            font=dict(size=12)
+        )
+        return fig
+    def create_benchmark_comparison_chart(self, df):
+        """Create a chart comparing performance across benchmarks.
+        Args:
+            df: Leaderboard DataFrame
+        Returns:
+            plotly.graph_objects.Figure: Benchmark comparison chart
+        """
+        if df.empty:
+            # Return empty figure
+            fig = go.Figure()
+            fig.update_layout(
+                title="No data available",
+                xaxis_title="Benchmark",
+                yaxis_title="Average Score"
+            )
+            return fig
+        # Calculate average score by benchmark
+        benchmark_avg = df.groupby('benchmark_name')['score'].mean().reset_index()
+        benchmark_avg.columns = ['Benchmark', 'Average Score']
+        # Create bar chart
+        fig = px.bar(
+            benchmark_avg,
+            x='Benchmark',
+            y='Average Score',
+            title='Average Performance by Benchmark',
+            color='Benchmark'
+        )
+        # Customize layout
+        fig.update_layout(
+            xaxis_title="Benchmark",
+            yaxis_title="Average Score",
+            font=dict(size=12)
+        )
+        return fig
+# Leaderboard UI components
+def create_leaderboard_ui(leaderboard, db_manager):
+    """Create the leaderboard UI components.
+    Args:
+        leaderboard: Leaderboard instance
+        db_manager: Database manager instance
+    Returns:
+        gr.Blocks: Gradio Blocks component with leaderboard UI
+    """
+    with gr.Blocks() as leaderboard_ui:
+        gr.Markdown("# Dynamic Highscores Leaderboard")
+        with gr.Row():
+            with gr.Column(scale=1):
+                tag_filter = gr.Dropdown(
+                    choices=leaderboard.model_tags,
+                    value="All",
+                    label="Filter by Tag"
+                )
+                benchmark_filter = gr.Dropdown(
+                    choices=[("all", "All Benchmarks")],
+                    value="all",
+                    label="Filter by Benchmark"
+                )
+                refresh_button = gr.Button("Refresh Leaderboard")
+            with gr.Column(scale=2):
+                chart_type = gr.Radio(
+                    choices=["bar", "scatter"],
+                    value="bar",
+                    label="Chart Type"
+                )
+                view_type = gr.Radio(
+                    choices=["Table", "Chart", "Dashboard"],
+                    value="Table",
+                    label="View Type"
+                )
+        # Table view
+        leaderboard_table = gr.Dataframe(
+            headers=["Model", "Benchmark", "Tag", "Score", "Completed"],
+            label="Leaderboard",
+            visible=True
+        )
+        # Chart view
+        with gr.Row(visible=False) as chart_view:
+            performance_chart = gr.Plot(label="Performance Chart")
+        # Dashboard view
+        with gr.Row(visible=False) as dashboard_view:
+            with gr.Column(scale=2):
+                dashboard_performance_chart = gr.Plot(label="Performance Comparison")
+            with gr.Column(scale=1):
+                with gr.Row():
+                    tag_distribution_chart = gr.Plot(label="Model Distribution")
+                with gr.Row():
+                    benchmark_comparison_chart = gr.Plot(label="Benchmark Comparison")
+        # Event handlers
+        def refresh_benchmarks():
+            benchmarks = db_manager.get_benchmarks()
+            # Format for dropdown
+            choices = [("all", "All Benchmarks")]
+            choices.extend([(str(b["id"]), b["name"]) for b in benchmarks])
+            return gr.update(choices=choices)
+        def update_leaderboard(tag, benchmark_id, chart_type_val, view_type_val):
+            # Get leaderboard data
+            if benchmark_id == "all":
+                benchmark_id = None
+            else:
+                benchmark_id = int(benchmark_id)
+            df = leaderboard.get_leaderboard_data(tag=tag, benchmark_id=benchmark_id)
+            # Format for display
+            display_df = leaderboard.format_leaderboard_for_display(df)
+            # Create charts
+            perf_chart = leaderboard.create_performance_chart(df, chart_type=chart_type_val)
+            tag_chart = leaderboard.create_tag_distribution_chart(df)
+            benchmark_chart = leaderboard.create_benchmark_comparison_chart(df)
+            # Update visibility based on view type
+            table_visible = view_type_val == "Table"
+            chart_visible = view_type_val == "Chart"
+            dashboard_visible = view_type_val == "Dashboard"
+            return (
+                display_df,
+                perf_chart,
+                perf_chart,  # Same chart for both views
+                tag_chart,
+                benchmark_chart,
+                gr.update(visible=table_visible),
+                gr.update(visible=chart_visible),
+                gr.update(visible=dashboard_visible)
+            )
+        # Connect event handlers
+        refresh_button.click(
+            fn=lambda tag, benchmark, chart_t, view_t: update_leaderboard(tag, benchmark, chart_t, view_t),
+            inputs=[tag_filter, benchmark_filter, chart_type, view_type],
+            outputs=[
+                leaderboard_table,
+                performance_chart,
+                dashboard_performance_chart,
+                tag_distribution_chart,
+                benchmark_comparison_chart,
+                leaderboard_table,
+                chart_view,
+                dashboard_view
+            ]
+        )
+        view_type.change(
+            fn=lambda view_t: (
+                gr.update(visible=view_t == "Table"),
+                gr.update(visible=view_t == "Chart"),
+                gr.update(visible=view_t == "Dashboard")
+            ),
+            inputs=[view_type],
+            outputs=[leaderboard_table, chart_view, dashboard_view]
+        )
+        # Initialize on load
+        leaderboard_ui.load(
+            fn=refresh_benchmarks,
+            inputs=[],
+            outputs=[benchmark_filter]
+        )
+        leaderboard_ui.load(
+            fn=lambda: update_leaderboard("All", "all", "bar", "Table"),
+            inputs=[],
+            outputs=[
+                leaderboard_table,
+                performance_chart,
+                dashboard_performance_chart,
+                tag_distribution_chart,
+                benchmark_comparison_chart,
+                leaderboard_table,
+                chart_view,
+                dashboard_view
+            ]
+        )
+    return leaderboard_ui