Spaces:

Enderchef
/

SuperBench-Eval

Running on Zero

File size: 32,917 Bytes

import os
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from datasets import load_dataset, get_dataset_config_names
import torch
import re
import json
import pandas as pd
import matplotlib.pyplot as plt
import traceback # Import traceback for detailed error logging
import spaces # Import the spaces library

# Cache of text-generation pipelines keyed by model ID; load_model() consults
# it first so an already-loaded model is not loaded again.
model_cache = {}

# Hugging Face access token taken from the environment (None when unset).
# It is forwarded to every dataset/model loading call in this file.
HF_TOKEN = os.environ.get("HF_TOKEN")

# --- Constants for Benchmarks ---
# Dataset identifiers passed to load_dataset / get_dataset_config_names.
MMLU_DATASET = "cais/mmlu"
MMLU_PRO_DATASET = "TIGER-Lab/MMLU-Pro"

def get_all_benchmark_options():
    """
    Collect the selectable subjects for every supported benchmark.

    For each benchmark dataset the config names are fetched and prefixed with
    the synthetic "ALL" entry. If fetching fails, a warning is printed and the
    dataset is mapped to an empty list instead.

    Returns:
        tuple: (dict mapping dataset ID -> list of subjects,
                list with the MMLU subjects used as the initial dropdown choices)
    """
    # (dataset ID, text of the warning printed when its configs cannot be listed)
    benchmark_sources = (
        (MMLU_DATASET, "Warning: Could not load MMLU dataset configs."),
        (
            MMLU_PRO_DATASET,
            "Warning: Could not load MMLU-Pro dataset configs. It might not be accessible or available.",
        ),
    )

    subjects_by_dataset = {}
    for dataset_id, failure_notice in benchmark_sources:
        try:
            config_names = get_dataset_config_names(dataset_id, token=HF_TOKEN)
            subjects_by_dataset[dataset_id] = ["ALL"] + config_names
        except Exception as e:
            print(f"{failure_notice} Error: {e}")
            subjects_by_dataset[dataset_id] = []

    # The dropdown starts out showing the MMLU subjects; hand back a copy so
    # the caller's list is independent of the mapping's list.
    initial_dropdown_choices = list(subjects_by_dataset.get(MMLU_DATASET, []))

    return subjects_by_dataset, initial_dropdown_choices

# Initialize these once globally when the app starts.
# This call runs at import time: get_all_benchmark_options() queries the
# config names of both benchmark datasets and falls back to empty lists when
# a query fails.
ALL_BENCHMARK_SUBJECTS, INITIAL_GRADIO_DROPDOWN_OPTIONS = get_all_benchmark_options()

@spaces.GPU() # Decorator to ensure this function runs on GPU if available
def load_model(model_id):
    """
    Load a Hugging Face causal LM and wrap it in a text-generation pipeline.

    The pipeline is stored in ``model_cache`` under ``model_id`` so later calls
    for the same model return the cached object. Progress is reported to the
    user through Gradio info toasts.

    Args:
        model_id (str): Hub identifier of the model to load.

    Returns:
        The text-generation pipeline for ``model_id``.

    Raises:
        ValueError: If the tokenizer, model, or pipeline cannot be created.
            The original exception is attached as ``__cause__``.
    """
    gr.Info(f"Attempting to load model: {model_id}...")
    if model_id in model_cache:
        gr.Info(f"Model '{model_id}' already loaded from cache.")
        return model_cache[model_id]

    try:
        # Query CUDA once so dtype, placement and pipeline device always agree.
        use_cuda = torch.cuda.is_available()

        # bfloat16 on GPU for efficiency, float32 on CPU.
        tokenizer = AutoTokenizer.from_pretrained(model_id, token=HF_TOKEN, trust_remote_code=True)
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            token=HF_TOKEN,
            torch_dtype=torch.bfloat16 if use_cuda else torch.float32,
            trust_remote_code=True
        ).to("cuda" if use_cuda else "cpu")

        # Create a text-generation pipeline
        generator = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0 if use_cuda else -1)

        # Cache the loaded generator
        model_cache[model_id] = generator
        gr.Info(f"Model '{model_id}' loaded successfully.")
        return generator
    except Exception as e:
        # Re-raised for the caller's error handling; chained so the root cause
        # stays visible in the traceback.
        raise ValueError(
            f"Failed to load model '{model_id}'. Please verify the model ID and your Hugging Face token. Error: {e}"
        ) from e


def format_prompt(item):
    """
    Build the multiple-choice prompt for one question item.

    The prompt consists of the question, the first four choices labelled
    A-D on separate lines, and a trailing "Answer:" cue.

    Args:
        item: Mapping with 'question', 'choices' (at least four entries)
            and 'answer'.

    Returns:
        tuple: (prompt string, item['answer'] unchanged)
    """
    prompt_lines = [item['question']]
    for position, letter in enumerate("ABCD"):
        prompt_lines.append(f"{letter}. {item['choices'][position]}")
    prompt_lines.append("Answer:")
    return "\n".join(prompt_lines), item['answer']

# "Answer: X" where X is a complete token. The trailing \b stops the first
# letter of an ordinary word ("Answer: Both", "Answer: Depends") from being
# mistaken for a choice.
_ANSWER_LETTER_RE = re.compile(r"Answer:\s*([ABCD])\b", re.IGNORECASE)
# A capital A-D standing on its own anywhere in the text.
_STANDALONE_LETTER_RE = re.compile(r"\b([ABCD])\b")


def extract_choice_letter(output):
    """
    Extract the chosen answer letter (A, B, C or D) from model output.

    An explicit "Answer: X" statement (case-insensitive) takes priority.
    Otherwise the first standalone capital letter A-D is used.

    Args:
        output (str | None): Text produced by the model.

    Returns:
        str | None: The upper-case letter, or None when ``output`` is empty
        or contains no recognisable choice.
    """
    if not output:
        return None

    match = _ANSWER_LETTER_RE.search(output)
    if match:
        return match.group(1).upper()  # normalise "answer: b" to "B"

    match = _STANDALONE_LETTER_RE.search(output.strip())
    if match:
        return match.group(1)

    return None

def get_choice_letter(index):
    """Map a choice index 0-3 onto its letter A-D; any other index gives None."""
    if not (0 <= index <= 3):
        return None
    return "ABCD"[index]

def evaluate_single_subject(generator, dataset_id, subject, sample_count, progress):
    """
    Evaluate a text-generation pipeline on one subject of a benchmark dataset.

    The "test" split is shuffled with a fixed seed, truncated to at most
    ``sample_count`` items, and each item is answered with a single greedy
    token. The answer letter is extracted from the generated continuation
    only, never from the echoed prompt, whose "A." ... "D." option lines would
    otherwise always yield a letter.

    Args:
        generator: The Hugging Face pipeline for text generation.
        dataset_id (str): The ID of the dataset (e.g., "cais/mmlu", "TIGER-Lab/MMLU-Pro").
        subject (str): The specific subject/config name within the dataset.
        sample_count (int): The maximum number of samples to evaluate.
        progress (gr.Progress): Gradio progress tracker.

    Returns:
        tuple: (accuracy in percent, list of per-question result dicts)

    Raises:
        RuntimeError: If the dataset or its "test" split cannot be loaded.
            The original exception is attached as ``__cause__``.
    """
    gr.Info(f"Loading dataset: {dataset_id} - {subject}...")
    try:
        # Load the "test" split of the dataset
        dataset = load_dataset(dataset_id, subject, token=HF_TOKEN)["test"]
    except Exception as e:
        # Re-raised for the caller's error handling, keeping the root cause.
        raise RuntimeError(
            f"Failed to load dataset '{dataset_id}' for subject '{subject}'. Error: {e}"
        ) from e

    # Fixed seed so the same samples are chosen on every run. int() guards
    # against a UI control delivering the count as a float.
    num_samples_to_evaluate = min(int(sample_count), len(dataset))
    dataset = dataset.shuffle(seed=42).select(range(num_samples_to_evaluate))

    correct_count = 0
    subject_results = []

    for item in progress.tqdm(dataset, desc=f"Processing {subject} samples"):
        prompt, answer_idx = format_prompt(item)
        expected_letter = get_choice_letter(answer_idx)

        # Generate only 1 new token for the answer (A, B, C, D)
        # do_sample=False ensures deterministic output for a given prompt (greedy decoding)
        output_raw = generator(prompt, max_new_tokens=1, do_sample=False)[0]["generated_text"]

        # The generated text normally starts with the prompt itself. Judge only
        # what the model added; if the prompt is not echoed verbatim, fall back
        # to the full text.
        completion = output_raw[len(prompt):] if output_raw.startswith(prompt) else output_raw

        # Check for potential reasoning model output
        is_reasoning_model_output = (
            '<' in completion
            or re.search(r"\b(because|therefore|thus|reasoning)\b", completion, re.IGNORECASE) is not None
        )

        predicted_letter = extract_choice_letter(completion)

        is_correct = (predicted_letter == expected_letter)
        correct_count += is_correct

        # Store detailed results for logging and display
        subject_results.append({
            "question": item['question'],
            "choices": item['choices'],
            "model_raw_output": output_raw.strip(),
            "expected_answer_letter": expected_letter,
            "predicted_answer_letter": predicted_letter,
            "is_correct": is_correct,
            "is_reasoning_model_output": is_reasoning_model_output
        })

    # Calculate accuracy for the current subject
    accuracy = (correct_count / len(dataset)) * 100 if len(dataset) > 0 else 0
    return accuracy, subject_results

def _empty_evaluation_outputs():
    """Build the six outputs that clear the score and hide every optional panel."""
    return (
        "",
        gr.update(value="", visible=False),
        gr.update(visible=False),
        gr.update(visible=False),
        gr.update(visible=False),
        gr.update(value="", visible=False),
    )


def _format_evaluation_details(results):
    """Render the per-question result dicts as one Markdown-style text report."""
    report_sections = []
    for item in results:
        choice_lines = "\n".join(
            f"{get_choice_letter(i)}. {c}" for i, c in enumerate(item['choices'])
        )
        reasoning_note = (
            "**Note:** Reasoning models are currently not fully supported for single-letter extraction. The original model output followed:\n"
            if item.get('is_reasoning_model_output') else ""
        )
        report_sections.append(
            f"### Question:\n{item['question']}\n\n"
            f"**Choices:**\n{choice_lines}\n\n"
            f"{reasoning_note}"
            f"**Model Raw Output:** {item['model_raw_output']}\n"
            f"**Expected Answer:** {item['expected_answer_letter']}\n"
            f"**Predicted Answer:** {item['predicted_answer_letter']}\n"
            f"**Correct:** {'Yes' if item['is_correct'] else 'No'}"
        )
    return "\n\n".join(report_sections)


@spaces.GPU() # Decorator to ensure this function runs on GPU if available
def run_evaluation(model_id, benchmark_category, subject_name, sample_count, progress=gr.Progress()):
    """
    Run a benchmark evaluation and produce the updates for the result widgets.

    Evaluates either a single subject or, for ``subject_name == "ALL"``, every
    subject known for the selected benchmark. On success a summary record is
    appended to ``eval.jsonl``.

    Args:
        model_id (str): Hub identifier of the model to evaluate.
        benchmark_category (str): "MMLU" or "MMLU-Pro".
        subject_name (str): A subject/config name, or "ALL".
        sample_count (int): Maximum number of samples per subject.
        progress (gr.Progress): Gradio progress tracker.

    Returns:
        tuple: Six values, in this order:
            score text,
            update for the error text (the traceback on failure),
            update for the debug column,
            update for the "show details" button,
            update for the download button,
            update for the detailed results text.

    Exceptions raised while evaluating are caught and reported through the
    returned error-state updates, so nothing propagates to the caller.
    """
    gr.Info("Starting evaluation...")
    if not model_id:
        gr.Warning("Please enter a Hugging Face Model ID before running the evaluation.")
        return _empty_evaluation_outputs()

    dataset_id_map = {
        "MMLU": MMLU_DATASET,
        "MMLU-Pro": MMLU_PRO_DATASET
    }
    current_dataset_id = dataset_id_map.get(benchmark_category)

    if not current_dataset_id:
        # gr.Error only shows when raised; raising here would skip the UI
        # reset below, so a warning toast is used.
        gr.Warning(f"Unknown benchmark category selected: {benchmark_category}. This should not happen.")
        return _empty_evaluation_outputs()

    try:
        sample_count = int(sample_count)  # a UI control may deliver the count as a float
        generator = load_model(model_id)  # raises on failure

        all_evaluation_results = []
        total_correct_overall = 0
        total_samples_overall = 0
        eval_summary_lines = []

        if subject_name == "ALL":
            # Build a new list. Removing "ALL" in place would edit the shared
            # ALL_BENCHMARK_SUBJECTS entry and drop the option from the
            # dropdown after the first full run.
            # NOTE(review): every remaining config is evaluated as a subject;
            # confirm whether aggregate configs of a dataset should be skipped.
            subjects_to_evaluate = [
                sub for sub in ALL_BENCHMARK_SUBJECTS.get(current_dataset_id, []) if sub != "ALL"
            ]

            if not subjects_to_evaluate:
                gr.Warning(f"No subjects found to evaluate for '{benchmark_category}'.")
                return _empty_evaluation_outputs()

            for i, sub in enumerate(progress.tqdm(subjects_to_evaluate, desc=f"Evaluating ALL {benchmark_category} subjects")):
                gr.Info(f"Evaluating {benchmark_category} - {sub} ({i+1}/{len(subjects_to_evaluate)})...")
                try:
                    accuracy, subject_details = evaluate_single_subject(generator, current_dataset_id, sub, sample_count, progress)
                except Exception as e:
                    # Best effort: one failing subject must not abort the run.
                    gr.Warning(f"Skipping {benchmark_category} - {sub} due to an error: {e}")
                    eval_summary_lines.append(f"- {benchmark_category} - {sub}: Error during evaluation.")
                    continue

                all_evaluation_results.extend(subject_details)

                num_evaluated_samples = len(subject_details)
                num_correct_in_subject = sum(d['is_correct'] for d in subject_details)

                total_correct_overall += num_correct_in_subject
                total_samples_overall += num_evaluated_samples
                eval_summary_lines.append(f"- {benchmark_category} - {sub}: {accuracy:.2f}% ({num_correct_in_subject}/{num_evaluated_samples} samples)")

            overall_accuracy = (total_correct_overall / total_samples_overall) * 100 if total_samples_overall > 0 else 0
            score_string = f"Overall Average Accuracy for {benchmark_category}: {overall_accuracy:.2f}% across {total_samples_overall} total samples.\n\n"
            score_string += "Detailed breakdown:\n" + "\n".join(eval_summary_lines)

        else:
            accuracy, subject_details = evaluate_single_subject(generator, current_dataset_id, subject_name, sample_count, progress)
            all_evaluation_results.extend(subject_details)
            overall_accuracy = accuracy
            num_evaluated_samples = len(subject_details)
            score_string = f"Accuracy for {benchmark_category} - {subject_name}: {accuracy:.2f}% out of {num_evaluated_samples} samples."

        # Format detailed results for display in the text box
        formatted_details = _format_evaluation_details(all_evaluation_results)

        # Record the evaluation result to a JSONL file for the leaderboard
        record = {
            "model_id": model_id,
            "benchmark": benchmark_category,
            "subject": subject_name,
            "accuracy": overall_accuracy,
            "sample_count": total_samples_overall if subject_name == "ALL" else len(all_evaluation_results),
            "timestamp": pd.Timestamp.now().isoformat()
        }
        with open("eval.jsonl", "a", encoding="utf-8") as f:
            f.write(json.dumps(record) + "\n")

        gr.Info("Evaluation completed successfully!")
        return (
            score_string,
            gr.update(value="", visible=False),
            gr.update(visible=False),
            gr.update(visible=True),
            gr.update(visible=True),
            gr.update(value=formatted_details, visible=False),
        )

    except Exception:
        # Top-level boundary of the UI callback: put the traceback in the
        # debug panel instead of letting the exception escape.
        detailed_error_traceback = traceback.format_exc()
        gr.Warning("An error occurred during evaluation.")

        return (
            "Error occurred during evaluation. We'll evaluate for you if this persists - please open a community support tab for assistance.",
            gr.update(value=detailed_error_traceback, visible=True),
            gr.update(visible=True),
            gr.update(visible=False),
            gr.update(visible=False),
            gr.update(value="", visible=False),
        )

def save_text(text_content):
    """
    Write ``text_content`` to ``evaluation_results.txt`` for download.

    The file is written as UTF-8 so non-ASCII characters in the results do not
    depend on the platform's default encoding.

    Args:
        text_content (str): The text to save.

    Returns:
        str | None: Path of the written file, or None when there is nothing
        to save or writing failed (a warning toast is shown in both cases).
    """
    if not text_content:
        gr.Warning("No evaluation results to download.")
        return None

    file_path = "evaluation_results.txt"
    try:
        with open(file_path, "w", encoding="utf-8") as f:
            f.write(text_content)
    except Exception as e:
        # gr.Error only shows when raised; this callback reports the failure
        # and returns normally, so a warning toast is used.
        gr.Warning(f"Error saving file: {e}")
        return None
    return file_path

def load_leaderboard(benchmark_filter):
    """
    Build the leaderboard rows for one benchmark from ``eval.jsonl``.

    Records whose accuracy is not numeric are dropped. The remaining records
    of the selected benchmark are averaged per model and sorted from best to
    worst.

    Args:
        benchmark_filter (str): Benchmark name as stored in the records'
            'benchmark' field.

    Returns:
        list[dict]: One ``{"Model ID": ..., "Average Accuracy (%)": ...}`` row
        per model. The list is empty when the file is missing, unreadable, or
        has no matching data; a warning toast is shown in each of those cases.
    """
    leaderboard_columns = ["Model ID", "Average Accuracy (%)"]
    try:
        df = pd.read_json("eval.jsonl", lines=True)

        # Ensure 'accuracy' is numeric, coerce errors to NaN and drop them
        df['accuracy'] = pd.to_numeric(df['accuracy'], errors='coerce')
        df = df.dropna(subset=['accuracy'])

        if df.empty:
            gr.Warning("No valid evaluation data found to populate the leaderboard.")
            return []

        # Filter data based on the selected benchmark
        df_filtered = df[df['benchmark'] == benchmark_filter]

        if df_filtered.empty:
            gr.Warning(f"No evaluation data for {benchmark_filter} found yet.")
            return []

        # Average every recorded run of a model within this benchmark.
        df_grouped = df_filtered.groupby("model_id")["accuracy"].mean().reset_index()
        df_grouped.columns = leaderboard_columns
        df_sorted = df_grouped.sort_values(by="Average Accuracy (%)", ascending=False)

        return df_sorted.to_dict('records')

    except FileNotFoundError:
        gr.Warning("No evaluation data found yet. Run an evaluation to populate the leaderboard!")
        return []
    except Exception as e:
        # gr.Error only shows when raised; this callback still has to return
        # rows, so the failure is surfaced as a warning toast.
        gr.Warning(f"Error loading leaderboard: {e}")
        traceback.print_exc()  # full traceback for the server log
        return []

def update_subject_dropdown_choices(benchmark_category):
    """
    Produce the dropdown update for a newly selected benchmark category.

    The dropdown receives the subjects known for the category and is preset
    to "ALL" when that entry exists, else to the first subject, else to None.
    An unknown category clears the dropdown.
    """
    dataset_ids = {
        "MMLU": MMLU_DATASET,
        "MMLU-Pro": MMLU_PRO_DATASET
    }
    dataset_id = dataset_ids.get(benchmark_category)

    # Guard clause: nothing to offer for an unknown or unloaded benchmark.
    if not dataset_id or dataset_id not in ALL_BENCHMARK_SUBJECTS:
        return gr.update(choices=[], value=None)

    subjects = ALL_BENCHMARK_SUBJECTS[dataset_id]
    if "ALL" in subjects:
        preselected = "ALL"
    elif subjects:
        preselected = subjects[0]
    else:
        preselected = None
    return gr.update(choices=subjects, value=preselected)


# --- Gradio Interface Definition ---
# Root of the UI: a Blocks app carrying the custom stylesheet, followed by the
# page title. The Inter font is imported from the Google Fonts stylesheet host.
with gr.Blocks(css="""
    /* Import Google Font - Inter */
    @import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap');

    /* General body and container styling */
    body {
        font-family: 'Inter', sans-serif;
        background-color: #eef2f6; /* Lighter background */
        margin: 0;
        padding: 20px;
    }
    .gradio-container {
        max-width: 1200px;
        margin: 20px auto;
        padding: 40px; /* Increased padding */
        box-shadow: 0 10px 25px rgba(0,0,0,0.1); /* Softer, larger shadow */
        border-radius: 15px; /* More rounded corners */
        background-color: #ffffff;
        border: 1px solid #e0e6ed; /* Subtle border */
    }

    /* Headings */
    h1 {
        color: #1a202c; /* Darker, more professional heading color */
        text-align: center;
        margin-bottom: 30px;
        font-size: 2.8em; /* Slightly larger H1 */
        font-weight: 700;
        letter-spacing: -0.03em;
        text-shadow: 1px 1px 2px rgba(0,0,0,0.05); /* Subtle text shadow */
    }
    h3 {
        color: #2d3748;
        font-size: 1.3em; /* Slightly larger H3 */
        margin-bottom: 15px;
        font-weight: 600;
    }

    /* Markdown text */
    .markdown-text {
        text-align: center;
        color: #4a5568;
        line-height: 1.7;
        font-size: 1.05em;
        margin-bottom: 30px;
    }
    .markdown-text div {
        font-size: 1.1em;
        max-width: 800px; /* Constrain width for readability */
        margin: 0 auto;
    }

    /* Buttons */
    .gr-button {
        background-color: #2f80ed; /* A vibrant, professional blue */
        color: white;
        border: none;
        padding: 14px 30px; /* More padding */
        border-radius: 10px; /* More rounded */
        cursor: pointer;
        transition: background-color 0.3s ease, transform 0.2s ease, box-shadow 0.2s ease;
        font-size: 1.15em; /* Slightly larger font */
        font-weight: 600;
        box-shadow: 0 5px 15px rgba(0, 123, 255, 0.2); /* Enhanced shadow for primary button */
        margin: 5px; /* Add some margin for spacing between buttons */
    }
    .gr-button:hover {
        background-color: #1a6dcd; /* Darker blue on hover */
        transform: translateY(-3px); /* More pronounced lift effect */
        box-shadow: 0 8px 20px rgba(0, 123, 255, 0.3);
    }
    .gr-button:active {
        transform: translateY(0);
        box-shadow: 0 2px 5px rgba(0,0,0,0.1);
    }
    /* Specific button styling for debug/show details */
    #debug-button, #show-details-button {
        background-color: #718096; /* Professional grey */
        box-shadow: 0 3px 10px rgba(113, 128, 150, 0.2);
    }
    #debug-button:hover, #show-details-button:hover {
        background-color: #5d6d81;
        box-shadow: 0 5px 12px rgba(113, 128, 150, 0.3);
    }
    #download-button {
        background-color: #38a169; /* Muted green for download */
        box-shadow: 0 3px 10px rgba(56, 161, 105, 0.2);
    }
    #download-button:hover {
        background-color: #277e50;
        box-shadow: 0 5px 12px rgba(56, 161, 105, 0.3);
    }

    /* Input/Output Boxes (Containers) */
    .gr-box {
        border: 1px solid #cbd5e0; /* Lighter, subtle border */
        border-radius: 12px;
        padding: 25px; /* Increased padding */
        margin-bottom: 25px;
        background-color: #f8fafc; /* Very light background */
        box-shadow: inset 0 2px 5px rgba(0,0,0,0.03); /* Subtle inner shadow */
    }
    /* Specific text output boxes (the content inside the containers) */
    .gr-output-text {
        white-space: pre-wrap;
        word-wrap: break-word;
        background-color: #ffffff; /* White background for readability */
        border: 1px solid #e2e8f0;
        border-radius: 8px;
        padding: 18px; /* More padding */
        min-height: 120px; /* Ensure a minimum height */
        box-shadow: 0 2px 8px rgba(0,0,0,0.05); /* Small shadow for depth */
        color: #2d3748; /* Darker text for readability */
        font-size: 0.95em;
        line-height: 1.6;
    }
    /* Specific error output style */
    #error-message-output {
        background-color: #ffe0e6; /* Light red */
        border-color: #ff99aa; /* Slightly darker red border */
        color: #c53030; /* Stronger red text */
        font-weight: 500;
        padding: 20px;
    }


    /* Labels for inputs */
    .gr-textbox label, .gr-dropdown label, .gr-slider label {
        font-weight: 600;
        color: #2d3748; /* Darker label text */
        margin-bottom: 10px;
        display: block;
        font-size: 1.05em; /* Slightly larger label font */
    }

    /* Tabs styling */
    .gr-tabs-nav button {
        font-weight: 600;
        font-size: 1.1em;
        padding: 12px 25px; /* More padding for tabs */
        border-top-left-radius: 10px;
        border-top-right-radius: 10px;
        background-color: #ebf4f8; /* Light blueish tab background */
        color: #4a5568;
        border: 1px solid #cce0eb; /* Subtle border for tabs */
        border-bottom: none;
        transition: background-color 0.3s ease, color 0.3s ease;
    }
    .gr-tabs-nav button.selected {
        background-color: #ffffff; /* White for selected tab */
        color: #2f80ed; /* Blue for selected text */
        border-color: #2f80ed;
        border-bottom: 1px solid #ffffff; /* Hide bottom border to merge with content */
    }

    /* Leaderboard specific table styling (general for all leaderboard tables) */
    .leaderboard-table {
        border-radius: 12px;
        box-shadow: 0 4px 15px rgba(0,0,0,0.08);
        overflow: hidden;
        margin-bottom: 25px; /* Space between tables */
    }
    .leaderboard-table table {
        border-collapse: separate;
        border-spacing: 0;
        width: 100%;
        background-color: #ffffff;
    }
    .leaderboard-table thead th {
        background-color: #edf2f7; /* Light grey header */
        color: #2d3748;
        font-weight: 700;
        padding: 15px 20px;
        text-align: left;
        border-bottom: 2px solid #e2e8f0;
    }
    .leaderboard-table tbody tr {
        transition: background-color 0.2s ease;
    }
    .leaderboard-table tbody tr:nth-child(odd) {
        background-color: #f7fafc; /* Zebra striping */
    }
    .leaderboard-table tbody tr:hover {
        background-color: #e6fffa; /* Light teal on hover for rows */
    }
    .leaderboard-table tbody td {
        padding: 12px 20px;
        border-bottom: 1px solid #ebf4f8;
        color: #4a5568;
    }
    .leaderboard-table tbody tr:last-child td {
        border-bottom: none;
    }
    .leaderboard-table tbody tr:first-child td {
        border-top-left-radius: 12px;
        border-top-right-radius: 12px;
    }
    .leaderboard-table tbody tr:last-child td {
        border-bottom-left-radius: 12px;
        border-bottom-right-radius: 12px;
    }

    /* Radio button group for leaderboard */
    #leaderboard-toggle.gr-form {
        display: flex;
        justify-content: center;
        padding: 0px 0px 20px 0px; /* Reduced padding for more compact look */
    }
    #leaderboard-toggle label.gr-radio-label {
        font-size: 1.1em;
        font-weight: 600;
        color: #2d3748;
        padding: 10px 20px;
        border-radius: 8px;
        background-color: #edf2f7; /* Light background for unselected */
        border: 1px solid #e2e8f0;
        cursor: pointer;
        transition: all 0.3s ease;
        margin: 0 5px; /* Spacing between radio buttons */
    }
    #leaderboard-toggle input[type="radio"]:checked + label.gr-radio-label {
        background-color: #2f80ed; /* Blue for selected */
        color: white;
        border-color: #2f80ed;
        box-shadow: 0 3px 10px rgba(47, 128, 237, 0.3);
    }
    #leaderboard-toggle input[type="radio"]:checked + label.gr-radio-label:hover {
        background-color: #1a6dcd; /* Darker blue on hover */
    }
    #leaderboard-toggle label.gr-radio-label:hover {
        background-color: #e2e8f0; /* Lighter grey on hover */
    }

    /* Radio button group for evaluation benchmark selection */
    #eval-benchmark-selection {
        display: flex;
        justify-content: center;
        margin-bottom: 20px; /* Space above dropdown */
    }
    #eval-benchmark-selection label.gr-radio-label {
        font-size: 1.05em;
        font-weight: 500;
        color: #4a5568;
        padding: 8px 15px;
        border-radius: 6px;
        background-color: #f0f4f7;
        border: 1px solid #d9e3ed;
        cursor: pointer;
        transition: all 0.3s ease;
        margin: 0 5px;
    }
    #eval-benchmark-selection input[type="radio"]:checked + label.gr-radio-label {
        background-color: #48bb78; /* A pleasant green for evaluation selection */
        color: white;
        border-color: #48bb78;
        box-shadow: 0 2px 8px rgba(72, 187, 120, 0.2);
    }
    #eval-benchmark-selection input[type="radio"]:checked + label.gr-radio-label:hover {
        background-color: #38a169;
    }
    #eval-benchmark-selection label.gr-radio-label:hover {
        background-color: #e5edf2;
    }


""") as demo:
    gr.Markdown("""
    # 🤖 LLM Benchmark Evaluator
    """)

    with gr.Tabs():
        # "Run Evaluation" tab: inputs, result widgets and their event wiring.
        # Every component used as an event input/output is created here before
        # the handlers are registered.
        with gr.TabItem("🚀 Run Evaluation"):
            gr.Markdown("""
            <div class="markdown-text">
                Enter your Hugging Face Model ID, choose a benchmark (MMLU or MMLU-Pro),
                select a subject (or 'ALL' for a comprehensive evaluation),
                and specify the number of samples per subject.
                Ensure your Hugging Face token is set as an environment variable for private models.
            </div>
            """)

            with gr.Column(elem_classes="gr-box"):
                model_id_input = gr.Textbox(
                    label="Your Hugging Face Model ID",
                    placeholder="e.g., ICONNAI/ICONN-1-Mini-Beta",
                    interactive=True
                )

                # Radio button for benchmark selection for evaluation
                benchmark_selection_radio = gr.Radio(
                    ["MMLU"],
                    label="Select Benchmark Type",
                    value="MMLU", # Default selection
                    interactive=True,
                    container=False, # Important for custom styling placement
                    elem_id="eval-benchmark-selection"
                )

                with gr.Row():
                    # NOTE(review): the lowercase "all" default is not the
                    # synthetic "ALL" entry; presumably it names a dataset
                    # config of that name. Confirm against the loaded choices.
                    benchmark_subject_dropdown = gr.Dropdown(
                        label="Choose Subject",
                        choices=INITIAL_GRADIO_DROPDOWN_OPTIONS, # Initial choices (MMLU subjects)
                        value="all",
                        interactive=True,
                        min_width=400,
                        visible=False
                    )
                    sample_count_slider = gr.Slider(
                        label="Number of Samples per Subject (1-100)",
                        minimum=1,
                        maximum=100,
                        value=100,
                        step=1,
                        interactive=True,
                        min_width=200,
                        visible=False
                    )
                run_button = gr.Button("Run Evaluation", elem_classes="gr-button")

            gr.Markdown("<hr>") # Visual separator

            with gr.Column(elem_classes="gr-box"):
                acc_output = gr.Textbox(
                    label="Benchmark Accuracy Results",
                    interactive=False,
                    elem_classes="gr-output-text",
                    lines=5,
                    placeholder="Evaluation results will appear here."
                )

                # Success-state controls; run_evaluation reveals them once a
                # run has produced results.
                with gr.Row():
                    show_details_button = gr.Button(
                        "📋 Show / Hide Details",
                        elem_id="show-details-button",
                        elem_classes="gr-button",
                        visible=False
                    )
                    download_button = gr.Button(
                        "⬇️ Download Results",
                        elem_id="download-button",
                        elem_classes="gr-button",
                        visible=False
                    )
                detail_output = gr.Textbox(
                    label="Detailed Evaluation Results",
                    interactive=False,
                    elem_classes="gr-output-text",
                    lines=20,
                    visible=False
                )
                download_file_output = gr.File(
                    label="Download Evaluation Results",
                    file_count="single",
                    type="filepath"
                )

            # Error-state controls; run_evaluation fills and reveals them when
            # an evaluation fails.
            debug_button = gr.Button(
                "🐛 Show Debug Info",
                elem_id="debug-button",
                elem_classes="gr-button"
            )
            with gr.Column(visible=False) as debug_error_column:
                error_message_output = gr.Textbox(
                    label="Debug Information",
                    interactive=False,
                    elem_id="error-message-output",
                    lines=10,
                    visible=False
                )

            # An event handler receives component values, not visibility, so
            # the current visibility of each toggled panel is tracked here.
            details_visible_state = gr.State(False)
            debug_visible_state = gr.State(False)

            def _debug_button_label(is_visible):
                """Return the debug button caption matching the panel state."""
                return "🐛 Hide Debug Info" if is_visible else "🐛 Show Debug Info"

            def _sync_toggle_state(error_text):
                """Re-align the visibility flags with what run_evaluation just set."""
                debug_visible = bool(error_text)
                return False, debug_visible, gr.update(value=_debug_button_label(debug_visible))

            def _toggle_details(is_visible):
                """Flip the visibility of the detailed results."""
                now_visible = not is_visible
                return gr.update(visible=now_visible), now_visible

            def _toggle_debug(is_visible):
                """Flip the visibility of the debug panel and relabel the button."""
                now_visible = not is_visible
                return (
                    gr.update(visible=now_visible),
                    gr.update(value=_debug_button_label(now_visible)),
                    now_visible,
                )

            # Define button click actions
            run_event = run_button.click(
                run_evaluation,
                inputs=[model_id_input, benchmark_selection_radio, benchmark_subject_dropdown, sample_count_slider],
                outputs=[
                    acc_output,
                    error_message_output, debug_error_column, # For error state
                    show_details_button, download_button, detail_output # For success state
                ]
            )
            # run_evaluation hides the details and shows the debug panel only
            # on failure; mirror that in the tracked flags.
            run_event.then(
                _sync_toggle_state,
                inputs=[error_message_output],
                outputs=[details_visible_state, debug_visible_state, debug_button]
            )

            # Link benchmark selection radio to subject dropdown
            benchmark_selection_radio.change(
                update_subject_dropdown_choices,
                inputs=[benchmark_selection_radio],
                outputs=[benchmark_subject_dropdown]
            )

            # Toggle visibility of detail_output
            show_details_button.click(
                _toggle_details,
                inputs=[details_visible_state],
                outputs=[detail_output, details_visible_state]
            )

            # Toggle visibility of the debug column and update the button text
            debug_button.click(
                _toggle_debug,
                inputs=[debug_visible_state],
                outputs=[debug_error_column, debug_button, debug_visible_state]
            )

            download_button.click(
                save_text,
                inputs=[detail_output],
                outputs=[download_file_output]
            )

        # "Leaderboard" tab: benchmark selector plus a table that is filled on
        # page load and refreshed whenever the selector changes.
        with gr.TabItem("📊 Leaderboard"):
            gr.Markdown("""
            <div class="markdown-text">
                Explore the performance of various LLMs on the MMLU and MMLU-Pro benchmarks.
                This leaderboard is updated automatically with each new evaluation.
            </div>
            """)

            # Leaderboard Type Toggle
            leaderboard_type_toggle = gr.Radio(
                ["MMLU", "MMLU-Pro"],
                label="Select Benchmark for Leaderboard",
                value="MMLU", # Default to MMLU
                interactive=True,
                container=False, # Make it inline with content
                elem_id="leaderboard-toggle"
            )

            # Leaderboard Table
            leaderboard_table_output = gr.Dataframe(
                headers=["Model ID", "Average Accuracy (%)"],
                interactive=False,
                datatype=["str", "number"],
                row_count=10,
                col_count=2,
                label="Benchmark Leaderboard Data",
                elem_classes="leaderboard-table" # Apply custom class for styling
            )

            def _leaderboard_as_frame(benchmark_filter):
                """Return the leaderboard rows as a two-column DataFrame.

                load_leaderboard() yields a list of row dicts; the table is
                given a DataFrame so the rows map onto its two columns.
                """
                return pd.DataFrame(
                    load_leaderboard(benchmark_filter),
                    columns=["Model ID", "Average Accuracy (%)"]
                )

            # Initial load and dynamic update for the leaderboard
            demo.load(_leaderboard_as_frame, inputs=[leaderboard_type_toggle], outputs=[leaderboard_table_output])
            leaderboard_type_toggle.change(_leaderboard_as_frame, inputs=[leaderboard_type_toggle], outputs=[leaderboard_table_output])

# Launch the Gradio app.
# Executed unconditionally at module level with no arguments, so Gradio's
# default launch settings apply.
demo.launch()