Spaces:

MBZUAI-LLM
/

Mobile-MMLU-Challenge

Runtime error

File size: 12,872 Bytes


# # demo.launch()
# import gradio as gr
# import pandas as pd
# import os
# import re
# from datetime import datetime

# LEADERBOARD_FILE = "leaderboard.csv"  # File to store all submissions persistently
# LAST_UPDATED = datetime.now().strftime("%B %d, %Y")

# def initialize_leaderboard_file():
#     """
#     Ensure the leaderboard file exists and has the correct headers.
#     """
#     if not os.path.exists(LEADERBOARD_FILE):
#         # Create the file with headers
#         pd.DataFrame(columns=[
#             "Model Name", "Overall Accuracy", "Valid Accuracy",
#             "Correct Predictions", "Total Questions", "Timestamp"
#         ]).to_csv(LEADERBOARD_FILE, index=False)
#     else:
#         # Check if the file is empty and write headers if needed
#         if os.stat(LEADERBOARD_FILE).st_size == 0:
#             pd.DataFrame(columns=[
#                 "Model Name", "Overall Accuracy", "Valid Accuracy",
#                 "Correct Predictions", "Total Questions", "Timestamp"
#             ]).to_csv(LEADERBOARD_FILE, index=False)

# def clean_answer(answer):
#     """
#     Clean and normalize the predicted answers.
#     """
#     if pd.isna(answer):
#         return None
#     answer = str(answer)
#     clean = re.sub(r'[^A-Da-d]', '', answer)
#     if clean:
#         return clean[0].upper()
#     return None

# def update_leaderboard(results):
#     """
#     Append new submission results to the leaderboard file.
#     """
#     new_entry = {
#         "Model Name": results['model_name'],
#         "Overall Accuracy": round(results['overall_accuracy'] * 100, 2),
#         "Valid Accuracy": round(results['valid_accuracy'] * 100, 2),
#         "Correct Predictions": results['correct_predictions'],
#         "Total Questions": results['total_questions'],
#         "Timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
#     }

#     new_entry_df = pd.DataFrame([new_entry])
#     new_entry_df.to_csv(LEADERBOARD_FILE, mode='a', index=False, header=False)

# def load_leaderboard():
#     """
#     Load all submissions from the leaderboard file.
#     """
#     if not os.path.exists(LEADERBOARD_FILE) or os.stat(LEADERBOARD_FILE).st_size == 0:
#         return pd.DataFrame({
#             "Model Name": [],
#             "Overall Accuracy": [],
#             "Valid Accuracy": [],
#             "Correct Predictions": [],
#             "Total Questions": [],
#             "Timestamp": [],
#         })
#     return pd.read_csv(LEADERBOARD_FILE)

# def evaluate_predictions(prediction_file, model_name, add_to_leaderboard):
#     """
#     Evaluate predictions and optionally add results to the leaderboard.
#     """
#     ground_truth_file = "ground_truth.csv"
#     if not os.path.exists(ground_truth_file):
#         return "Ground truth file not found.", load_leaderboard()
#     if not prediction_file:
#         return "Prediction file not uploaded.", load_leaderboard()

#     try:
#         # Load predictions and ground truth
#         predictions_df = pd.read_csv(prediction_file.name)
#         ground_truth_df = pd.read_csv(ground_truth_file)

#         # Merge predictions with ground truth
#         merged_df = pd.merge(predictions_df, ground_truth_df, on='question_id', how='inner')
#         merged_df['pred_answer'] = merged_df['predicted_answer'].apply(clean_answer)

#         # Evaluate predictions
#         valid_predictions = merged_df.dropna(subset=['pred_answer'])
#         correct_predictions = (valid_predictions['pred_answer'] == valid_predictions['Answer']).sum()
#         total_predictions = len(merged_df)
#         total_valid_predictions = len(valid_predictions)

#         # Calculate accuracy
#         overall_accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0
#         valid_accuracy = correct_predictions / total_valid_predictions if total_valid_predictions > 0 else 0

#         results = {
#             'model_name': model_name if model_name else "Unknown Model",
#             'overall_accuracy': overall_accuracy,
#             'valid_accuracy': valid_accuracy,
#             'correct_predictions': correct_predictions,
#             'total_questions': total_predictions,
#         }

#         # Update leaderboard only if opted in
#         if add_to_leaderboard:
#             update_leaderboard(results)
#             return "Evaluation completed and added to leaderboard.", load_leaderboard()
#         else:
#             return "Evaluation completed but not added to leaderboard.", load_leaderboard()
#     except Exception as e:
#         return f"Error during evaluation: {str(e)}", load_leaderboard()

# # Initialize leaderboard file
# initialize_leaderboard_file()

# # Gradio Interface
# with gr.Blocks() as demo:
#     gr.Markdown("# Prediction Evaluation Tool with Leaderboard")
    
#     with gr.Tabs():
#         # Submission Tab
#         with gr.TabItem("🏅 Submission"):
#             file_input = gr.File(label="Upload Prediction CSV")
#             model_name_input = gr.Textbox(label="Model Name", placeholder="Enter your model name")
#             add_to_leaderboard_checkbox = gr.Checkbox(label="Add to Leaderboard?", value=True)
#             eval_status = gr.Textbox(label="Evaluation Status", interactive=False)
#             leaderboard_table_preview = gr.Dataframe(
#                 value=load_leaderboard(),
#                 label="Leaderboard (Preview)",
#                 interactive=False,
#                 wrap=True,
#             )
#             eval_button = gr.Button("Evaluate and Update Leaderboard")
#             eval_button.click(
#                 evaluate_predictions,
#                 inputs=[file_input, model_name_input, add_to_leaderboard_checkbox],
#                 outputs=[eval_status, leaderboard_table_preview],
#             )
        
#         # Leaderboard Tab
#         with gr.TabItem("🏅 Leaderboard"):
#             leaderboard_table = gr.Dataframe(
#                 value=load_leaderboard(),
#                 label="Leaderboard",
#                 interactive=False,
#                 wrap=True,
#             )
#             refresh_button = gr.Button("Refresh Leaderboard")
#             refresh_button.click(
#                 lambda: load_leaderboard(),
#                 inputs=[],
#                 outputs=[leaderboard_table],
#             )

#     gr.Markdown(f"Last updated on **{LAST_UPDATED}**")

# demo.launch()


import gradio as gr
import pandas as pd
import os
import re
from datetime import datetime
from huggingface_hub import hf_hub_download

# CSV in the working directory that accumulates every leaderboard submission.
LEADERBOARD_FILE = "leaderboard.csv"
# File name of the ground-truth CSV requested from the Hugging Face Hub.
GROUND_TRUTH_FILE = "ground_truth.csv"
# Human-readable date captured once, when this module is executed.
LAST_UPDATED = f"{datetime.now():%B %d, %Y}"

# Ask huggingface_hub not to emit its symlink warning.
# NOTE(review): huggingface_hub is already imported at this point; if it reads
# this variable at import time the assignment has no effect - confirm.
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"

def initialize_leaderboard_file():
    """
    Create the leaderboard CSV with its header row when it is missing or empty.

    A file that already exists and has a non-zero size is left untouched.
    """
    already_populated = (
        os.path.exists(LEADERBOARD_FILE)
        and os.path.getsize(LEADERBOARD_FILE) != 0
    )
    if already_populated:
        return

    header_columns = [
        "Model Name", "Overall Accuracy", "Valid Accuracy",
        "Correct Predictions", "Total Questions", "Timestamp"
    ]
    # A frame with columns but no rows serialises to just the header line.
    pd.DataFrame(columns=header_columns).to_csv(LEADERBOARD_FILE, index=False)

def clean_answer(answer):
    """
    Normalize a raw predicted answer to a single option letter.

    Args:
        answer: The raw cell value from the predictions file (any type).

    Returns:
        One of "A", "B", "C", "D", or None when the value is missing or
        contains no A-D letter at all.
    """
    if pd.isna(answer):
        return None
    text = str(answer)

    # Prefer a letter that stands on its own (not part of a longer word), so
    # "Answer: B" yields "B" instead of the "A" that starts "Answer".
    standalone = re.search(r'(?<![A-Za-z])[A-Da-d](?![A-Za-z])', text)
    if standalone:
        return standalone.group(0).upper()

    # Fallback keeps the previous rule: first A-D character found anywhere.
    embedded = re.search(r'[A-Da-d]', text)
    if embedded:
        return embedded.group(0).upper()
    return None

def update_leaderboard(results):
    """
    Append one submission's results to the leaderboard file.

    Args:
        results: Mapping with the keys 'model_name', 'overall_accuracy',
            'valid_accuracy', 'correct_predictions' and 'total_questions'.
            The two accuracies are fractions; they are stored as percentages
            rounded to two decimals.
    """
    new_entry = {
        "Model Name": results['model_name'],
        "Overall Accuracy": round(results['overall_accuracy'] * 100, 2),
        "Valid Accuracy": round(results['valid_accuracy'] * 100, 2),
        "Correct Predictions": results['correct_predictions'],
        "Total Questions": results['total_questions'],
        "Timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
    }

    # Appending without a header to a missing or empty file would leave a CSV
    # whose first data row is later read back as the column names, so write
    # the header whenever the file has no content yet.
    needs_header = (
        not os.path.exists(LEADERBOARD_FILE)
        or os.path.getsize(LEADERBOARD_FILE) == 0
    )

    new_entry_df = pd.DataFrame([new_entry])
    new_entry_df.to_csv(
        LEADERBOARD_FILE, mode='a', index=False, header=needs_header
    )

def load_leaderboard():
    """
    Return the leaderboard as a DataFrame.

    Reads the CSV when it exists and is non-empty; otherwise returns a frame
    that has the leaderboard columns and no rows.
    """
    has_content = (
        os.path.exists(LEADERBOARD_FILE)
        and os.path.getsize(LEADERBOARD_FILE) != 0
    )
    if has_content:
        return pd.read_csv(LEADERBOARD_FILE)

    column_names = (
        "Model Name",
        "Overall Accuracy",
        "Valid Accuracy",
        "Correct Predictions",
        "Total Questions",
        "Timestamp",
    )
    return pd.DataFrame({name: [] for name in column_names})

def evaluate_predictions(prediction_file, model_name, add_to_leaderboard):
    """
    Evaluate predictions and optionally add results to the leaderboard.

    Args:
        prediction_file: The uploaded predictions CSV, either a path string or
            an object whose ``name`` attribute is the path. It must contain
            the columns 'question_id' and 'predicted_answer'.
        model_name: Name to record for the submission; "Unknown Model" is used
            when it is empty.
        add_to_leaderboard: When truthy, the result is appended to the
            leaderboard file.

    Returns:
        A ``(status_message, leaderboard_dataframe)`` tuple. Failures are
        reported through the status message rather than raised.
    """
    # Check the upload first so a missing file does not cost a download.
    if not prediction_file:
        return "Prediction file not uploaded.", load_leaderboard()

    try:
        # Load ground truth data
        # NOTE(review): newer huggingface_hub releases spell this argument
        # `token=True`, and a dataset repository would also need
        # repo_type="dataset" - confirm against the pinned version and repo.
        ground_truth_path = hf_hub_download(
            repo_id="SondosMB/ground-truth-dataset",
            filename=GROUND_TRUTH_FILE,
            use_auth_token=True
        )
        ground_truth_df = pd.read_csv(ground_truth_path)
    except Exception as e:
        return f"Error loading ground truth: {e}", load_leaderboard()

    try:
        # The upload may arrive as a plain path or as an object exposing .name.
        prediction_path = getattr(prediction_file, "name", prediction_file)
        predictions_df = pd.read_csv(prediction_path)

        required_columns = ("question_id", "predicted_answer")
        missing_columns = [
            column for column in required_columns
            if column not in predictions_df.columns
        ]
        if missing_columns:
            return (
                "Error during evaluation: prediction file is missing required "
                f"column(s): {', '.join(missing_columns)}",
                load_leaderboard(),
            )

        # Keep only the needed columns so extra ones (e.g. an 'Answer' column)
        # cannot collide with ground-truth columns in the merge, and keep the
        # first row per question so repeated ids are not counted twice.
        predictions_df = predictions_df[list(required_columns)].drop_duplicates(
            subset="question_id", keep="first"
        )

        # NOTE(review): the inner merge means questions absent from the
        # submission are not counted against it - confirm this is intended.
        merged_df = pd.merge(predictions_df, ground_truth_df, on='question_id', how='inner')
        total_predictions = len(merged_df)
        if total_predictions == 0:
            return (
                "Error during evaluation: no question_id in the prediction "
                "file matches the ground truth.",
                load_leaderboard(),
            )

        merged_df['pred_answer'] = merged_df['predicted_answer'].apply(clean_answer)

        # Rows whose answer could not be normalized count as wrong overall but
        # are excluded from the "valid" denominator.
        valid_predictions = merged_df.dropna(subset=['pred_answer'])
        correct_predictions = int(
            (valid_predictions['pred_answer'] == valid_predictions['Answer']).sum()
        )
        total_valid_predictions = len(valid_predictions)

        # Calculate accuracy
        overall_accuracy = correct_predictions / total_predictions
        valid_accuracy = (
            correct_predictions / total_valid_predictions
            if total_valid_predictions > 0 else 0
        )

        results = {
            'model_name': model_name if model_name else "Unknown Model",
            'overall_accuracy': overall_accuracy,
            'valid_accuracy': valid_accuracy,
            'correct_predictions': correct_predictions,
            'total_questions': total_predictions,
        }

        # Report the scores in the status so they are visible even when the
        # submission is not written to the leaderboard.
        score_summary = (
            f" Overall accuracy: {overall_accuracy:.2%} "
            f"({correct_predictions}/{total_predictions} correct); "
            f"valid accuracy: {valid_accuracy:.2%}."
        )

        # Update leaderboard only if opted in
        if add_to_leaderboard:
            update_leaderboard(results)
            status = "Evaluation completed and added to leaderboard."
        else:
            status = "Evaluation completed but not added to leaderboard."
        return status + score_summary, load_leaderboard()
    except Exception as e:
        return f"Error during evaluation: {str(e)}", load_leaderboard()

# Make sure the leaderboard CSV exists (with headers) before the UI reads it.
initialize_leaderboard_file()

# Gradio Interface: one tab to submit predictions, one tab to browse results.
with gr.Blocks() as demo:
    gr.Markdown("# Prediction Evaluation Tool with Leaderboard")

    with gr.Tabs():
        # Submission Tab
        with gr.TabItem("🏅 Submission"):
            file_input = gr.File(label="Upload Prediction CSV")
            model_name_input = gr.Textbox(label="Model Name", placeholder="Enter your model name")
            add_to_leaderboard_checkbox = gr.Checkbox(label="Add to Leaderboard?", value=True)
            eval_status = gr.Textbox(label="Evaluation Status", interactive=False)
            # Passing the function (not its result) lets Gradio re-read the
            # CSV on each page load instead of showing the startup snapshot.
            leaderboard_table_preview = gr.Dataframe(
                value=load_leaderboard,
                label="Leaderboard (Preview)",
                interactive=False,
                wrap=True,
            )
            eval_button = gr.Button("Evaluate and Update Leaderboard")
            # evaluate_predictions returns (status text, leaderboard frame),
            # matching the two outputs below in order.
            eval_button.click(
                evaluate_predictions,
                inputs=[file_input, model_name_input, add_to_leaderboard_checkbox],
                outputs=[eval_status, leaderboard_table_preview],
            )

        # Leaderboard Tab
        with gr.TabItem("🏅 Leaderboard"):
            # Same callable-value approach as the preview table above.
            leaderboard_table = gr.Dataframe(
                value=load_leaderboard,
                label="Leaderboard",
                interactive=False,
                wrap=True,
            )
            refresh_button = gr.Button("Refresh Leaderboard")
            refresh_button.click(
                load_leaderboard,
                inputs=[],
                outputs=[leaderboard_table],
            )

    # LAST_UPDATED is fixed when the module starts, not when a row is added.
    gr.Markdown(f"Last updated on **{LAST_UPDATED}**")

demo.launch()