import os
import re
from datetime import datetime

import gradio as gr
import pandas as pd
from huggingface_hub import HfApi, HfFolder, hf_hub_download

LEADERBOARD_FILE = "leaderboard.csv"
GROUND_TRUTH_FILE = "ground_truth.csv"
LAST_UPDATED = datetime.now().strftime("%B %d, %Y")

# Ensure authentication and suppress the symlink warning on download
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"
HF_TOKEN = os.getenv("HF_TOKEN")
if not HF_TOKEN:
    raise ValueError("HF_TOKEN environment variable is not set or invalid.")


def initialize_leaderboard_file():
    """Ensure the leaderboard file exists and has the correct headers."""
    if not os.path.exists(LEADERBOARD_FILE) or os.stat(LEADERBOARD_FILE).st_size == 0:
        pd.DataFrame(columns=[
            "Model Name", "Overall Accuracy", "Valid Accuracy",
            "Correct Predictions", "Total Questions", "Timestamp",
        ]).to_csv(LEADERBOARD_FILE, index=False)


def clean_answer(answer):
    """Normalize a free-form answer to a single letter A-D, or None if unparseable."""
    if pd.isna(answer):
        return None
    answer = str(answer)
    clean = re.sub(r'[^A-Da-d]', '', answer)
    return clean[0].upper() if clean else None


def update_leaderboard(results):
    """Append a submission to the leaderboard CSV and push it to the Hugging Face Space."""
    new_entry = {
        "Model Name": results['model_name'],
        "Overall Accuracy": round(results['overall_accuracy'] * 100, 2),
        "Valid Accuracy": round(results['valid_accuracy'] * 100, 2),
        "Correct Predictions": results['correct_predictions'],
        "Total Questions": results['total_questions'],
        "Timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
    }
    try:
        # Update the local leaderboard file
        new_entry_df = pd.DataFrame([new_entry])
        file_exists = os.path.exists(LEADERBOARD_FILE)
        new_entry_df.to_csv(
            LEADERBOARD_FILE,
            mode='a',                # append mode
            index=False,
            header=not file_exists,  # write the header only if the file is new
        )
        print(f"Leaderboard updated successfully at {LEADERBOARD_FILE}")

        # Push the updated file to the Hugging Face repository over the HTTP API
        api = HfApi()
        token = HfFolder.get_token()
        api.upload_file(
            path_or_fileobj=LEADERBOARD_FILE,
            path_in_repo="leaderboard.csv",
            repo_id="SondosMB/ss",  # the Space repository hosting this app
            repo_type="space",
            token=token,
        )
        print("Leaderboard changes pushed to Hugging Face repository.")
    except Exception as e:
        print(f"Error updating leaderboard file: {e}")


def load_leaderboard():
    """Return the leaderboard as a DataFrame, or an empty frame if none exists yet."""
    if not os.path.exists(LEADERBOARD_FILE) or os.stat(LEADERBOARD_FILE).st_size == 0:
        return pd.DataFrame({
            "Model Name": [],
            "Overall Accuracy": [],
            "Valid Accuracy": [],
            "Correct Predictions": [],
            "Total Questions": [],
            "Timestamp": [],
        })
    return pd.read_csv(LEADERBOARD_FILE)


def evaluate_predictions(prediction_file, model_name, add_to_leaderboard):
    """Score an uploaded predictions CSV against the ground truth and optionally
    record the result on the leaderboard."""
    try:
        ground_truth_path = hf_hub_download(
            repo_id="SondosMB/ground-truth-dataset",
            filename=GROUND_TRUTH_FILE,
            repo_type="dataset",
            token=HF_TOKEN,
        )
        ground_truth_df = pd.read_csv(ground_truth_path)
    except FileNotFoundError:
        return "Ground truth file not found in the dataset repository.", load_leaderboard()
    except Exception as e:
        return f"Error loading ground truth: {e}", load_leaderboard()

    if not prediction_file:
        return "Prediction file not uploaded.", load_leaderboard()

    try:
        predictions_df = pd.read_csv(prediction_file.name)
        merged_df = pd.merge(predictions_df, ground_truth_df, on='question_id', how='inner')
        merged_df['pred_answer'] = merged_df['predicted_answer'].apply(clean_answer)
        valid_predictions = merged_df.dropna(subset=['pred_answer'])
        correct_predictions = (valid_predictions['pred_answer'] == valid_predictions['Answer']).sum()
        total_predictions = len(merged_df)
        total_valid_predictions = len(valid_predictions)

        # Overall accuracy counts unparseable answers as wrong; valid accuracy ignores them.
        overall_accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0
        valid_accuracy = correct_predictions / total_valid_predictions if total_valid_predictions > 0 else 0

        results = {
            'model_name': model_name if model_name else "Unknown Model",
            'overall_accuracy': overall_accuracy,
            'valid_accuracy': valid_accuracy,
            'correct_predictions': correct_predictions,
            'total_questions': total_predictions,
        }
        if add_to_leaderboard:
            update_leaderboard(results)
            return "Evaluation completed and added to leaderboard.", load_leaderboard()
        else:
            return "Evaluation completed but not added to leaderboard.", load_leaderboard()
    except Exception as e:
        return f"Error during evaluation: {str(e)}", load_leaderboard()


initialize_leaderboard_file()
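
# A minimal, optional sanity check for the scoring helpers above (not part of the
# app flow). The toy question_id values, predictions, and ground-truth letters are
# hypothetical; they only illustrate how clean_answer() normalizes free-form answers
# and how overall vs. valid accuracy differ when some answers cannot be parsed.
# Call _demo_scoring_sketch() manually (e.g. from a REPL) to run it.
def _demo_scoring_sketch():
    sample = pd.DataFrame({
        "question_id": [101, 102, 103, 104],
        "predicted_answer": ["A", "b)", " C. ", "not sure"],
    })
    sample["pred_answer"] = sample["predicted_answer"].apply(clean_answer)
    print(sample["pred_answer"].tolist())  # ['A', 'B', 'C', None]

    truth = pd.Series(["A", "B", "D", "A"])  # hypothetical ground-truth answers
    valid = sample["pred_answer"].notna()
    correct = (sample["pred_answer"][valid] == truth[valid]).sum()
    print(correct / len(sample))  # overall accuracy: 2 / 4 = 0.50
    print(correct / valid.sum())  # valid accuracy:   2 / 3 ≈ 0.67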

# Custom CSS to match a modern, professional website style
css_tech_theme = """
body {
    font-family: 'Roboto', sans-serif;
    background-color: #f4f6fa;
    color: #333333;
    line-height: 1.8;
    margin: 0;
    padding: 0;
}
.center-content {
    display: flex;
    flex-direction: column;
    align-items: center;
    justify-content: center;
    text-align: center;
    margin: 40px auto;
    padding: 20px;
    background: linear-gradient(135deg, #6a1b9a, #64b5f6);
    color: #ffffff;
    border-radius: 10px;
    max-width: 80%;
    box-shadow: 0 4px 15px rgba(0, 0, 0, 0.2);
}
.center-content h1 {
    font-size: 3em;
    font-weight: bold;
    margin-bottom: 10px;
}
.center-content h2 {
    font-size: 1.8em;
    margin: 10px 0 20px;
    font-weight: 500;
}
.center-content p {
    font-size: 1.2em;
    margin-bottom: 20px;
    line-height: 1.6;
}
.tabs {
    margin-top: 20px;
}
.gradio-container {
    background: #ffffff;
    border-radius: 10px;
    box-shadow: 0 4px 15px rgba(0, 0, 0, 0.1);
    padding: 20px;
    max-width: 1200px;
    margin: 0 auto;
}
#leaderboard {
    margin: 20px auto;
    border-radius: 10px;
    overflow: hidden;
    border: 1px solid #e5eff2;
    background: #f9f9f9;
}
footer {
    text-align: center;
    padding: 20px;
    background: #6a1b9a;
    color: #ffffff;
    margin-top: 20px;
    font-size: 0.9em;
    border-top: 5px solid #64b5f6;
}
"""

# Create the Gradio interface
with gr.Blocks(css=css_tech_theme) as demo:
    gr.Markdown("""
    <div class="center-content">
        <h1>🏆 Mobile-MMLU Benchmark Competition</h1>
        <h2>🌟 Welcome to the Competition</h2>
        <p>
            Welcome to the Mobile-MMLU Benchmark Competition. Submit your predictions,
            view the leaderboard, and track your performance!
        </p>
    </div>
    """)

    with gr.Tabs(elem_id="tabs"):
        with gr.TabItem("📖 Overview"):
            gr.Markdown("""
            ## About the Competition

            **Mobile-MMLU** evaluates mobile-optimized LLMs on 16,186 scenario-based and factual questions across 80 fields.

            Test your model, submit predictions, and climb the leaderboard!
            """)

        with gr.TabItem("📤 Submission"):
            with gr.Row():
                file_input = gr.File(label="Upload Prediction CSV", file_types=[".csv"], interactive=True)
                model_name_input = gr.Textbox(label="Model Name", placeholder="Enter your model name")
            with gr.Row():
                overall_accuracy_display = gr.Number(label="Overall Accuracy", interactive=False)
                add_to_leaderboard_checkbox = gr.Checkbox(label="Add to Leaderboard?", value=True)

            eval_button = gr.Button("Evaluate")
            eval_status = gr.Textbox(label="Evaluation Status", interactive=False)

            def handle_evaluation(file, model_name, add_to_leaderboard):
                status, leaderboard = evaluate_predictions(file, model_name, add_to_leaderboard)
                # Report the accuracy of the newest leaderboard entry (0 if the board is empty).
                overall_accuracy = leaderboard.iloc[-1]["Overall Accuracy"] if not leaderboard.empty else 0
                return status, overall_accuracy

            eval_button.click(
                handle_evaluation,
                inputs=[file_input, model_name_input, add_to_leaderboard_checkbox],
                outputs=[eval_status, overall_accuracy_display],
            )

        with gr.TabItem("🏅 Leaderboard"):
            leaderboard_table = gr.Dataframe(
                value=load_leaderboard(),
                label="Leaderboard",
                interactive=False,
                wrap=True,
            )
            refresh_button = gr.Button("Refresh Leaderboard")
            refresh_button.click(
                load_leaderboard,
                inputs=[],
                outputs=[leaderboard_table],
            )

    gr.Markdown(f"Last updated on **{LAST_UPDATED}**")

demo.launch()