import gradio as gr
import pandas as pd
import os
import re
from datetime import datetime
from huggingface_hub import hf_hub_download
from huggingface_hub import HfApi, HfFolder
# File names used for the locally stored leaderboard and the ground-truth answers.
LEADERBOARD_FILE = "leaderboard.csv"
GROUND_TRUTH_FILE = "ground_truth.csv"
# Human-readable date computed once, when the module is loaded (format: "Month DD, YYYY").
LAST_UPDATED = datetime.now().strftime("%B %d, %Y")

# Ensure authentication: abort start-up when no Hugging Face token is present
# in the environment (an empty string counts as missing).
HF_TOKEN = os.getenv("HF_TOKEN")
if not HF_TOKEN:
    raise ValueError("HF_TOKEN environment variable is not set or invalid.")
def initialize_leaderboard_file():
    """
    Ensure the leaderboard file exists and has the correct headers.

    Writes a header-only CSV to ``LEADERBOARD_FILE`` when that file is missing
    or zero bytes long. An existing non-empty file is left untouched.
    """
    columns = [
        "Model Name", "Overall Accuracy", "Valid Accuracy",
        "Correct Predictions", "Total Questions", "Timestamp",
    ]
    # A missing file and an empty file need exactly the same header row, so
    # both cases share one branch (same condition as load_leaderboard uses).
    if not os.path.exists(LEADERBOARD_FILE) or os.stat(LEADERBOARD_FILE).st_size == 0:
        pd.DataFrame(columns=columns).to_csv(LEADERBOARD_FILE, index=False)
def clean_answer(answer):
    """
    Normalize a raw predicted answer to a single choice letter.

    Args:
        answer: Raw cell value from the prediction file (any scalar).

    Returns:
        ``"A"``, ``"B"``, ``"C"`` or ``"D"``, or ``None`` when the value is
        missing (NaN/None) or contains no letter in the range A-D.
    """
    if pd.isna(answer):
        return None
    text = str(answer)

    # Prefer a letter that stands alone as its own token ("Answer: C", "(b)").
    # Stripping every non A-D character first would pick up letters embedded
    # in ordinary words, e.g. the leading "A" of "Answer".
    standalone = re.search(r'\b[A-Da-d]\b', text)
    if standalone:
        return standalone.group(0).upper()

    # Fallback (previous behaviour): first A-D letter found anywhere in the text.
    letters = re.sub(r'[^A-Da-d]', '', text)
    return letters[0].upper() if letters else None
def update_leaderboard(results):
    """
    Append new submission results to the leaderboard file and push updates to the Hugging Face repository.

    Args:
        results: Mapping with the keys ``model_name``, ``overall_accuracy``,
            ``valid_accuracy``, ``correct_predictions`` and ``total_questions``.
            The two accuracies are multiplied by 100 and rounded to two
            decimals before being stored.

    Any exception raised while writing or uploading is caught and printed;
    nothing is re-raised to the caller.
    """
    new_entry = {
        "Model Name": results['model_name'],
        "Overall Accuracy": round(results['overall_accuracy'] * 100, 2),
        "Valid Accuracy": round(results['valid_accuracy'] * 100, 2),
        "Correct Predictions": results['correct_predictions'],
        "Total Questions": results['total_questions'],
        "Timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
    }

    try:
        # Update the local leaderboard file
        new_entry_df = pd.DataFrame([new_entry])
        # A zero-byte file has no header yet either; appending without one
        # would make the first data row be read back as the header.
        needs_header = (
            not os.path.exists(LEADERBOARD_FILE)
            or os.stat(LEADERBOARD_FILE).st_size == 0
        )
        new_entry_df.to_csv(
            LEADERBOARD_FILE,
            mode='a',  # Append mode
            index=False,
            header=needs_header,  # Write header only if the file is new or empty
        )
        print(f"Leaderboard updated successfully at {LEADERBOARD_FILE}")

        # Push the updated file to the Hugging Face repository using HTTP API
        api = HfApi()
        token = HfFolder.get_token()
        # NOTE(review): only repo_id was visible in the reviewed source; the
        # target path and repo_type="space" are reconstructed from the
        # surrounding comments - confirm against the deployed Space.
        api.upload_file(
            path_or_fileobj=LEADERBOARD_FILE,
            path_in_repo=LEADERBOARD_FILE,
            repo_id="SondosMB/ss",  # Your Space repository
            repo_type="space",
            token=token,
        )
        print("Leaderboard changes pushed to Hugging Face repository.")
    except Exception as e:
        # Best effort: a failed write/upload must not break the evaluation flow.
        print(f"Error updating leaderboard file: {e}")
def load_leaderboard():
    """
    Load the leaderboard from ``LEADERBOARD_FILE``.

    Returns:
        pandas.DataFrame: The stored leaderboard, or an empty frame with the
        six leaderboard columns when the file is missing, zero bytes long, or
        contains no parseable header.
    """
    def _empty_leaderboard():
        # Column-only frame used whenever there is nothing to read.
        return pd.DataFrame({
            "Model Name": [],
            "Overall Accuracy": [],
            "Valid Accuracy": [],
            "Correct Predictions": [],
            "Total Questions": [],
            "Timestamp": [],
        })

    if not os.path.exists(LEADERBOARD_FILE) or os.stat(LEADERBOARD_FILE).st_size == 0:
        return _empty_leaderboard()
    try:
        return pd.read_csv(LEADERBOARD_FILE)
    except pd.errors.EmptyDataError:
        # Non-zero size but no columns (e.g. whitespace only). This function is
        # also called from error handlers, so it should not raise here.
        return _empty_leaderboard()
def evaluate_predictions(prediction_file, model_name, add_to_leaderboard):
# Score an uploaded prediction CSV against the ground-truth CSV obtained via
# hf_hub_download. Every visible return is a 2-tuple:
# (status message string, leaderboard DataFrame from load_leaderboard()).
#
# NOTE(review): this block is incomplete as seen here. The `try:` lines that
# the `except` clauses below belong to, the arguments and closing parenthesis
# of `hf_hub_download(` and `pd.read_csv(`, the closing of the `results` dict
# and of the tuple returned for missing columns are not present. Restore them
# from the original file before relying on this function.
# NOTE(review): the arguments of this download call are missing from this view.
ground_truth_path = hf_hub_download(
ground_truth_df = pd.read_csv(ground_truth_path)
except FileNotFoundError:
return "Ground truth file not found in the dataset repository.", load_leaderboard()
except Exception as e:
return f"Error loading ground truth: {e}", load_leaderboard()
# Nothing to score when no file was supplied.
if not prediction_file:
return "Prediction file not uploaded.", load_leaderboard()
# Load prediction file
# NOTE(review): the argument of this read_csv call is missing from this view.
predictions_df = pd.read_csv(
# Validate required columns in prediction file
required_columns = ['question_id', 'predicted_answer']
missing_columns = [col for col in required_columns if col not in predictions_df.columns]
if missing_columns:
return (f"Error: Missing required columns in prediction file: {', '.join(missing_columns)}.",
# Validate 'Answer' column in ground truth file
if 'Answer' not in ground_truth_df.columns:
return "Error: 'Answer' column is missing in the ground truth dataset.", load_leaderboard()
# Inner join: predictions whose question_id has no ground-truth row are
# dropped, so they are not counted in total_predictions below.
merged_df = pd.merge(predictions_df, ground_truth_df, on='question_id', how='inner')
# Normalize each raw prediction with clean_answer; rows where it returns None
# are excluded from the "valid" subset.
merged_df['pred_answer'] = merged_df['predicted_answer'].apply(clean_answer)
valid_predictions = merged_df.dropna(subset=['pred_answer'])
# Exact string comparison - presumably 'Answer' holds uppercase A-D; verify
# against the ground-truth file.
correct_predictions = (valid_predictions['pred_answer'] == valid_predictions['Answer']).sum()
total_predictions = len(merged_df)
total_valid_predictions = len(valid_predictions)
# Overall accuracy divides by all merged rows, valid accuracy only by rows
# with a usable answer; both fall back to 0 when the denominator is 0.
overall_accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0
valid_accuracy = correct_predictions / total_valid_predictions if total_valid_predictions > 0 else 0
results = {
'model_name': model_name if model_name else "Unknown Model",
'overall_accuracy': overall_accuracy,
'valid_accuracy': valid_accuracy,
'correct_predictions': correct_predictions,
'total_questions': total_predictions,
# NOTE(review): no statement that actually records `results` is visible under
# this `if`; presumably update_leaderboard(results) is called here - confirm.
if add_to_leaderboard:
return "Evaluation completed and added to leaderboard.", load_leaderboard()
return "Evaluation completed but not added to leaderboard.", load_leaderboard()
except Exception as e:
return f"Error during evaluation: {str(e)}", load_leaderboard()
# Function to set default mode
# Function to set default mode
import gradio as gr
# Custom CSS for the whole layout: a modern, professional design that matches the website style.
# Stylesheet handed to gr.Blocks(css=...). Every rule is explicitly closed and
# CSS comments are not nested (CSS has no nested comments, so an inner "/* */"
# would leave a stray "*/" behind).
css_tech_theme = """
body {
    font-family: 'Roboto', sans-serif;
    background-color: #f4f6fa;
    color: #333333;
    margin: 0;
    padding: 0;
}

/* Header Styling */
header {
    text-align: center;
    /*padding: 60px 20px;*/
    background: linear-gradient(135deg, #6a1b9a, #64b5f6);
    color: #ffffff;
    border-radius: 12px;
    margin-bottom: 30px;
    box-shadow: 0 6px 20px rgba(0, 0, 0, 0.2);
}
header h1 {
    font-size: 3.5em;
    font-weight: bold;
    margin-bottom: 10px;
}
header h2 {
    font-size: 2em;
    margin-bottom: 15px;
}
header p {
    font-size: 1em;
    line-height: 1.8;
}
.header-buttons {
    display: flex;
    justify-content: center;
    gap: 15px;
    margin-top: 20px;
}
.header-buttons a {
    text-decoration: none;
    font-size: 1.5em;
    padding: 15px 30px;
    border-radius: 30px;
    font-weight: bold;
    background: #ffffff;
    color: #6a1b9a;
    transition: transform 0.3s, background 0.3s;
    box-shadow: 0 4px 10px rgba(0, 0, 0, 0.1);
}
.header-buttons a:hover {
    background: #64b5f6;
    color: #ffffff;
    transform: scale(1.05);
}

/* Pre-Tabs Section */
.pre-tabs {
    text-align: center;
    padding: 40px 20px;
    background: linear-gradient(135deg, #ffffff, #f9fafb);
    border-top: 5px solid #64b5f6;
    border-bottom: 5px solid #6a1b9a;
}
.pre-tabs h2, .post-tabs h2 {
    font-size: 3em; /* Increase the size for better visibility */
}
.pre-tabs p, .post-tabs p {
    font-size: 2.5em; /* Adjust paragraph text size */
}
.pre-tabs h2 {
    color: #333333;
    margin-bottom: 15px;
}
.pre-tabs p {
    color: #555555;
    line-height: 1.8;
}

/* Tabs Section */
.tabs {
    margin: 0 auto;
    padding: 20px;
    background: #ffffff;
    border-radius: 12px;
    box-shadow: 0 4px 15px rgba(0, 0, 0, 0.1);
    /* max-width: 1300px; (change 1) */
}

/* Post-Tabs Section */
.post-tabs {
    text-align: center;
    padding: 40px 20px;
    background: linear-gradient(135deg, #64b5f6, #6a1b9a);
    color: #ffffff;
    border-radius: 12px;
    margin-top: 30px;
}
.post-tabs h2 {
    font-size: 3.4em;
    margin-bottom: 15px;
}
.post-tabs p {
    font-size: 2em;
    line-height: 1.8;
    margin-bottom: 20px;
}
.post-tabs a {
    text-decoration: none;
    font-size: 1.1em;
    padding: 15px 30px;
    border-radius: 30px;
    font-weight: bold;
    background: #ffffff;
    color: #6a1b9a;
    transition: transform 0.3s, background 0.3s;
    box-shadow: 0 4px 10px rgba(0, 0, 0, 0.1);
}
.post-tabs a:hover {
    background: #6a1b9a;
    color: #ffffff;
    transform: scale(1.05);
}

/* Footer */
footer {
    background: linear-gradient(135deg, #6a1b9a, #8e44ad);
    color: #ffffff;
    text-align: center;
    padding: 40px 20px;
    margin-top: 30px;
    border-radius: 12px;
    box-shadow: 0 4px 10px rgba(0, 0, 0, 0.2);
}
footer h2 {
    font-size: 1.5em;
    margin-bottom: 15px;
}
footer p {
    font-size: 0.8em;
    line-height: 1.6;
    margin-bottom: 20px;
}

/* Link Styling */
.social-links {
    display: flex;
    justify-content: center;
    gap: 15px; /* Space between links */
}
.social-link {
    display: inline-block;
    text-decoration: none;
    color: #ffffff;
    background-color: #6a1b9a; /* Purple button background */
    padding: 10px 20px;
    border-radius: 30px;
    font-size: 16px;
    font-weight: bold;
    transition: all 0.3s ease;
    box-shadow: 0 4px 10px rgba(0, 0, 0, 0.1);
}
.social-link:hover {
    background-color: #8c52d3; /* Darker shade on hover */
    box-shadow: 0 6px 15px rgba(0, 0, 0, 0.2);
    transform: translateY(-2px);
}
.social-link:active {
    transform: translateY(1px);
    box-shadow: 0 3px 8px rgba(0, 0, 0, 0.1);
}

/* Submission Section Styling */
.submission-section {
    margin: 40px auto;
    padding: 30px;
    background: linear-gradient(135deg, #ffffff, #f9f9ff);
    border-radius: 12px;
    box-shadow: 0 4px 15px rgba(0, 0, 0, 0.1);
    /*max-width: 800px; change 1*/
    text-align: center;
}
.submission-section h2 {
    font-size: 2.5em;
    color: #6a1b9a;
    margin-bottom: 20px;
    font-weight: bold;
}
.submission-section p {
    font-size: 1.2em;
    color: #333;
    margin-bottom: 30px;
}
#submission-fields {
    flex-direction: column;
    gap: 20px;
    align-items: center;
}
#submission-fields input[type="file"],
#submission-fields input[type="text"] {
    width: 90%;
    padding: 12px 15px;
    font-size: 1em;
    border: 2px solid #d3bce8;
    border-color: #5e1287;
    border-radius: 8px;
    box-shadow: inset 0 2px 4px rgba(0, 0, 0, 0.05);
    transition: border-color 0.3s ease;
}
#submission-results {
    margin-top: 20px;
    text-align: center;
}
#submission-buttons {
    display: flex;
    justify-content: center;
    gap: 15px;
    margin-top: 20px;
}
#submission-buttons button {
    padding: 10px 20px;
    font-size: 1em;
    color: #ffffff;
    background: #6a1b9a;
    border: none;
    border-radius: 30px;
    cursor: pointer;
    font-weight: bold;
    transition: background 0.3s ease, transform 0.3s ease;
    box-shadow: 0 4px 10px rgba(0, 0, 0, 0.1);
}
#submission-buttons button:hover {
    background: #8c52d3;
    transform: scale(1.05);
}
#submission-buttons button:active {
    background: #5e1287;
    transform: scale(0.98);
}
#submission-fields input[type="file"]:hover,
#submission-fields input[type="text"]:hover {
    border-color: #6a1b9a; /* Darker purple on hover */
}
#submission-fields input[type="file"]:focus,
#submission-fields input[type="text"]:focus {
    border-color: #6a1b9a; /* Darker purple on focus */
    outline: none;
    box-shadow: 0 0 8px rgba(106, 27, 154, 0.5); /* Glow effect */
}

/* Evaluation Status Styling */
#evaluation-status {
    margin-top: 15px;
    padding: 10px 20px;
    font-size: 1em;
    border-radius: 8px;
    border: 2px solid #6a1b9a; /* Matches the hover color */
    background: #f9f7fd; /* Subtle purple background */
    color: #333;
    font-weight: bold;
    box-shadow: inset 0 2px 5px rgba(0, 0, 0, 0.1);
}

/* Buttons Styling */
#submission-buttons button {
    padding: 12px 25px;
    font-size: 1.1em;
    color: #ffffff;
    background: #6a1b9a;
    border: none;
    border-radius: 30px;
    cursor: pointer;
    font-weight: bold;
    transition: all 0.3s ease;
    box-shadow: 0 4px 12px rgba(0, 0, 0, 0.1);
}
#submission-buttons button:hover {
    background: #8c52d3; /* Slightly lighter purple */
    transform: scale(1.05);
    box-shadow: 0 6px 15px rgba(0, 0, 0, 0.2);
}
#submission-buttons button:active {
    background: #5e1287; /* Darker purple */
    transform: scale(0.98);
    box-shadow: 0 3px 10px rgba(0, 0, 0, 0.1);
}
"""
# Create the Gradio Interface
with gr.Blocks(css=css_tech_theme) as demo:
# Header Section
The Mobile-MMLU Benchmark Competition offers a unique opportunity to evaluate your LLMs in real-world mobile scenarios. Join the challenge to drive innovation, showcase your expertise, and shape the future of mobile AI.
π Mobile-MMLU Challenge
π Pushing the Limits of Mobile LLMs
Why Participate?
The Mobile-MMLU Benchmark Competition is a premier challenge designed to evaluate and advance mobile-optimized Large Language Models (LLMs). This competition is an excellent opportunity to showcase your model's ability to handle real-world scenarios and excel in mobile intelligence.
With a dataset spanning 80 distinct fields and featuring 16,186 questions, the competition emphasizes practical applications, from education and healthcare to technology and daily life.
Participating in this competition allows you to:
Upload your prediction file and provide your model name to evaluate and submit to the leaderboard.