Spaces:

MBZUAI-LLM
/

Mobile-MMLU-Challenge

Sleeping

App Files Files Community

Mobile-MMLU-Challenge / app.py

SondosMB

Update app.py

5dc58a0 verified 11 months ago

raw

history blame

18.3 kB

	import gradio as gr
	import pandas as pd
	import os
	import re
	from datetime import datetime
	from huggingface_hub import hf_hub_download
	from huggingface_hub import HfApi, HfFolder

	# Silence the huggingface_hub symlink warning before any hub call is made.
	os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"

	# Local CSV that accumulates one row per accepted submission.
	LEADERBOARD_FILE = "leaderboard.csv"
	# NOTE(review): not referenced anywhere else in the visible code.
	GROUND_TRUTH_FILE = "ground_truth.csv"
	# Date string captured once, when this module is executed.
	LAST_UPDATED = datetime.now().strftime("%B %d, %Y")

	# Refuse to start without an access token in the environment.
	HF_TOKEN = os.environ.get("HF_TOKEN")
	if HF_TOKEN is None or HF_TOKEN == "":
	    raise ValueError("HF_TOKEN environment variable is not set or invalid.")

	def initialize_leaderboard_file(path=None):
	    """
	    Ensure the leaderboard CSV exists and starts with the expected header row.

	    A header-only CSV is written when the file is missing or zero bytes long.
	    A file that already has content is left untouched.

	    Args:
	        path: CSV file to initialize. Defaults to ``LEADERBOARD_FILE``.
	    """
	    if path is None:
	        path = LEADERBOARD_FILE

	    columns = [
	        "Model Name", "Overall Accuracy", "Valid Accuracy",
	        "Correct Predictions", "Total Questions", "Timestamp",
	    ]

	    # A single size query covers both cases: a missing file raises OSError,
	    # an empty one reports a size of 0.
	    try:
	        needs_header = os.path.getsize(path) == 0
	    except OSError:
	        needs_header = True

	    if needs_header:
	        pd.DataFrame(columns=columns).to_csv(path, index=False)

	# An option letter that is not glued to other letters, e.g. "B", "(c)",
	# "Answer: D". Upper case is tried first because lower-case "a" is also a
	# common English word.
	_STANDALONE_OPTION_PATTERNS = (
	    re.compile(r'(?<![A-Za-z])[A-D](?![A-Za-z])'),
	    re.compile(r'(?<![A-Za-z])[A-Da-d](?![A-Za-z])'),
	)


	def clean_answer(answer):
	    """
	    Normalize a raw predicted answer to one of 'A', 'B', 'C', 'D'.

	    A standalone option letter is preferred, so "Answer: C" yields 'C'
	    rather than the 'A' of "Answer". When no standalone letter exists, the
	    first a-d character anywhere in the text is used.

	    Args:
	        answer: Raw cell value from the predictions file (any type).

	    Returns:
	        The upper-case option letter, or None when the value is missing or
	        contains no a-d character at all.
	    """
	    if pd.isna(answer):
	        return None
	    text = str(answer)

	    for pattern in _STANDALONE_OPTION_PATTERNS:
	        match = pattern.search(text)
	        if match:
	            return match.group(0).upper()

	    # Fallback: first a-d character anywhere, e.g. "abcd" -> 'A'.
	    letters = re.sub(r'[^A-Da-d]', '', text)
	    return letters[0].upper() if letters else None


	def update_leaderboard(results):
	    """
	    Append one submission to the local leaderboard CSV and push it to the Hub.

	    Args:
	        results: Mapping with keys 'model_name', 'overall_accuracy',
	            'valid_accuracy' (both fractions, stored as percentages),
	            'correct_predictions' and 'total_questions'.

	    Returns:
	        True when the row was written to the local CSV, False when that write
	        failed. The upload to the Hub is best effort: a failure there is
	        printed but does not change the return value. Nothing is raised for
	        I/O or network errors.
	    """
	    new_entry = {
	        "Model Name": results['model_name'],
	        "Overall Accuracy": round(results['overall_accuracy'] * 100, 2),
	        "Valid Accuracy": round(results['valid_accuracy'] * 100, 2),
	        "Correct Predictions": results['correct_predictions'],
	        "Total Questions": results['total_questions'],
	        "Timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
	    }

	    try:
	        # A header is needed for a missing file AND for an existing but empty
	        # one; otherwise the first data row would be read back as the header.
	        needs_header = (
	            not os.path.exists(LEADERBOARD_FILE)
	            or os.path.getsize(LEADERBOARD_FILE) == 0
	        )
	        pd.DataFrame([new_entry]).to_csv(
	            LEADERBOARD_FILE,
	            mode='a',  # Append mode
	            index=False,
	            header=needs_header,
	        )
	        print(f"Leaderboard updated successfully at {LEADERBOARD_FILE}")
	    except Exception as e:
	        print(f"Error updating leaderboard file: {e}")
	        return False

	    try:
	        # Use the token validated at start-up instead of looking one up again.
	        # NOTE(review): repo_id is hard-coded; confirm it is the Space that
	        # serves this app.
	        HfApi().upload_file(
	            path_or_fileobj=LEADERBOARD_FILE,
	            path_in_repo="leaderboard.csv",
	            repo_id="SondosMB/ss",
	            repo_type="space",
	            token=HF_TOKEN,
	        )
	        print("Leaderboard changes pushed to Hugging Face repository.")
	    except Exception as e:
	        print(f"Error pushing leaderboard to Hugging Face repository: {e}")

	    return True



	def load_leaderboard(path=None):
	    """
	    Read the leaderboard CSV into a DataFrame.

	    Args:
	        path: CSV file to read. Defaults to ``LEADERBOARD_FILE``.

	    Returns:
	        The file's contents, or an empty DataFrame with the leaderboard
	        columns when the file is missing, zero bytes long, or holds nothing
	        but blank lines.
	    """
	    if path is None:
	        path = LEADERBOARD_FILE

	    columns = [
	        "Model Name", "Overall Accuracy", "Valid Accuracy",
	        "Correct Predictions", "Total Questions", "Timestamp",
	    ]

	    try:
	        return pd.read_csv(path)
	    except (FileNotFoundError, pd.errors.EmptyDataError):
	        # Nothing to show yet: hand back an empty table with the right columns
	        # so the UI can still render.
	        return pd.DataFrame({name: [] for name in columns})

	def _score_predictions(predictions_df, ground_truth_df):
	    """Score predictions against every ground-truth question; return the counts and accuracies."""
	    missing = {'question_id', 'predicted_answer'} - set(predictions_df.columns)
	    if missing:
	        raise ValueError(
	            f"Prediction file is missing required column(s): {', '.join(sorted(missing))}"
	        )

	    # Keep one prediction per question so repeated rows cannot be counted twice.
	    predictions = predictions_df[['question_id', 'predicted_answer']].drop_duplicates(
	        subset='question_id', keep='first'
	    )
	    # Left-join from the ground truth: unanswered questions stay in the table
	    # (as missing predictions), so a partial submission is still scored
	    # against the full question set.
	    merged = ground_truth_df[['question_id', 'Answer']].merge(
	        predictions, on='question_id', how='left'
	    )
	    merged['pred_answer'] = merged['predicted_answer'].apply(clean_answer)

	    valid = merged.dropna(subset=['pred_answer'])
	    correct = int((valid['pred_answer'] == valid['Answer']).sum())
	    total = len(merged)
	    total_valid = len(valid)

	    return {
	        'overall_accuracy': correct / total if total > 0 else 0,
	        'valid_accuracy': correct / total_valid if total_valid > 0 else 0,
	        'correct_predictions': correct,
	        'total_questions': total,
	    }


	def evaluate_predictions(prediction_file, model_name, add_to_leaderboard):
	    """
	    Score an uploaded prediction CSV and optionally record it on the leaderboard.

	    Args:
	        prediction_file: Uploaded file; either an object exposing the path as
	            ``.name`` or the path itself. Falsy when nothing was uploaded.
	        model_name: Name stored with the result; blank becomes "Unknown Model".
	        add_to_leaderboard: When truthy, the result is passed to
	            ``update_leaderboard``.

	    Returns:
	        A ``(status_message, leaderboard_dataframe)`` tuple. Failures are
	        reported through the status message; nothing is raised.
	    """
	    # Check the cheap local precondition before any network access.
	    if not prediction_file:
	        return "Prediction file not uploaded.", load_leaderboard()

	    try:
	        ground_truth_path = hf_hub_download(
	            repo_id="SondosMB/ground-truth-dataset",
	            filename="ground_truth.csv",
	            repo_type="dataset",
	            token=HF_TOKEN,  # `use_auth_token` is deprecated in huggingface_hub
	        )
	        ground_truth_df = pd.read_csv(ground_truth_path)
	    except FileNotFoundError:
	        return "Ground truth file not found in the dataset repository.", load_leaderboard()
	    except Exception as e:
	        return f"Error loading ground truth: {e}", load_leaderboard()

	    try:
	        # The upload may arrive as a file-like wrapper or as a plain path.
	        prediction_path = getattr(prediction_file, "name", prediction_file)
	        predictions_df = pd.read_csv(prediction_path)

	        results = _score_predictions(predictions_df, ground_truth_df)
	        results['model_name'] = (model_name or "").strip() or "Unknown Model"

	        summary = (
	            f" Overall accuracy: {results['overall_accuracy']:.2%}"
	            f" ({results['correct_predictions']}/{results['total_questions']} correct)."
	        )

	        if not add_to_leaderboard:
	            return "Evaluation completed but not added to leaderboard." + summary, load_leaderboard()

	        # Only an explicit False signals failure; a None return is treated
	        # as success.
	        if update_leaderboard(results) is False:
	            return (
	                "Evaluation completed but the leaderboard could not be updated." + summary,
	                load_leaderboard(),
	            )
	        return "Evaluation completed and added to leaderboard." + summary, load_leaderboard()

	    except Exception as e:
	        return f"Error during evaluation: {str(e)}", load_leaderboard()

	# Make sure the leaderboard CSV exists before the UI below reads it.
	initialize_leaderboard_file()

	# NOTE(review): gradio is already imported at the top of the file; this
	# repeated import is harmless.
	import gradio as gr

	# Stylesheet passed to gr.Blocks(css=...) below.
	css_tech_theme = """
	body {
	background-color: #f4f6fa;
	color: #333333;
	font-family: 'Roboto', sans-serif;
	line-height: 1.8;
	}

	.center-content {
	display: flex;
	flex-direction: column;
	align-items: center;
	justify-content: center;
	text-align: center;
	margin: 30px 0;
	padding: 20px;
	}

	h1, h2 {
	color: #5e35b1;
	margin: 15px 0;
	text-align: center;
	}
	img {
	width: 100px;
	height: 100px;
	}
	"""

	# Build the Gradio interface: a banner followed by three tabs (overview,
	# submission, leaderboard), then start the server.
	with gr.Blocks(css=css_tech_theme) as demo:
	    gr.Markdown("""
	<div class="center-content">
	<h1>🏆 Mobile-MMLU Benchmark Competition</h1>
	<h2>🌟 Welcome to the Competition</h2>
	<p>
	Welcome to the Mobile-MMLU Benchmark Competition. Here you can submit your predictions,
	view the leaderboard, and track your performance!
	</p>
	<hr>
	</div>
	""")

	    with gr.Tabs(elem_id="tabs"):
	        with gr.TabItem("📖 Overview"):
	            # NOTE(review): the GitHub link in this text is still a placeholder.
	            gr.Markdown("""
	Welcome to the Mobile-MMLU Benchmark Competition! Evaluate mobile-compatible Large Language Models (LLMs) on 16,186 scenario-based and factual questions across 80 fields.
	---
	## What is Mobile-MMLU?
	Mobile-MMLU is a benchmark designed to test the capabilities of LLMs optimized for mobile use. Contribute to advancing mobile AI systems by competing to achieve the highest accuracy.
	---
	## How It Works
	1. Download the Dataset
	Access the dataset and instructions on our [GitHub page](https://github.com/your-github-repo).
	2. Generate Predictions
	Use your LLM to answer the dataset questions. Format your predictions as a CSV file.
	3. Submit Predictions
	Upload your predictions on this platform.
	4. Evaluation
	Submissions are scored on accuracy.
	5. Leaderboard
	View real-time rankings on the leaderboard.
	---
	""")

	        with gr.TabItem("📤 Submission"):
	            with gr.Row():
	                file_input = gr.File(label="Upload Prediction CSV", file_types=[".csv"], interactive=True)
	                model_name_input = gr.Textbox(label="Model Name", placeholder="Enter your model name")

	            with gr.Row():
	                overall_accuracy_display = gr.Number(label="Overall Accuracy", interactive=False)
	                add_to_leaderboard_checkbox = gr.Checkbox(label="Add to Leaderboard?", value=True)

	            eval_button = gr.Button("Evaluate")
	            eval_status = gr.Textbox(label="Evaluation Status", interactive=False)

	        with gr.TabItem("🏅 Leaderboard"):
	            leaderboard_table = gr.Dataframe(
	                value=load_leaderboard(),
	                label="Leaderboard",
	                interactive=False,
	                wrap=True,
	            )
	            refresh_button = gr.Button("Refresh Leaderboard")
	            refresh_button.click(
	                load_leaderboard,
	                inputs=[],
	                outputs=[leaderboard_table],
	            )

	    def handle_evaluation(file, model_name, add_to_leaderboard):
	        """Run the evaluation and return (status, accuracy or None, leaderboard)."""
	        status, leaderboard = evaluate_predictions(file, model_name, add_to_leaderboard)
	        # The last leaderboard row belongs to this submission only when the
	        # evaluation reports that it was added. In every other case (error,
	        # or "do not add") that row is some earlier model's score, so the
	        # number is cleared instead of showing a misleading value.
	        was_added = bool(add_to_leaderboard) and status.startswith(
	            "Evaluation completed and added to leaderboard"
	        )
	        if was_added and not leaderboard.empty:
	            overall_accuracy = float(leaderboard.iloc[-1]["Overall Accuracy"])
	        else:
	            overall_accuracy = None
	        return status, overall_accuracy, leaderboard

	    # Registered after the leaderboard tab exists so that the table can be
	    # refreshed by the same click that runs the evaluation.
	    eval_button.click(
	        handle_evaluation,
	        inputs=[file_input, model_name_input, add_to_leaderboard_checkbox],
	        outputs=[eval_status, overall_accuracy_display, leaderboard_table],
	    )

	    gr.Markdown(f"Last updated on {LAST_UPDATED}")

	demo.launch()



	# # Custom CSS to match website style
	# # Define CSS to match a modern, professional design
	# # Define enhanced CSS for the entire layout
	# css_tech_theme = """
	# body {
	# font-family: 'Roboto', sans-serif;
	# background-color: #f4f6fa;
	# color: #333333;
	# margin: 0;
	# padding: 0;
	# }

	# /* Header Styling */
	# header {
	# text-align: center;
	# padding: 60px 20px;
	# background: linear-gradient(135deg, #6a1b9a, #64b5f6);
	# color: #ffffff;
	# border-radius: 12px;
	# margin-bottom: 30px;
	# box-shadow: 0 6px 20px rgba(0, 0, 0, 0.2);
	# }

	# header h1 {
	# font-size: 3.5em;
	# font-weight: bold;
	# margin-bottom: 10px;
	# }

	# header h2 {
	# font-size: 2em;
	# margin-bottom: 15px;
	# }

	# header p {
	# font-size: 1.2em;
	# line-height: 1.8;
	# }

	# .header-buttons {
	# display: flex;
	# justify-content: center;
	# gap: 15px;
	# margin-top: 20px;
	# }

	# .header-buttons a {
	# text-decoration: none;
	# font-size: 1.1em;
	# padding: 15px 30px;
	# border-radius: 30px;
	# font-weight: bold;
	# background: #ffffff;
	# color: #6a1b9a;
	# transition: transform 0.3s, background 0.3s;
	# box-shadow: 0 4px 10px rgba(0, 0, 0, 0.1);
	# }

	# .header-buttons a:hover {
	# background: #64b5f6;
	# color: #ffffff;
	# transform: scale(1.05);
	# }

	# /* Pre-Tabs Section */
	# .pre-tabs {
	# text-align: center;
	# padding: 40px 20px;
	# background: linear-gradient(135deg, #ffffff, #f9fafb);
	# border-top: 5px solid #64b5f6;
	# border-bottom: 5px solid #6a1b9a;
	# }

	# .pre-tabs h2 {
	# font-size: 2.5em;
	# color: #333333;
	# margin-bottom: 15px;
	# }

	# .pre-tabs p {
	# font-size: 1.2em;
	# color: #555555;
	# line-height: 1.8;
	# }

	# /* Tabs Section */
	# .tabs {
	# margin: 0 auto;
	# padding: 20px;
	# background: #ffffff;
	# border-radius: 12px;
	# box-shadow: 0 4px 15px rgba(0, 0, 0, 0.1);
	# max-width: 1200px;
	# }

	# /* Post-Tabs Section */
	# .post-tabs {
	# text-align: center;
	# padding: 40px 20px;
	# background: linear-gradient(135deg, #64b5f6, #6a1b9a);
	# color: #ffffff;
	# border-radius: 12px;
	# margin-top: 30px;
	# }

	# .post-tabs h2 {
	# font-size: 2.5em;
	# margin-bottom: 15px;
	# }

	# .post-tabs p {
	# font-size: 1.2em;
	# line-height: 1.8;
	# margin-bottom: 20px;
	# }

	# .post-tabs a {
	# text-decoration: none;
	# font-size: 1.1em;
	# padding: 15px 30px;
	# border-radius: 30px;
	# font-weight: bold;
	# background: #ffffff;
	# color: #6a1b9a;
	# transition: transform 0.3s, background 0.3s;
	# box-shadow: 0 4px 10px rgba(0, 0, 0, 0.1);
	# }

	# .post-tabs a:hover {
	# background: #6a1b9a;
	# color: #ffffff;
	# transform: scale(1.05);
	# }

	# /* Footer */
	# footer {
	# background: linear-gradient(135deg, #6a1b9a, #8e44ad);
	# color: #ffffff;
	# text-align: center;
	# padding: 40px 20px;
	# margin-top: 30px;
	# border-radius: 12px;
	# box-shadow: 0 4px 10px rgba(0, 0, 0, 0.2);
	# }

	# footer h2 {
	# font-size: 1.8em;
	# margin-bottom: 15px;
	# }

	# footer p {
	# font-size: 1.1em;
	# line-height: 1.6;
	# margin-bottom: 20px;
	# }

	# footer .social-links {
	# display: flex;
	# justify-content: center;
	# gap: 15px;
	# margin-top: 20px;
	# }

	# footer .social-links a {
	# text-decoration: none;
	# font-size: 1.1em;
	# padding: 10px 20px;
	# border-radius: 8px;
	# font-weight: bold;
	# background: #ffffff;
	# color: #6a1b9a;
	# transition: transform 0.3s, background 0.3s;
	# }

	# footer .social-links a:hover {
	# background: #64b5f6;
	# color: #ffffff;
	# transform: scale(1.1);
	# }
	# """

	# # Gradio Interface
	# with gr.Blocks(css=css_tech_theme) as demo:
	# # Header Section
	# gr.Markdown("""
	# <header>
	# <h1>🏆 Mobile-MMLU Benchmark Competition</h1>
	# <h2>🚀 Push the Boundaries of Mobile AI</h2>
	# <p>
	# Test and optimize mobile-compatible Large Language Models (LLMs) with cutting-edge benchmarks
	# across 80 fields and over 16,000 questions.
	# </p>
	# <div class="header-buttons">
	# <a href="#overview">Learn More</a>
	# <a href="#submission">Submit Predictions</a>
	# <a href="#leaderboard">View Leaderboard</a>
	# </div>
	# </header>
	# """)

	# # Pre-Tabs Section
	# gr.Markdown("""
	# <section class="pre-tabs">
	# <h2>Why Participate?</h2>
	# <p>
	# The Mobile-MMLU Benchmark Competition is a unique opportunity to test your LLMs against
	# real-world scenarios. Compete to drive innovation and make your mark in mobile AI.
	# </p>
	# </section>
	# """)

	# # Tabs Section
	# with gr.Tabs(elem_id="tabs"):
	# # Overview Tab
	# with gr.TabItem("📖 Overview"):
	# gr.Markdown("""
	# <div class="tabs">
	# <h2>About the Competition</h2>
	# <p>
	# The Mobile-MMLU Benchmark Competition is an exciting challenge for mobile-optimized
	# LLMs. Compete to achieve the highest accuracy and contribute to advancements in mobile AI.
	# </p>
	# <h3>How It Works</h3>
	# <ul>
	# <li>1️⃣ <strong>Download the Dataset:</strong> Access the dataset and instructions on our
	# <a href="https://github.com/your-github-repo" target="_blank">GitHub page</a>.</li>
	# <li>2️⃣ <strong>Generate Predictions:</strong> Use your LLM to answer the dataset questions.
	# Format your predictions as a CSV file.</li>
	# <li>3️⃣ <strong>Submit Predictions:</strong> Upload your predictions on this platform.</li>
	# <li>4️⃣ <strong>Evaluation:</strong> Submissions are scored based on accuracy.</li>
	# <li>5️⃣ <strong>Leaderboard:</strong> View real-time rankings on the leaderboard.</li>
	# </ul>
	# </div>
	# """)

	# # Submission Tab
	# with gr.TabItem("📤 Submission"):
	# gr.Markdown("<div class='tabs'><h2>Submit Your Predictions</h2></div>")
	# with gr.Row():
	# file_input = gr.File(label="Upload Prediction CSV", file_types=[".csv"], interactive=True)
	# model_name_input = gr.Textbox(label="Model Name", placeholder="Enter your model name")
	# with gr.Row():
	# overall_accuracy_display = gr.Number(label="Overall Accuracy", interactive=False)
	# add_to_leaderboard_checkbox = gr.Checkbox(label="Add to Leaderboard?", value=True)
	# eval_button = gr.Button("Evaluate")
	# eval_status = gr.Textbox(label="Evaluation Status", interactive=False)

	# def handle_evaluation(file, model_name, add_to_leaderboard):
	# return "Evaluation complete. Model added to leaderboard.", 85.0

	# eval_button.click(
	# handle_evaluation,
	# inputs=[file_input, model_name_input, add_to_leaderboard_checkbox],
	# outputs=[eval_status, overall_accuracy_display],
	# )

	# # Leaderboard Tab
	# with gr.TabItem("🏅 Leaderboard"):
	# leaderboard_table = gr.Dataframe(
	# value=load_leaderboard(), # Initial data
	# label="Leaderboard",
	# interactive=False,
	# wrap=True,)
	# refresh_button = gr.Button("Refresh Leaderboard")
	# refresh_button.click(
	# load_leaderboard, # Fetch latest data
	# inputs=[],
	# outputs=[leaderboard_table],
	# )

	# # Post-Tabs Section
	# gr.Markdown("""
	# <section class="post-tabs">
	# <h2>Ready to Compete?</h2>
	# <p>
	# Submit your predictions today and make your mark in advancing mobile AI technologies.
	# Show the world what your model can achieve!
	# </p>
	# <a href="#submission">Start Submitting</a>
	# </section>
	# """)

	# # Footer Section
	# gr.Markdown("""
	# <footer>
	# <h2>Stay Connected</h2>
	# <p>
	# Follow us on social media or contact us for any queries. Let's shape the future of AI together!
	# </p>
	# <div class="social-links">
	# <a href="https://twitter.com" target="_blank">Twitter</a>
	# <a href="https://linkedin.com" target="_blank">LinkedIn</a>
	# <a href="https://github.com" target="_blank">GitHub</a>
	# </div>
	# </footer>
	# """)

	# # Launch the interface
	# demo.launch()