Spaces:

MBZUAI-LLM
/

Mobile-MMLU-Challenge

Sleeping

App Files Files Community

Mobile-MMLU-Challenge / app.py

SondosMB

Update app.py

e359f0e verified 11 months ago

raw

history blame

9.53 kB

	import gradio as gr
	import pandas as pd
	import os
	import re
	from datetime import datetime
	from huggingface_hub import hf_hub_download
	from huggingface_hub import HfApi, HfFolder

	# File names used by the app: the local leaderboard CSV and the ground-truth CSV.
	LEADERBOARD_FILE = "leaderboard.csv"
	GROUND_TRUTH_FILE = "ground_truth.csv"
	# Human-readable date captured once, when this module is executed.
	LAST_UPDATED = format(datetime.now(), "%B %d, %Y")

	# Silence the hub's symlink warning, then require an access token up front
	# so a missing token fails at startup rather than on the first submission.
	os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"
	HF_TOKEN = os.environ.get("HF_TOKEN")
	if not HF_TOKEN:
	    raise ValueError("HF_TOKEN environment variable is not set or invalid.")

	def initialize_leaderboard_file(path=None):
	    """
	    Ensure the leaderboard file exists and has the correct headers.

	    A header-only CSV is written when the file is missing or is zero bytes
	    long; an existing non-empty file is left untouched.

	    Args:
	        path: Optional CSV path to initialize. When omitted, the module-level
	            ``LEADERBOARD_FILE`` is used.
	    """
	    target = LEADERBOARD_FILE if path is None else path

	    # Missing and zero-byte files need exactly the same treatment, so handle
	    # them through a single guard instead of two duplicated branches.
	    if os.path.exists(target) and os.stat(target).st_size > 0:
	        return

	    pd.DataFrame(columns=[
	        "Model Name", "Overall Accuracy", "Valid Accuracy",
	        "Correct Predictions", "Total Questions", "Timestamp",
	    ]).to_csv(target, index=False)

	def clean_answer(answer):
	    """
	    Reduce a raw answer cell to a single upper-case option letter.

	    Every character outside A-D (either case) is stripped from
	    ``str(answer)``; the first remaining character is returned upper-cased.
	    Returns ``None`` when ``pd.isna(answer)`` is true or no such character
	    remains.
	    """
	    if pd.isna(answer):
	        return None

	    letters_only = re.sub(r'[^A-Da-d]', '', str(answer))
	    if not letters_only:
	        return None
	    return letters_only[:1].upper()


	def update_leaderboard(results):
	    """
	    Append new submission results to the leaderboard file and push updates to the Hugging Face repository.

	    Args:
	        results: Mapping with the keys 'model_name', 'overall_accuracy',
	            'valid_accuracy', 'correct_predictions' and 'total_questions'.
	            The two accuracy values are multiplied by 100 and rounded to two
	            decimals before being stored.

	    Raises:
	        KeyError: If ``results`` lacks one of the keys above.

	    Failures while writing the CSV or uploading it are printed, not raised.
	    """
	    new_entry = {
	        "Model Name": results['model_name'],
	        "Overall Accuracy": round(results['overall_accuracy'] * 100, 2),
	        "Valid Accuracy": round(results['valid_accuracy'] * 100, 2),
	        "Correct Predictions": results['correct_predictions'],
	        "Total Questions": results['total_questions'],
	        "Timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
	    }

	    # Step 1: update the local leaderboard file.
	    try:
	        # A file that exists but is zero bytes long still needs its header;
	        # checking existence alone would produce a header-less CSV.
	        needs_header = (
	            not os.path.exists(LEADERBOARD_FILE)
	            or os.stat(LEADERBOARD_FILE).st_size == 0
	        )
	        pd.DataFrame([new_entry]).to_csv(
	            LEADERBOARD_FILE,
	            mode='a',  # Append mode
	            index=False,
	            header=needs_header,
	        )
	        print(f"Leaderboard updated successfully at {LEADERBOARD_FILE}")
	    except Exception as e:
	        print(f"Error updating leaderboard file: {e}")
	        # Nothing new to publish if the local write failed.
	        return

	    # Step 2: push the updated file to the Hugging Face repository.
	    try:
	        api = HfApi()
	        # Prefer the HF_TOKEN environment variable (the one this app validates
	        # at startup); fall back to the locally stored token.
	        token = os.getenv("HF_TOKEN") or HfFolder.get_token()

	        api.upload_file(
	            path_or_fileobj=LEADERBOARD_FILE,
	            path_in_repo="leaderboard.csv",
	            repo_id="SondosMB/ss",  # Your Space repository
	            repo_type="space",
	            token=token,
	        )
	        print("Leaderboard changes pushed to Hugging Face repository.")
	    except Exception as e:
	        # Reported separately so an upload problem is not mistaken for a
	        # local file problem; the local CSV has already been updated.
	        print(f"Error pushing leaderboard to Hugging Face repository: {e}")



	def load_leaderboard(path=None):
	    """
	    Load the leaderboard CSV into a DataFrame.

	    Args:
	        path: Optional CSV path to read. When omitted, the module-level
	            ``LEADERBOARD_FILE`` is used.

	    Returns:
	        The parsed CSV, or an empty DataFrame with the leaderboard columns
	        when the file is missing, zero bytes long, or contains nothing
	        parseable (for example only blank lines).
	    """
	    target = LEADERBOARD_FILE if path is None else path

	    def _empty_leaderboard():
	        # Same shape the app uses everywhere else for "no entries yet".
	        return pd.DataFrame({
	            "Model Name": [],
	            "Overall Accuracy": [],
	            "Valid Accuracy": [],
	            "Correct Predictions": [],
	            "Total Questions": [],
	            "Timestamp": [],
	        })

	    if not os.path.exists(target) or os.stat(target).st_size == 0:
	        return _empty_leaderboard()

	    try:
	        return pd.read_csv(target)
	    except pd.errors.EmptyDataError:
	        # Non-zero size but no columns (e.g. whitespace only): treat as empty
	        # instead of crashing the UI.
	        return _empty_leaderboard()

	def evaluate_predictions(prediction_file, model_name, add_to_leaderboard):
	    """
	    Score an uploaded prediction CSV against the ground truth.

	    Args:
	        prediction_file: The uploaded file; either an object exposing its
	            path as ``.name`` or a plain path string. Falsy means nothing
	            was uploaded.
	        model_name: Name recorded on the leaderboard; "Unknown Model" is
	            used when empty.
	        add_to_leaderboard: When truthy, the result is appended to the
	            leaderboard via ``update_leaderboard``.

	    Returns:
	        A ``(status_message, leaderboard_dataframe)`` tuple. Errors are
	        reported through the status message rather than raised.
	    """
	    # Reject an empty submission before doing any network work.
	    if not prediction_file:
	        return "Prediction file not uploaded.", load_leaderboard()

	    try:
	        ground_truth_path = hf_hub_download(
	            repo_id="SondosMB/ground-truth-dataset",
	            filename="ground_truth.csv",
	            repo_type="dataset",
	            token=True,  # replaces the deprecated ``use_auth_token=True``
	        )
	        ground_truth_df = pd.read_csv(ground_truth_path)
	    except FileNotFoundError:
	        return "Ground truth file not found in the dataset repository.", load_leaderboard()
	    except Exception as e:
	        return f"Error loading ground truth: {e}", load_leaderboard()

	    try:
	        # Accept both an object carrying the path in ``.name`` and a bare path.
	        prediction_path = getattr(prediction_file, "name", prediction_file)
	        predictions_df = pd.read_csv(prediction_path)

	        # Fail with a readable message instead of a bare KeyError string.
	        missing_columns = {"question_id", "predicted_answer"} - set(predictions_df.columns)
	        if missing_columns:
	            return (
	                "Error during evaluation: prediction file is missing required "
	                f"column(s): {', '.join(sorted(missing_columns))}",
	                load_leaderboard(),
	            )

	        merged_df = pd.merge(predictions_df, ground_truth_df, on='question_id', how='inner')
	        merged_df['pred_answer'] = merged_df['predicted_answer'].apply(clean_answer)

	        # Rows whose prediction could not be reduced to A-D count as invalid.
	        valid_predictions = merged_df.dropna(subset=['pred_answer'])
	        correct_predictions = int(
	            (valid_predictions['pred_answer'] == valid_predictions['Answer']).sum()
	        )
	        # NOTE(review): the denominator is the number of submitted rows that
	        # matched a ground-truth question_id, not the size of the ground
	        # truth, so a partial submission is not penalized for unanswered
	        # questions (and duplicate question_ids are counted repeatedly).
	        # Confirm whether that is the intended scoring policy.
	        total_predictions = len(merged_df)
	        total_valid_predictions = len(valid_predictions)

	        overall_accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0
	        valid_accuracy = correct_predictions / total_valid_predictions if total_valid_predictions > 0 else 0

	        results = {
	            'model_name': model_name if model_name else "Unknown Model",
	            'overall_accuracy': overall_accuracy,
	            'valid_accuracy': valid_accuracy,
	            'correct_predictions': correct_predictions,
	            'total_questions': total_predictions,
	        }

	        if add_to_leaderboard:
	            update_leaderboard(results)
	            return "Evaluation completed and added to leaderboard.", load_leaderboard()
	        return "Evaluation completed but not added to leaderboard.", load_leaderboard()

	    except Exception as e:
	        return f"Error during evaluation: {str(e)}", load_leaderboard()

	# Create the leaderboard CSV (header row only) if it is missing or empty,
	# before the UI below reads it.
	initialize_leaderboard_file()

	# Redundant re-import: gradio is already imported at the top of the file.
	import gradio as gr

	# Stylesheet handed to gr.Blocks(css=...) when the interface is built:
	# page background/typography, a centered flex container, and heading colors.
	css_tech_theme = """
	body {
	background-color: #f4f6fa;
	color: #333333;
	font-family: 'Roboto', sans-serif;
	line-height: 1.8;
	}

	.center-content {
	display: flex;
	flex-direction: column;
	align-items: center;
	justify-content: center;
	text-align: center;
	margin: 30px 0;
	padding: 20px;
	}

	h1, h3 {
	color: #5e35b1;
	margin: 15px 0;
	text-align: center;
	}
	"""

	# NOTE: evaluate_predictions() and load_leaderboard() used below are the
	# implementations defined earlier in this file. Placeholder versions used to
	# be redefined at this point; because a later ``def`` replaces an earlier one,
	# they shadowed the real logic and made every submission report a fixed
	# result. They are intentionally not redefined here.

	# Date shown in the page footer. This assignment overrides the value computed
	# from datetime.now() near the top of the file.
	# NOTE(review): hard-coded date - confirm whether it should track real updates.
	LAST_UPDATED = "December 21, 2024"

	# Create the Gradio Interface
	with gr.Blocks(css=css_tech_theme) as demo:
	    gr.Markdown("""
	<div class="center-content">
	<h1>🏆 Mobile-MMLU Benchmark Competition</h1>
	<h3>🌟 Welcome to the Competition Overview</h3>
	<img src="https://via.placeholder.com/200" alt="Competition Logo">
	<p>
	Welcome to the Mobile-MMLU Benchmark Competition. Here you can submit your predictions,
	view the leaderboard, and track your performance!
	</p>
	<hr>
	</div>
	""")

	    with gr.Tabs(elem_id="tabs"):
	        with gr.TabItem("📖 Overview"):
	            gr.Markdown("""
	## Overview
	Welcome to the Mobile-MMLU Benchmark Competition! Evaluate mobile-compatible Large Language Models (LLMs) on 16,186 scenario-based and factual questions across 80 fields.
	---
	### What is Mobile-MMLU?
	Mobile-MMLU is a benchmark designed to test the capabilities of LLMs optimized for mobile use. Contribute to advancing mobile AI systems by competing to achieve the highest accuracy.
	---
	### How It Works
	1. Download the Dataset
	Access the dataset and instructions on our [GitHub page](https://github.com/your-github-repo).
	2. Generate Predictions
	Use your LLM to answer the dataset questions. Format your predictions as a CSV file.
	3. Submit Predictions
	Upload your predictions on this platform.
	4. Evaluation
	Submissions are scored on accuracy.
	5. Leaderboard
	View real-time rankings on the leaderboard.
	---
	""")

	        with gr.TabItem("📤 Submission"):
	            with gr.Row():
	                file_input = gr.File(label="📂 Upload Prediction CSV", file_types=[".csv"], interactive=True)
	                model_name_input = gr.Textbox(label="🖋️ Model Name", placeholder="Enter your model name")

	            with gr.Row():
	                # The former "Overall Accuracy" number box is gone: it was only
	                # ever filled by the placeholder evaluator's fixed value, and
	                # evaluate_predictions() returns (status, leaderboard) instead.
	                add_to_leaderboard_checkbox = gr.Checkbox(label="📊 Add to Leaderboard?", value=True)

	            eval_button = gr.Button("Evaluate", elem_id="evaluate-button")
	            eval_status = gr.Textbox(label="📢 Evaluation Status", interactive=False)

	        with gr.TabItem("🏅 Leaderboard"):
	            leaderboard_table = gr.Dataframe(
	                value=load_leaderboard(),
	                label="Leaderboard",
	                interactive=False,
	                wrap=True,
	            )
	            refresh_button = gr.Button("Refresh Leaderboard")
	            refresh_button.click(
	                load_leaderboard,
	                inputs=[],
	                outputs=[leaderboard_table],
	            )

	    # Registered after both tabs are built because the handler's second
	    # output is the leaderboard table, which must exist before it is wired.
	    eval_button.click(
	        evaluate_predictions,
	        inputs=[file_input, model_name_input, add_to_leaderboard_checkbox],
	        outputs=[eval_status, leaderboard_table],
	    )

	    gr.Markdown(f"Last updated: {LAST_UPDATED}")

	demo.launch()