import os
import re
from datetime import datetime

import gradio as gr
import pandas as pd
from huggingface_hub import HfApi, hf_hub_download

LEADERBOARD_FILE = "leaderboard.csv"
GROUND_TRUTH_FILE = "ground_truth.csv"
LAST_UPDATED = datetime.now().strftime("%B %d, %Y")

# Ensure authentication and suppress symlink warnings
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"
HF_TOKEN = os.getenv("HF_TOKEN")
if not HF_TOKEN:
    raise ValueError("HF_TOKEN environment variable is not set or invalid.")

LEADERBOARD_COLUMNS = [
    "Model Name", "Overall Accuracy", "Valid Accuracy",
    "Correct Predictions", "Total Questions", "Timestamp",
]


def initialize_leaderboard_file():
    """Ensure the leaderboard file exists and has the correct headers."""
    if not os.path.exists(LEADERBOARD_FILE) or os.stat(LEADERBOARD_FILE).st_size == 0:
        pd.DataFrame(columns=LEADERBOARD_COLUMNS).to_csv(LEADERBOARD_FILE, index=False)


def clean_answer(answer):
    """Normalize a free-form predicted answer to a single letter A-D (or None)."""
    if pd.isna(answer):
        return None
    answer = str(answer)
    clean = re.sub(r'[^A-Da-d]', '', answer)
    return clean[0].upper() if clean else None
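
# Examples of what clean_answer returns (its behavior follows directly from
# the regex above, which keeps only A-D characters and takes the first one):
#
#   clean_answer("b")      -> "B"
#   clean_answer("(C)")    -> "C"
#   clean_answer(" d. ")   -> "D"
#   clean_answer("E")      -> None  (no A-D character present)
#
# Caveat: the first A-D character anywhere in the string wins, so a verbose
# prediction such as "The answer is C" resolves to "A" (from the "a" in
# "answer"). Submissions should therefore contain bare option letters.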
def update_leaderboard(results):
    """
    Append new submission results to the leaderboard file and push the update
    to the Hugging Face repository.
    """
    new_entry = {
        "Model Name": results['model_name'],
        "Overall Accuracy": round(results['overall_accuracy'] * 100, 2),
        "Valid Accuracy": round(results['valid_accuracy'] * 100, 2),
        "Correct Predictions": results['correct_predictions'],
        "Total Questions": results['total_questions'],
        "Timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
    }

    try:
        # Update the local leaderboard file
        new_entry_df = pd.DataFrame([new_entry])
        file_exists = os.path.exists(LEADERBOARD_FILE)

        new_entry_df.to_csv(
            LEADERBOARD_FILE,
            mode='a',                # append mode
            index=False,
            header=not file_exists,  # write header only if the file is new
        )
        print(f"Leaderboard updated successfully at {LEADERBOARD_FILE}")

        # Push the updated file to the Hugging Face repository via the HTTP API
        api = HfApi()
        api.upload_file(
            path_or_fileobj=LEADERBOARD_FILE,
            path_in_repo="leaderboard.csv",
            repo_id="SondosMB/ss",  # the Space repository hosting this app
            repo_type="space",
            token=HF_TOKEN,
        )
        print("Leaderboard changes pushed to Hugging Face repository.")
    except Exception as e:
        print(f"Error updating leaderboard file: {e}")


def load_leaderboard():
    if not os.path.exists(LEADERBOARD_FILE) or os.stat(LEADERBOARD_FILE).st_size == 0:
        return pd.DataFrame(columns=LEADERBOARD_COLUMNS)
    return pd.read_csv(LEADERBOARD_FILE)


def evaluate_predictions(prediction_file, model_name, add_to_leaderboard):
    """
    Score a prediction file against the ground truth. Returns a tuple of
    (status message, leaderboard DataFrame, results dict or None on failure).
    """
    try:
        ground_truth_path = hf_hub_download(
            repo_id="SondosMB/ground-truth-dataset",
            filename=GROUND_TRUTH_FILE,
            repo_type="dataset",
            token=HF_TOKEN,
        )
        ground_truth_df = pd.read_csv(ground_truth_path)
    except FileNotFoundError:
        return "Ground truth file not found in the dataset repository.", load_leaderboard(), None
    except Exception as e:
        return f"Error loading ground truth: {e}", load_leaderboard(), None

    if not prediction_file:
        return "Prediction file not uploaded.", load_leaderboard(), None

    try:
        # Load the prediction file
        predictions_df = pd.read_csv(prediction_file.name)

        # Validate required columns in the prediction file
        required_columns = ['question_id', 'predicted_answer']
        missing_columns = [col for col in required_columns if col not in predictions_df.columns]
        if missing_columns:
            return (f"Error: Missing required columns in prediction file: {', '.join(missing_columns)}.",
                    load_leaderboard(), None)

        # Validate the 'Answer' column in the ground truth file
        if 'Answer' not in ground_truth_df.columns:
            return "Error: 'Answer' column is missing in the ground truth dataset.", load_leaderboard(), None

        merged_df = pd.merge(predictions_df, ground_truth_df, on='question_id', how='inner')
        merged_df['pred_answer'] = merged_df['predicted_answer'].apply(clean_answer)

        valid_predictions = merged_df.dropna(subset=['pred_answer'])
        correct_predictions = (valid_predictions['pred_answer'] == valid_predictions['Answer']).sum()
        total_predictions = len(merged_df)
        total_valid_predictions = len(valid_predictions)

        # Overall accuracy counts unparseable answers as wrong; valid accuracy
        # is computed over parseable answers only.
        overall_accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0
        valid_accuracy = correct_predictions / total_valid_predictions if total_valid_predictions > 0 else 0

        results = {
            'model_name': model_name if model_name else "Unknown Model",
            'overall_accuracy': overall_accuracy,
            'valid_accuracy': valid_accuracy,
            'correct_predictions': correct_predictions,
            'total_questions': total_predictions,
        }

        if add_to_leaderboard:
            update_leaderboard(results)
            return "Evaluation completed and added to leaderboard.", load_leaderboard(), results
        return "Evaluation completed but not added to leaderboard.", load_leaderboard(), results
    except Exception as e:
        return f"Error during evaluation: {str(e)}", load_leaderboard(), None
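
# Worked example of the two accuracy figures above, using hypothetical
# numbers: if 100 predictions merge with the ground truth, 8 of them have no
# recoverable A-D letter, and 69 of the remaining 92 are correct, then:
#
#   overall_accuracy = 69 / 100 = 0.69   (unparseable answers count as wrong)
#   valid_accuracy   = 69 / 92  = 0.75   (parseable answers only)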
initialize_leaderboard_file()

# Custom CSS to match a modern, professional website design
css_tech_theme = """
body { font-family: 'Roboto', sans-serif; background-color: #f4f6fa; color: #333333; margin: 0; padding: 0; }

/* Header Styling */
header { text-align: center; padding: 60px 20px; background: linear-gradient(135deg, #6a1b9a, #64b5f6); color: #ffffff; border-radius: 12px; margin-bottom: 30px; box-shadow: 0 6px 20px rgba(0, 0, 0, 0.2); }
header h1 { font-size: 3.5em; font-weight: bold; margin-bottom: 10px; }
header h2 { font-size: 2em; margin-bottom: 15px; }
header p { font-size: 1em; line-height: 1.8; }
.header-buttons { display: flex; justify-content: center; gap: 15px; margin-top: 20px; }
.header-buttons a { text-decoration: none; font-size: 1.5em; padding: 15px 30px; border-radius: 30px; font-weight: bold; background: #ffffff; color: #6a1b9a; transition: transform 0.3s, background 0.3s; box-shadow: 0 4px 10px rgba(0, 0, 0, 0.1); }
.header-buttons a:hover { background: #64b5f6; color: #ffffff; transform: scale(1.05); }

/* Pre-Tabs Section */
.pre-tabs { text-align: center; padding: 40px 20px; background: linear-gradient(135deg, #ffffff, #f9fafb); border-top: 5px solid #64b5f6; border-bottom: 5px solid #6a1b9a; }
.pre-tabs h2, .post-tabs h2 { font-size: 3em; /* larger for better visibility */ }
.pre-tabs p, .post-tabs p { font-size: 2.5em; /* paragraph text size */ }
.pre-tabs h2 { color: #333333; margin-bottom: 15px; }
.pre-tabs p { color: #555555; line-height: 1.8; }

/* Tabs Section */
.tabs { margin: 0 auto; padding: 20px; background: #ffffff; border-radius: 12px; box-shadow: 0 4px 15px rgba(0, 0, 0, 0.1); /* max-width: 1300px; (change 1) */ }

/* Post-Tabs Section */
.post-tabs { text-align: center; padding: 40px 20px; background: linear-gradient(135deg, #64b5f6, #6a1b9a); color: #ffffff; border-radius: 12px; margin-top: 30px; }
.post-tabs h2 { font-size: 3.4em; margin-bottom: 15px; }
.post-tabs p { font-size: 2em; line-height: 1.8; margin-bottom: 20px; }
.post-tabs a { text-decoration: none; font-size: 1.1em; padding: 15px 30px; border-radius: 30px; font-weight: bold; background: #ffffff; color: #6a1b9a; transition: transform 0.3s, background 0.3s; box-shadow: 0 4px 10px rgba(0, 0, 0, 0.1); }
.post-tabs a:hover { background: #6a1b9a; color: #ffffff; transform: scale(1.05); }

/* Footer */
footer { background: linear-gradient(135deg, #6a1b9a, #8e44ad); color: #ffffff; text-align: center; padding: 40px 20px; margin-top: 30px; border-radius: 12px; box-shadow: 0 4px 10px rgba(0, 0, 0, 0.2); }
footer h2 { font-size: 1.5em; margin-bottom: 15px; }
footer p { font-size: 0.8em; line-height: 1.6; margin-bottom: 20px; }

/* Link Styling */
.social-links { display: flex; justify-content: center; gap: 15px; /* space between links */ }
.social-link { display: inline-block; text-decoration: none; color: #ffffff; background-color: #6a1b9a; /* purple button background */ padding: 10px 20px; border-radius: 30px; font-size: 16px; font-weight: bold; transition: all 0.3s ease; box-shadow: 0 4px 10px rgba(0, 0, 0, 0.1); }
.social-link:hover { background-color: #8c52d3; /* darker shade on hover */ box-shadow: 0 6px 15px rgba(0, 0, 0, 0.2); transform: translateY(-2px); }
.social-link:active { transform: translateY(1px); box-shadow: 0 3px 8px rgba(0, 0, 0, 0.1); }

/* Submission Section Styling */
.submission-section { margin: 40px auto; padding: 30px; background: linear-gradient(135deg, #ffffff, #f9f9ff); border-radius: 12px; box-shadow: 0 4px 15px rgba(0, 0, 0, 0.1); max-width: 800px; text-align: center; }
.submission-section h2 { font-size: 2.5em; color: #6a1b9a; margin-bottom: 20px; font-weight: bold; }
.submission-section p { font-size: 1.2em; color: #333; margin-bottom: 30px; }
#submission-fields { display: flex; flex-direction: column; gap: 20px; align-items: center; }
#submission-fields input[type="file"], #submission-fields input[type="text"] { width: 90%; max-width: 400px; padding: 12px 15px; font-size: 1em; border: 2px solid #d3bce8; border-radius: 8px; box-shadow: inset 0 2px 4px rgba(0, 0, 0, 0.05); transition: border-color 0.3s ease; }
#submission-fields input[type="file"]:focus, #submission-fields input[type="text"]:focus { border-color: #6a1b9a; outline: none; box-shadow: 0 0 5px rgba(106, 27, 154, 0.4); }
#submission-results { margin-top: 20px; text-align: center; }
#submission-buttons { display: flex; justify-content: center; gap: 15px; margin-top: 20px; }
#submission-buttons button { padding: 10px 20px; font-size: 1em; color: #ffffff; background: #6a1b9a; border: none; border-radius: 30px; cursor: pointer; font-weight: bold; transition: background 0.3s ease, transform 0.3s ease; box-shadow: 0 4px 10px rgba(0, 0, 0, 0.1); }
#submission-buttons button:hover { background: #8c52d3; transform: scale(1.05); }
#submission-buttons button:active { background: #5e1287; transform: scale(0.98); }
"""

# Create the Gradio interface
with gr.Blocks(css=css_tech_theme) as demo:
    # Header Section
    gr.Markdown("""

πŸ† Mobile-MMLU Challenge

πŸš€ Pushing the Limits of Mobile LLMs

""") # # Pre-Tabs Section gr.Markdown("""

<div class="pre-tabs">
  <h2>Why Participate?</h2>
  <p>The Mobile-MMLU Benchmark Competition offers a unique opportunity to evaluate your LLMs
  in real-world mobile scenarios. Join the challenge to drive innovation, showcase your
  expertise, and shape the future of mobile AI.</p>
</div>

""") # Tabs Section with gr.Tabs(elem_id="tabs"): # Overview Tab with gr.TabItem("πŸ“– Overview"): gr.Markdown("""

<h2>About the Competition</h2>
<p>The Mobile-MMLU Benchmark Competition is a premier challenge designed to evaluate and
advance mobile-optimized Large Language Models (LLMs). It provides an unparalleled
opportunity to showcase your model's ability to handle diverse, real-world scenarios while
pushing the boundaries of mobile intelligence.</p>

<p>With a dataset spanning 80 distinct fields and featuring 16,186 questions, this
competition emphasizes practical application. From education and healthcare to technology
and daily life, the questions are crafted to mimic real-world challenges and test the
adaptability, accuracy, and efficiency of mobile-compatible LLMs.</p>

<h2>Why Compete?</h2>
<p>Participating in this competition allows you to:</p>

<h2>How It Works</h2>

""") with gr.TabItem("πŸ“€ Submission"): gr.Markdown("""

<div class="submission-section">
  <h2>Submit Your Predictions</h2>
  <p>Upload your prediction file and provide your model name to evaluate and submit to the
  leaderboard. The file must be a CSV with the columns shown in the example below.</p>
</div>
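
An example of the expected file format (the `question_id` values below are
illustrative; `predicted_answer` should be a bare option letter A-D):

```csv
question_id,predicted_answer
q_0001,A
q_0002,C
```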

""") with gr.Row(elem_id="submission-fields"): file_input = gr.File(label="Upload Prediction CSV", file_types=[".csv"], interactive=True) model_name_input = gr.Textbox(label="Model Name", placeholder="Enter your model name") with gr.Row(elem_id="submission-results"): overall_accuracy_display = gr.Number(label="Overall Accuracy", interactive=False) with gr.Row(elem_id="submission-buttons"): eval_button = gr.Button("Evaluate") submit_button = gr.Button("Prove and Submit to Leaderboard", visible=False) eval_status = gr.Textbox(label="Evaluation Status", interactive=False) # Define the functions outside the `with` block def handle_evaluation(file, model_name): # Check if required inputs are provided if not file: return "Error: Please upload a prediction file.", 0, gr.update(visible=False) if not model_name or model_name.strip() == "": return "Error: Please enter a model name.", 0, gr.update(visible=False) try: # Load predictions file predictions_df = pd.read_csv(file.name) # Validate required columns in the prediction file required_columns = ['question_id', 'predicted_answer'] missing_columns = [col for col in required_columns if col not in predictions_df.columns] if missing_columns: return (f"Error: Missing required columns in prediction file: {', '.join(missing_columns)}.", 0, gr.update(visible=False)) # Perform evaluation status, leaderboard = evaluate_predictions(file, model_name, add_to_leaderboard=False) if leaderboard.empty: overall_accuracy = 0 else: overall_accuracy = leaderboard.iloc[-1]["Overall Accuracy"] # Show the submit button after successful evaluation return status, overall_accuracy, gr.update(visible=True) except Exception as e: # Handle unexpected errors return f"Error during evaluation: {str(e)}", 0, gr.update(visible=False) def handle_submission(file, model_name): # Handle leaderboard submission status, _ = evaluate_predictions(file, model_name, add_to_leaderboard=True) return f"Submission to leaderboard completed: {status}" # Connect button clicks to the functions eval_button.click( handle_evaluation, inputs=[file_input, model_name_input], outputs=[eval_status, overall_accuracy_display, submit_button], ) submit_button.click( handle_submission, inputs=[file_input, model_name_input], outputs=[eval_status], ) with gr.TabItem("πŸ… Leaderboard"): leaderboard_table = gr.Dataframe( value=load_leaderboard(), label="Leaderboard", interactive=False, wrap=True, ) refresh_button = gr.Button("Refresh Leaderboard") refresh_button.click( lambda: load_leaderboard(), inputs=[], outputs=[leaderboard_table], ) # Post-Tabs Section gr.Markdown("""

<div class="post-tabs">
  <h2>Ready to Compete?</h2>
  <p>Submit your predictions today and make your mark in advancing mobile AI technologies.
  Show the world what your model can achieve!</p>
</div>

""") # Footer Section gr.Markdown(""" """) demo.launch()