import pandas as pd
import gradio as gr
import csv
import json
import os
import shutil
from huggingface_hub import Repository

HF_TOKEN = os.environ.get("HF_TOKEN")

SUBJECTS = ["Biology", "Business", "Chemistry", "Computer Science", "Economics", "Engineering",
            "Health", "History", "Law", "Math", "Philosophy", "Physics", "Psychology", "Other"]

MODEL_INFO = [
    "Models", "Overall",
    "Biology", "Business", "Chemistry", "Computer Science", "Economics", "Engineering",
    "Health", "History", "Law", "Math", "Philosophy", "Physics", "Psychology", "Other"]

DATA_TITLE_TYPE = ['markdown', 'number', 'number', 'number', 'number', 'number', 'number',
                   'number', 'number', 'number', 'number', 'number', 'number', 'number',
                   'number', 'number']

SUBMISSION_NAME = "mmlu_pro_leaderboard_submission"
SUBMISSION_URL = os.path.join("https://huggingface.co/datasets/TIGER-Lab/", SUBMISSION_NAME)
CSV_DIR = "./mmlu_pro_leaderboard_submission/results.csv"

COLUMN_NAMES = MODEL_INFO

LEADERBOARD_INTRODUCTION = """# MMLU-Pro Leaderboard

Welcome to the MMLU-Pro leaderboard, showcasing the performance of various advanced language models on the MMLU-Pro dataset. The MMLU-Pro dataset is an enhanced version of the original MMLU, specifically engineered to offer a more rigorous and realistic evaluation environment.

The MMLU-Pro dataset consists of approximately 12,000 intricate questions that challenge the comprehension and reasoning abilities of LLMs. Below you can find the accuracies of different models tested on this dataset.

For detailed information about the dataset, visit our page on Hugging Face: MMLU-Pro at Hugging Face.

If you are interested in replicating these results or wish to evaluate your models using our dataset, access our evaluation scripts available on GitHub: TIGER-AI-Lab/MMLU-Pro.
"""

TABLE_INTRODUCTION = """
"""

LEADERBOARD_INFO = """
We list the information of the used datasets as follows:
"""
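
# NOTE: illustrative helper, not part of the original file. A minimal sketch of how an
# uploaded submission dict could be checked against the schema implied by the constants
# above (keys "Model" and "Overall", plus one accuracy per subject in SUBJECTS) before
# it is appended to results.csv. The function name is hypothetical.
def validate_submission(upload_data: dict) -> list:
    """Return a list of problems found in an uploaded submission dict (empty if none)."""
    problems = []
    for key in ["Model", "Overall"] + SUBJECTS:
        if key not in upload_data:
            problems.append(f"missing key: {key}")
    for key in ["Overall"] + SUBJECTS:
        value = upload_data.get(key)
        if value is not None and not isinstance(value, (int, float)):
            problems.append(f"non-numeric score for {key}: {value!r}")
    return problems
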
""" CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results" CITATION_BUTTON_TEXT = r"""""" SUBMIT_INTRODUCTION = """# Submit on Science Leaderboard Introduction ## ⚠ Please note that you need to submit the json file with following format: ```json { "Model": "[MODEL_NAME]", "Overall": 0.5678, "Biology": 0.1234, "Business": 0.4567, ..., "Other: 0.3456" } ``` After submitting, you can click the "Refresh" button to see the updated leaderboard (it may takes few seconds). """ def get_df(): repo = Repository(local_dir=SUBMISSION_NAME, clone_from=SUBMISSION_URL, use_auth_token=HF_TOKEN) repo.git_pull() df = pd.read_csv(CSV_DIR) df = df.sort_values(by=['Overall'], ascending=False) return df[COLUMN_NAMES] def add_new_eval( input_file, ): if input_file is None: return "Error! Empty file!" upload_data = json.loads(input_file) print("upload_data:\n", upload_data) data_row = [f'{upload_data["Model"]}', upload_data['Overall']] for subject in SUBJECTS: data_row += [upload_data[subject]] print("data_row:\n", data_row) submission_repo = Repository(local_dir=SUBMISSION_NAME, clone_from=SUBMISSION_URL, use_auth_token=HF_TOKEN, repo_type="dataset") submission_repo.git_pull() already_submitted = [] with open(CSV_DIR, mode='r') as file: reader = csv.reader(file, delimiter=',') for row in reader: already_submitted.append(row[0]) if data_row[0] not in already_submitted: with open(CSV_DIR, mode='a', newline='') as file: writer = csv.writer(file) writer.writerow(data_row) submission_repo.push_to_hub() print('Submission Successful') else: print('The entry already exists') def refresh_data(): return get_df()