"""Gradio leaderboard app: displays public results and scores new submissions.

At import time the module downloads the public results, the contact info and
the gold answers from the Hub, builds the two leaderboard dataframes, wires the
Gradio UI, starts an hourly Space restart job and launches the demo.
"""
import os
import json
import datetime
import requests
from email.utils import parseaddr

import gradio as gr
import pandas as pd
import numpy as np

from datasets import load_dataset, VerificationMode
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import HfApi

# InfoStrings
from scorer import question_scorer
from content import format_error, format_warning, format_log, TITLE, INTRODUCTION_TEXT, SUBMISSION_TEXT, CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT, model_hyperlink

TOKEN = os.environ.get("TOKEN", None)

OWNER = "gaia-benchmark"
DATA_DATASET = f"{OWNER}/GAIA"
INTERNAL_DATA_DATASET = f"{OWNER}/GAIA_internal"
SUBMISSION_DATASET = f"{OWNER}/submissions_internal"
SUBMISSION_DATASET_PUBLIC = f"{OWNER}/submissions_public"
CONTACT_DATASET = f"{OWNER}/contact_info"
RESULTS_DATASET = f"{OWNER}/results_public"
LEADERBOARD_PATH = f"{OWNER}/leaderboard"
api = HfApi()

YEAR_VERSION = "2023"
# Expected number of questions per split, in total and per level.
ref_scores_len = {"validation": 165, "test": 301}
ref_level_len = {"validation": {1: 53, 2: 86, 3: 26}, "test": {1: 93, 2: 159, 3: 49}}

os.makedirs("scored", exist_ok=True)

# Should be False on spaces and True outside
LOCAL_DEBUG = not (os.environ.get("system") == "spaces")

# Display the results
eval_results = load_dataset(RESULTS_DATASET, YEAR_VERSION, token=TOKEN, download_mode="force_redownload", verification_mode=VerificationMode.NO_CHECKS, trust_remote_code=True)
contact_infos = load_dataset(CONTACT_DATASET, YEAR_VERSION, token=TOKEN, download_mode="force_redownload", verification_mode=VerificationMode.NO_CHECKS, trust_remote_code=True)


def get_dataframe_from_results(eval_results, split):
    """Build the leaderboard dataframe for one split.

    Args:
        eval_results: mapping from split name to a `datasets` dataset of results.
        split: name of the split to render ("validation" or "test").

    Returns:
        A pandas DataFrame sorted by descending average score, with the model
        name turned into a hyperlink, display-friendly column names, and score
        columns expressed as percentages rounded to two decimals.
    """
    local_df = eval_results[split]
    # Replace the plain model name with a link built from the submission url.
    local_df = local_df.map(lambda row: {"model": model_hyperlink(row["url"], row["model"])})
    local_df = local_df.remove_columns(["system_prompt", "url"])
    local_df = local_df.rename_column("model", "Agent name")
    local_df = local_df.rename_column("model_family", "Model family")
    local_df = local_df.rename_column("score", "Average score (%)")
    for i in [1, 2, 3]:
        local_df = local_df.rename_column(f"score_level{i}", f"Level {i} score (%)")
    local_df = local_df.rename_column("date", "Submission date")
    df = pd.DataFrame(local_df)
    df = df.sort_values(by=["Average score (%)"], ascending=False)

    # Scores are stored as fractions; display them as percentages.
    numeric_cols = [c for c in local_df.column_names if "score" in c]
    df[numeric_cols] = df[numeric_cols].multiply(100).round(decimals=2)
    return df


eval_dataframe_val = get_dataframe_from_results(eval_results=eval_results, split="validation")
eval_dataframe_test = get_dataframe_from_results(eval_results=eval_results, split="test")

# Gold answers, indexed as gold_results[split][task_id] -> dataset row
gold_dataset = load_dataset(INTERNAL_DATA_DATASET, f"{YEAR_VERSION}_all", token=TOKEN, trust_remote_code=True)
gold_results = {split: {row["task_id"]: row for row in gold_dataset[split]} for split in ["test", "validation"]}


def restart_space():
    """Restart the leaderboard Space through the Hub API."""
    api.restart_space(repo_id=LEADERBOARD_PATH, token=TOKEN)


TYPES = ["markdown", "number", "number", "number", "number", "str", "str", "str"]


def add_new_eval(
    val_or_test: str,
    model: str,
    model_family: str,
    system_prompt: str,
    url: str,
    path_to_file: str,
    organisation: str,
    mail: str,
    profile: gr.OAuthProfile,
):
    """Validate, store, score and publish one submission.

    Args:
        val_or_test: split the submission targets ("validation" or "test").
        model: agent name shown on the leaderboard.
        model_family: model family shown on the leaderboard.
        system_prompt: example system prompt stored with the result.
        url: link to information about the model.
        path_to_file: uploaded file object from the Gradio file component
            (its `.name` attribute is used as the path), or None.
        organisation: submitting organisation.
        mail: contact email, stored in the private contact dataset.
        profile: OAuth profile of the logged-in user.

    Returns:
        A formatted HTML/markdown message (error, warning or success log).
    """
    # Was the profile created less than 2 month ago?
    # NOTE(review): the endpoint prefix was reconstructed — confirm it matches the deployed app.
    user_data = requests.get(f"https://huggingface.co/api/users/{profile.username}/overview")
    creation_date = json.loads(user_data.content)["createdAt"]
    # The timestamp carries a literal "Z" (UTC), so compare against an aware UTC "now".
    created_at = datetime.datetime.strptime(creation_date, '%Y-%m-%dT%H:%M:%S.%fZ').replace(tzinfo=datetime.timezone.utc)
    if datetime.datetime.now(datetime.timezone.utc) - created_at < datetime.timedelta(days=60):
        return format_error("This account is not authorized to submit on GAIA.")

    # Reload contact info so the one-submission-per-day check sees fresh data.
    contact_infos = load_dataset(CONTACT_DATASET, YEAR_VERSION, token=TOKEN, download_mode="force_redownload", verification_mode=VerificationMode.NO_CHECKS, trust_remote_code=True)
    today = datetime.datetime.today().strftime('%Y-%m-%d')
    last_submission_date = max(
        (row["date"] for row in contact_infos[val_or_test] if row["username"] == profile.username),
        default=None,
    )
    if last_submission_date == today:
        return format_error("You already submitted once today, please try again tomorrow.")

    is_validation = val_or_test == "validation"

    # Very basic email parsing
    _, parsed_mail = parseaddr(mail)
    if "@" not in parsed_mail:
        return format_warning("Please provide a valid email adress.")

    print("Adding new eval")

    # Check if the combination model/org already exists and prints a warning message if yes.
    # The pair is compared as a whole: matching the model and the organisation
    # independently would reject unrelated submissions.
    existing_pairs = {
        (m.lower(), o.lower())
        for m, o in zip(eval_results[val_or_test]["model"], eval_results[val_or_test]["organisation"])
    }
    if (model.lower(), organisation.lower()) in existing_pairs:
        return format_warning("This model has been already submitted.")

    if path_to_file is None:
        return format_warning("Please attach a file.")

    # SAVE UNSCORED SUBMISSION
    if LOCAL_DEBUG:
        print("mock uploaded submission")
    else:
        api.upload_file(
            repo_id=SUBMISSION_DATASET,
            path_or_fileobj=path_to_file.name,
            path_in_repo=f"{organisation}/{model}/{YEAR_VERSION}_{val_or_test}_raw_{datetime.datetime.today()}.jsonl",
            repo_type="dataset",
            token=TOKEN
        )

    # SAVE CONTACT
    contact_info = {
        "model": model,
        "model_family": model_family,
        "url": url,
        "organisation": organisation,
        "username": profile.username,
        "mail": mail,
        "date": today,
    }
    contact_infos[val_or_test] = contact_infos[val_or_test].add_item(contact_info)
    if LOCAL_DEBUG:
        print("mock uploaded contact info")
    else:
        contact_infos.push_to_hub(CONTACT_DATASET, config_name=YEAR_VERSION, token=TOKEN)

    # SCORE SUBMISSION
    file_path = path_to_file.name
    scores = {"all": 0, 1: 0, 2: 0, 3: 0}
    num_questions = {"all": 0, 1: 0, 2: 0, 3: 0}
    task_ids = []
    # NOTE(review): organisation/model are user-provided and used verbatim in this local path.
    scored_path = f"scored/{organisation}_{model}.jsonl"
    with open(scored_path, "w") as scored_file, open(file_path, 'r') as f:
        for ix, line in enumerate(f):
            try:
                task = json.loads(line)
            except Exception:
                return format_error(f"Line {ix} is incorrectly formatted. Please fix it and resubmit your file.")

            if "model_answer" not in task:
                return format_error(f"Line {ix} contains no model_answer key. Please fix it and resubmit your file.")
            if "task_id" not in task:
                # Without this guard a missing key would surface as an unhandled KeyError.
                return format_error(f"Line {ix} contains no task_id key. Please fix it and resubmit your file.")
            answer = task["model_answer"]
            task_id = task["task_id"]
            try:
                gold = gold_results[val_or_test][task_id]
                level = int(gold["Level"])
            except KeyError:
                return format_error(f"{task_id} not found in split {val_or_test}. Are you sure you submitted the correct file?")

            score = question_scorer(answer, gold["Final answer"])

            scored_file.write(
                json.dumps({
                    "id": task_id,
                    "model_answer": answer,
                    "score": score,
                    "level": level
                }) + "\n"
            )
            task_ids.append(task_id)

            scores["all"] += score
            scores[level] += score
            num_questions["all"] += 1
            num_questions[level] += 1

    # Check if there's any duplicate in the submission
    if len(task_ids) != len(set(task_ids)):
        return format_error("There are duplicates in your submission. Please check your file and resubmit it.")

    if any(num_questions[level] != ref_level_len[val_or_test][level] for level in [1, 2, 3]):
        return format_error(f"Your submission has {num_questions[1]} questions for level 1, {num_questions[2]} for level 2, and {num_questions[3]} for level 3, but it should have {ref_level_len[val_or_test][1]}, {ref_level_len[val_or_test][2]}, and {ref_level_len[val_or_test][3]} respectively. Please check your submission.")

    if num_questions[1] + num_questions[2] + num_questions[3] != ref_scores_len[val_or_test]:
        # Report the counted questions; the accumulated score is a number and has no len().
        return format_error(f"Your submission has {num_questions['all']} questions for the {val_or_test} set, but it should have {ref_scores_len[val_or_test]}. Please check your submission.")

    # SAVE SCORED SUBMISSION
    if LOCAL_DEBUG:
        print("mock uploaded scored submission")
    else:
        api.upload_file(
            repo_id=SUBMISSION_DATASET,
            path_or_fileobj=scored_path,
            path_in_repo=f"{organisation}/{model}/{YEAR_VERSION}_{val_or_test}_scored_{datetime.datetime.today()}.jsonl",
            repo_type="dataset",
            token=TOKEN
        )

    # Save scored file
    # NOTE(review): this public upload is not guarded by LOCAL_DEBUG, as in the original.
    if is_validation:
        api.upload_file(
            repo_id=SUBMISSION_DATASET_PUBLIC,
            path_or_fileobj=scored_path,
            path_in_repo=f"{organisation}/{model}/{YEAR_VERSION}_{val_or_test}_scored_{datetime.datetime.today()}.jsonl",
            repo_type="dataset",
            token=TOKEN
        )

    # SAVE TO LEADERBOARD DATA
    # Per-level counts were validated above, so the divisors are non-zero.
    eval_entry = {
        "model": model,
        "model_family": model_family,
        "system_prompt": system_prompt,
        "url": url,
        "organisation": organisation,
        "score": scores["all"] / ref_scores_len[val_or_test],
        "score_level1": scores[1] / num_questions[1],
        "score_level2": scores[2] / num_questions[2],
        "score_level3": scores[3] / num_questions[3],
        "date": today,
    }

    # Catching spam submissions of 100%
    if all(eval_entry[k] == 1 for k in ["score_level1", "score_level2", "score_level3"]):
        return format_error(f"There was a problem with your submission. Please open a discussion.")

    # Exact-duplicate detection is deliberately not implemented: it would let
    # people probe the content of other submissions.

    eval_results[val_or_test] = eval_results[val_or_test].add_item(eval_entry)
    print(eval_results)
    if LOCAL_DEBUG:
        print("mock uploaded results to lb")
    else:
        eval_results.push_to_hub(RESULTS_DATASET, config_name=YEAR_VERSION, token=TOKEN)

    return format_log(f"Model {model} submitted by {organisation} successfully.\nPlease wait a few hours and refresh the leaderboard to see your score displayed.")


def refresh():
    """Re-download the public results and rebuild both leaderboard dataframes.

    Returns:
        A tuple (validation dataframe, test dataframe).
    """
    eval_results = load_dataset(RESULTS_DATASET, YEAR_VERSION, token=TOKEN, download_mode="force_redownload", verification_mode=VerificationMode.NO_CHECKS, trust_remote_code=True)
    eval_dataframe_val = get_dataframe_from_results(eval_results=eval_results, split="validation")
    eval_dataframe_test = get_dataframe_from_results(eval_results=eval_results, split="test")
    return eval_dataframe_val, eval_dataframe_test


def upload_file(files):
    """Return the `.name` path of every uploaded file object in `files`."""
    file_paths = [file.name for file in files]
    return file_paths


# NOTE(review): widget nesting below was reconstructed from a flattened source — confirm layout.
demo = gr.Blocks()
with demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Row():
        with gr.Accordion("📙 Citation", open=False):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                elem_id="citation-button",
            )

    with gr.Tab("Results: Test"):
        leaderboard_table_test = gr.components.Dataframe(
            value=eval_dataframe_test, datatype=TYPES, interactive=False,
            column_widths=["20%"]
        )
    with gr.Tab("Results: Validation"):
        leaderboard_table_val = gr.components.Dataframe(
            value=eval_dataframe_val, datatype=TYPES, interactive=False,
            column_widths=["20%"]
        )

    refresh_button = gr.Button("Refresh")
    refresh_button.click(
        refresh,
        inputs=[],
        outputs=[
            leaderboard_table_val,
            leaderboard_table_test,
        ],
    )
    with gr.Accordion("Submit a new model for evaluation"):
        with gr.Row():
            gr.Markdown(SUBMISSION_TEXT, elem_classes="markdown-text")
        with gr.Row():
            with gr.Column():
                level_of_test = gr.Radio(["validation", "test"], value="validation", label="Split")
                model_name_textbox = gr.Textbox(label="Agent name")
                model_family_textbox = gr.Textbox(label="Model family")
                system_prompt_textbox = gr.Textbox(label="System prompt example")
                url_textbox = gr.Textbox(label="Url to model information")
            with gr.Column():
                organisation = gr.Textbox(label="Organisation")
                mail = gr.Textbox(label="Contact email (will be stored privately, & used if there is an issue with your submission)")
                file_output = gr.File()

        with gr.Row():
            gr.LoginButton()
            submit_button = gr.Button("Submit Eval")
        submission_result = gr.Markdown()
        # The OAuth profile argument of add_new_eval is not listed as an input here.
        submit_button.click(
            add_new_eval,
            [
                level_of_test,
                model_name_textbox,
                model_family_textbox,
                system_prompt_textbox,
                url_textbox,
                file_output,
                organisation,
                mail
            ],
            submission_result,
        )

# Restart the Space every hour.
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=3600)
scheduler.start()
demo.launch(debug=True)