from dataclasses import dataclass from enum import Enum @dataclass class Task: benchmark: str metric: str col_name: str type: str source: str # Select your tasks here # --------------------------------------------------- class Tasks(Enum): # task_key in the json file, metric_key in the json file, name to display in the leaderboard # base task0 = Task("arc_easy", "accuracy", "ARC-Easy", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/arc") task1 = Task("arc_challenge", "accuracy", "ARC-Challenge", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/arc") task2 = Task("drop", "mean", "DROP", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/drop") task3 = Task("winogrande", "accuracy", "WinoGrande", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/winogrande") task4 = Task("gsm8k", "accuracy", "GSM8K", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gsm8k") task5 = Task("hellaswag", "accuracy", "HellaSwag", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/hellaswag") task6 = Task("humaneval", "mean", "HumanEval", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/humaneval") task7 = Task("ifeval", "final_acc", "IFEval", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/ifeval") task8 = Task("math", "accuracy", "MATH", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/mathematics") task9 = Task("mmlu", "accuracy", "MMLU", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/mmlu") task10 = Task("mmlu_pro", "accuracy", "MMLU-Pro", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/mmlu_pro") task11 = Task("gpqa_diamond", "accuracy", "GPQA-Diamond", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gpqa") task12 = Task("mmmu_multiple_choice", "accuracy", "MMMU-Multiple-Choice", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/mmmu") task13 = Task("mmmu_open", "accuracy", "MMMU-Open-Ended", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/mmmu") # agentic task14 = Task("gaia", "mean", "GAIA", "agentic", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gaia") task15 = Task("gdm_intercode_ctf", "accuracy", "GDM-InterCode-CTF", "agentic", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gdm_capabilities/intercode_ctf") NUM_FEWSHOT = 0 # Change with your few shot # --------------------------------------------------- # Your leaderboard name TITLE = """