Commit eb538cb by xeon27 (parent: a2f2df3)
Change nomenclature to single-turn

Files changed:
- refactor_eval_results.py: +1 -1
- src/about.py: +18 -18
refactor_eval_results.py
@@ -3,7 +3,7 @@ import os
 
 
 METRIC_NAME = {
-    #
+    # single-turn
     "arc_easy": "accuracy",
    "arc_challenge": "accuracy",
    "gpqa_diamond": "accuracy",
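For context, a minimal sketch of how a mapping like METRIC_NAME is typically consumed when refactoring eval results. The extract_score helper and the results-dict layout are illustrative assumptions, not code from this repository:

# Illustrative sketch only. METRIC_NAME entries are taken from the diff;
# the helper and the results layout are assumptions.
METRIC_NAME = {
    # single-turn
    "arc_easy": "accuracy",
    "arc_challenge": "accuracy",
    "gpqa_diamond": "accuracy",
}

def extract_score(task: str, results: dict) -> float:
    """Look up the headline metric for a task in a raw results dict."""
    metric = METRIC_NAME[task]    # e.g. "accuracy" for arc_easy
    return results[task][metric]  # assumed layout: {task: {metric: value}}

# Hypothetical usage:
# extract_score("arc_easy", {"arc_easy": {"accuracy": 0.91}})  -> 0.91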
src/about.py
@@ -15,21 +15,21 @@ class Task:
 class Tasks(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
 
-    #
-    task0 = Task("arc_easy", "accuracy", "ARC-Easy", "
-    task1 = Task("arc_challenge", "accuracy", "ARC-Challenge", "
-    task2 = Task("drop", "mean", "DROP", "
-    task3 = Task("winogrande", "accuracy", "WinoGrande", "
-    task4 = Task("gsm8k", "accuracy", "GSM8K", "
-    task5 = Task("hellaswag", "accuracy", "HellaSwag", "
-    task6 = Task("humaneval", "mean", "HumanEval", "
-    task7 = Task("ifeval", "final_acc", "IFEval", "
-    task8 = Task("math", "accuracy", "MATH", "
-    task9 = Task("mmlu", "accuracy", "MMLU", "
-    task10 = Task("mmlu_pro", "accuracy", "MMLU-Pro", "
-    task11 = Task("gpqa_diamond", "accuracy", "GPQA-Diamond", "
-    task12 = Task("mmmu_multiple_choice", "accuracy", "MMMU-Multiple-Choice", "
-    task13 = Task("mmmu_open", "accuracy", "MMMU-Open-Ended", "
+    # single-turn
+    task0 = Task("arc_easy", "accuracy", "ARC-Easy", "single-turn", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/arc")
+    task1 = Task("arc_challenge", "accuracy", "ARC-Challenge", "single-turn", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/arc")
+    task2 = Task("drop", "mean", "DROP", "single-turn", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/drop")
+    task3 = Task("winogrande", "accuracy", "WinoGrande", "single-turn", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/winogrande")
+    task4 = Task("gsm8k", "accuracy", "GSM8K", "single-turn", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gsm8k")
+    task5 = Task("hellaswag", "accuracy", "HellaSwag", "single-turn", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/hellaswag")
+    task6 = Task("humaneval", "mean", "HumanEval", "single-turn", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/humaneval")
+    task7 = Task("ifeval", "final_acc", "IFEval", "single-turn", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/ifeval")
+    task8 = Task("math", "accuracy", "MATH", "single-turn", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/mathematics")
+    task9 = Task("mmlu", "accuracy", "MMLU", "single-turn", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/mmlu")
+    task10 = Task("mmlu_pro", "accuracy", "MMLU-Pro", "single-turn", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/mmlu_pro")
+    task11 = Task("gpqa_diamond", "accuracy", "GPQA-Diamond", "single-turn", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gpqa")
+    task12 = Task("mmmu_multiple_choice", "accuracy", "MMMU-Multiple-Choice", "single-turn", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/mmmu")
+    task13 = Task("mmmu_open", "accuracy", "MMMU-Open-Ended", "single-turn", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/mmmu")
 
     # agentic
     task14 = Task("gaia", "mean", "GAIA", "agentic", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gaia")
@@ -44,19 +44,19 @@ NUM_FEWSHOT = 0 # Change with your few shot
 # Your leaderboard name
 TITLE = """<h1 align="center" id="space-title">LLM Evaluation Leaderboard</h1>"""
 
-
+SINGLE_TURN_TASK_NAMES = ", ".join([f"[{task.value.col_name}]({task.value.source})" for task in Tasks if task.value.type == "single-turn"])
 AGENTIC_TASK_NAMES = ", ".join([f"[{task.value.col_name}]({task.value.source})" for task in Tasks if task.value.type == "agentic"])
 
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = f"""
-This leaderboard presents the performance of selected LLM models on a set of tasks. The tasks are divided into two categories:
+This leaderboard presents the performance of selected LLM models on a set of tasks. The tasks are divided into two categories: single-turn and agentic. The single-turn tasks are: {SINGLE_TURN_TASK_NAMES}. The agentic tasks are: {AGENTIC_TASK_NAMES}."""
 
 # Which evaluations are you running? how can people reproduce what you have?
 LLM_BENCHMARKS_TEXT = f"""
 ## How it works
 The following benchmarks are included:
 
-
+Single-turn: {SINGLE_TURN_TASK_NAMES}
 
 Agentic: {AGENTIC_TASK_NAMES}
 
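The src/about.py hunks construct Task with five positional fields and later read task.value.col_name, task.value.type, and task.value.source. A self-contained sketch consistent with that usage follows; the names of the first two fields (benchmark, metric) are assumptions, since the diff only shows how they are populated:

from dataclasses import dataclass
from enum import Enum

@dataclass
class Task:
    benchmark: str  # task key in the results json (field name assumed)
    metric: str     # metric key in the results json (field name assumed)
    col_name: str   # display name in the leaderboard (read by the joins)
    type: str       # "single-turn" or "agentic" (read by the joins)
    source: str     # URL of the benchmark implementation (read by the joins)

class Tasks(Enum):
    # Two representative entries from the diff; the full enum has task0-task14.
    task0 = Task("arc_easy", "accuracy", "ARC-Easy", "single-turn",
                 "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/arc")
    task14 = Task("gaia", "mean", "GAIA", "agentic",
                  "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gaia")

# The join expressions from the diff then render one markdown link per task:
SINGLE_TURN_TASK_NAMES = ", ".join(
    f"[{task.value.col_name}]({task.value.source})"
    for task in Tasks
    if task.value.type == "single-turn"
)
# -> "[ARC-Easy](https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/arc)"

Keeping type as a plain string lets the same comprehension build both lists, so the single-turn/agentic split propagates automatically into INTRODUCTION_TEXT and LLM_BENCHMARKS_TEXT.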