Commit eb538cb by xeon27 (parent: a2f2df3)
Change nomenclature to single-turn

Files changed:
- refactor_eval_results.py: +1 -1
- src/about.py: +18 -18
refactor_eval_results.py
@@ -3,7 +3,7 @@ import os
 
 
 METRIC_NAME = {
-    #
+    # single-turn
     "arc_easy": "accuracy",
    "arc_challenge": "accuracy",
    "gpqa_diamond": "accuracy",
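For context, a minimal sketch of how a mapping like METRIC_NAME is typically consumed when refactoring eval results. The extract_score helper and the results-dict layout are illustrative assumptions, not code from this repository:

# Illustrative sketch only. METRIC_NAME entries are taken from the diff;
# the helper and the results layout are assumptions.
METRIC_NAME = {
    # single-turn
    "arc_easy": "accuracy",
    "arc_challenge": "accuracy",
    "gpqa_diamond": "accuracy",
}

def extract_score(task: str, results: dict) -> float:
    """Look up the headline metric for a task in a raw results dict."""
    metric = METRIC_NAME[task]    # e.g. "accuracy" for arc_easy
    return results[task][metric]  # assumed layout: {task: {metric: value}}

# Hypothetical usage:
# extract_score("arc_easy", {"arc_easy": {"accuracy": 0.91}})  -> 0.91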
src/about.py
@@ -15,21 +15,21 @@ class Task:
 class Tasks(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
 
-    #
-    task0 = Task("arc_easy", "accuracy", "ARC-Easy", "
-    task1 = Task("arc_challenge", "accuracy", "ARC-Challenge", "
-    task2 = Task("drop", "mean", "DROP", "
-    task3 = Task("winogrande", "accuracy", "WinoGrande", "
-    task4 = Task("gsm8k", "accuracy", "GSM8K", "
-    task5 = Task("hellaswag", "accuracy", "HellaSwag", "
-    task6 = Task("humaneval", "mean", "HumanEval", "
-    task7 = Task("ifeval", "final_acc", "IFEval", "
-    task8 = Task("math", "accuracy", "MATH", "
-    task9 = Task("mmlu", "accuracy", "MMLU", "
-    task10 = Task("mmlu_pro", "accuracy", "MMLU-Pro", "
-    task11 = Task("gpqa_diamond", "accuracy", "GPQA-Diamond", "
-    task12 = Task("mmmu_multiple_choice", "accuracy", "MMMU-Multiple-Choice", "
-    task13 = Task("mmmu_open", "accuracy", "MMMU-Open-Ended", "
+    # single-turn
+    task0 = Task("arc_easy", "accuracy", "ARC-Easy", "single-turn", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/arc")
+    task1 = Task("arc_challenge", "accuracy", "ARC-Challenge", "single-turn", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/arc")
+    task2 = Task("drop", "mean", "DROP", "single-turn", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/drop")
+    task3 = Task("winogrande", "accuracy", "WinoGrande", "single-turn", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/winogrande")
+    task4 = Task("gsm8k", "accuracy", "GSM8K", "single-turn", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gsm8k")
+    task5 = Task("hellaswag", "accuracy", "HellaSwag", "single-turn", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/hellaswag")
+    task6 = Task("humaneval", "mean", "HumanEval", "single-turn", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/humaneval")
+    task7 = Task("ifeval", "final_acc", "IFEval", "single-turn", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/ifeval")
+    task8 = Task("math", "accuracy", "MATH", "single-turn", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/mathematics")
+    task9 = Task("mmlu", "accuracy", "MMLU", "single-turn", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/mmlu")
+    task10 = Task("mmlu_pro", "accuracy", "MMLU-Pro", "single-turn", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/mmlu_pro")
+    task11 = Task("gpqa_diamond", "accuracy", "GPQA-Diamond", "single-turn", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gpqa")
+    task12 = Task("mmmu_multiple_choice", "accuracy", "MMMU-Multiple-Choice", "single-turn", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/mmmu")
+    task13 = Task("mmmu_open", "accuracy", "MMMU-Open-Ended", "single-turn", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/mmmu")
 
     # agentic
     task14 = Task("gaia", "mean", "GAIA", "agentic", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gaia")
@@ -44,19 +44,19 @@ NUM_FEWSHOT = 0 # Change with your few shot
 # Your leaderboard name
 TITLE = """<h1 align="center" id="space-title">LLM Evaluation Leaderboard</h1>"""
 
-
+SINGLE_TURN_TASK_NAMES = ", ".join([f"[{task.value.col_name}]({task.value.source})" for task in Tasks if task.value.type == "single-turn"])
 AGENTIC_TASK_NAMES = ", ".join([f"[{task.value.col_name}]({task.value.source})" for task in Tasks if task.value.type == "agentic"])
 
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = f"""
-This leaderboard presents the performance of selected LLM models on a set of tasks. The tasks are divided into two categories:
+This leaderboard presents the performance of selected LLM models on a set of tasks. The tasks are divided into two categories: single-turn and agentic. The single-turn tasks are: {SINGLE_TURN_TASK_NAMES}. The agentic tasks are: {AGENTIC_TASK_NAMES}."""
 
 # Which evaluations are you running? how can people reproduce what you have?
 LLM_BENCHMARKS_TEXT = f"""
 ## How it works
 The following benchmarks are included:
 
-
+Single-turn: {SINGLE_TURN_TASK_NAMES}
 
 Agentic: {AGENTIC_TASK_NAMES}
 
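The src/about.py hunks construct Task with five positional fields and later read task.value.col_name, task.value.type, and task.value.source. A self-contained sketch consistent with that usage follows; the names of the first two fields (benchmark, metric) are assumptions, since the diff only shows how they are populated:

from dataclasses import dataclass
from enum import Enum

@dataclass
class Task:
    benchmark: str  # task key in the results json (field name assumed)
    metric: str     # metric key in the results json (field name assumed)
    col_name: str   # display name in the leaderboard (read by the joins)
    type: str       # "single-turn" or "agentic" (read by the joins)
    source: str     # URL of the benchmark implementation (read by the joins)

class Tasks(Enum):
    # Two representative entries from the diff; the full enum has task0-task14.
    task0 = Task("arc_easy", "accuracy", "ARC-Easy", "single-turn",
                 "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/arc")
    task14 = Task("gaia", "mean", "GAIA", "agentic",
                  "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gaia")

# The join expressions from the diff then render one markdown link per task:
SINGLE_TURN_TASK_NAMES = ", ".join(
    f"[{task.value.col_name}]({task.value.source})"
    for task in Tasks
    if task.value.type == "single-turn"
)
# -> "[ARC-Easy](https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/arc)"

Keeping type as a plain string lets the same comprehension build both lists, so the single-turn/agentic split propagates automatically into INTRODUCTION_TEXT and LLM_BENCHMARKS_TEXT.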