Commit 9c55d6d · committed by xeon27 · Parent: 84a3b7a

Add model name links and change single-turn to base
Files changed:
- app.py +2 -2
- refactor_eval_results.py +1 -0
- src/about.py +18 -18
- src/display/formatting.py +2 -3
- src/populate.py +2 -2
app.py CHANGED

@@ -78,8 +78,8 @@ with demo:
     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

     with gr.Tabs(elem_classes="tab-buttons") as tabs:
-        with gr.TabItem("Single-Turn Benchmark", elem_id="llm-benchmark-tab-table", id=0):
-            leaderboard = init_leaderboard(ST_LEADERBOARD_DF, "single-turn")
+        with gr.TabItem("Base Benchmark", elem_id="llm-benchmark-tab-table", id=0):
+            leaderboard = init_leaderboard(ST_LEADERBOARD_DF, "base")

         with gr.TabItem("Agentic Benchmark", elem_id="llm-benchmark-tab-table", id=1):
             leaderboard = init_leaderboard(AGENTIC_LEADERBOARD_DF, "agentic")
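For orientation, init_leaderboard itself is not touched by this commit; the second argument is the renamed category string. A minimal sketch of how such a helper might use it, assuming hypothetical column lists and a plain gr.Dataframe rendering:

import gradio as gr
import pandas as pd

# Hypothetical column sets; the real ones are defined elsewhere in the Space.
BASE_COLS = ["Model", "ARC-Easy", "ARC-Challenge", "MMLU", "GSM8K"]
AGENTIC_COLS = ["Model", "GAIA"]

def init_leaderboard(df: pd.DataFrame, category: str):
    """Sketch: show only the columns that belong to the requested category."""
    cols = BASE_COLS if category == "base" else AGENTIC_COLS
    cols = [c for c in cols if c in df.columns]  # tolerate missing columns
    return gr.Dataframe(value=df[cols], interactive=False, wrap=True)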
refactor_eval_results.py CHANGED

@@ -106,6 +106,7 @@ def main():
     # Create dummy requests file
     requests = {
         "model": model_name,
+        "model_sha": MODEL_SHA_MAP[model_name],
         "base_model": "",
         "revision": "main",
         "private": False,
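MODEL_SHA_MAP is defined outside this hunk; judging by the commit message and by the value later being passed straight through as the link target, it presumably maps each evaluated model name to the URL used for its clickable name. A hedged sketch with purely illustrative entries:

# Assumed shape of MODEL_SHA_MAP: display name -> link target for the leaderboard.
# The entries below are examples, not the Space's actual contents.
MODEL_SHA_MAP = {
    "Meta-Llama-3.1-8B-Instruct": "https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct",
    "Mistral-7B-Instruct-v0.3": "https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3",
}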
src/about.py CHANGED

@@ -15,21 +15,21 @@ class Task:
 class Tasks(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard

-    # single-turn
-    task0 = Task("arc_easy", "accuracy", "ARC-Easy", "single-turn", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/arc")
-    task1 = Task("arc_challenge", "accuracy", "ARC-Challenge", "single-turn", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/arc")
-    task2 = Task("drop", "mean", "DROP", "single-turn", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/drop")
-    task3 = Task("winogrande", "accuracy", "WinoGrande", "single-turn", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/winogrande")
-    task4 = Task("gsm8k", "accuracy", "GSM8K", "single-turn", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gsm8k")
-    task5 = Task("hellaswag", "accuracy", "HellaSwag", "single-turn", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/hellaswag")
-    task6 = Task("humaneval", "mean", "HumanEval", "single-turn", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/humaneval")
-    task7 = Task("ifeval", "final_acc", "IFEval", "single-turn", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/ifeval")
-    task8 = Task("math", "accuracy", "MATH", "single-turn", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/mathematics")
-    task9 = Task("mmlu", "accuracy", "MMLU", "single-turn", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/mmlu")
-    task10 = Task("mmlu_pro", "accuracy", "MMLU-Pro", "single-turn", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/mmlu_pro")
-    task11 = Task("gpqa_diamond", "accuracy", "GPQA-Diamond", "single-turn", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gpqa")
-    task12 = Task("mmmu_multiple_choice", "accuracy", "MMMU-Multiple-Choice", "single-turn", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/mmmu")
-    task13 = Task("mmmu_open", "accuracy", "MMMU-Open-Ended", "single-turn", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/mmmu")
+    # base
+    task0 = Task("arc_easy", "accuracy", "ARC-Easy", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/arc")
+    task1 = Task("arc_challenge", "accuracy", "ARC-Challenge", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/arc")
+    task2 = Task("drop", "mean", "DROP", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/drop")
+    task3 = Task("winogrande", "accuracy", "WinoGrande", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/winogrande")
+    task4 = Task("gsm8k", "accuracy", "GSM8K", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gsm8k")
+    task5 = Task("hellaswag", "accuracy", "HellaSwag", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/hellaswag")
+    task6 = Task("humaneval", "mean", "HumanEval", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/humaneval")
+    task7 = Task("ifeval", "final_acc", "IFEval", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/ifeval")
+    task8 = Task("math", "accuracy", "MATH", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/mathematics")
+    task9 = Task("mmlu", "accuracy", "MMLU", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/mmlu")
+    task10 = Task("mmlu_pro", "accuracy", "MMLU-Pro", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/mmlu_pro")
+    task11 = Task("gpqa_diamond", "accuracy", "GPQA-Diamond", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gpqa")
+    task12 = Task("mmmu_multiple_choice", "accuracy", "MMMU-Multiple-Choice", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/mmmu")
+    task13 = Task("mmmu_open", "accuracy", "MMMU-Open-Ended", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/mmmu")

     # agentic
     task14 = Task("gaia", "mean", "GAIA", "agentic", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gaia")

@@ -44,19 +44,19 @@ NUM_FEWSHOT = 0 # Change with your few shot
 # Your leaderboard name
 TITLE = """<h1 align="center" id="space-title">Vector State of Evaluation Leaderboard</h1>"""

-SINGLE_TURN_TASK_NAMES = ", ".join([f"[{task.value.col_name}]({task.value.source})" for task in Tasks if task.value.type == "single-turn"])
+SINGLE_TURN_TASK_NAMES = ", ".join([f"[{task.value.col_name}]({task.value.source})" for task in Tasks if task.value.type == "base"])
 AGENTIC_TASK_NAMES = ", ".join([f"[{task.value.col_name}]({task.value.source})" for task in Tasks if task.value.type == "agentic"])

 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = f"""
-This leaderboard presents the performance of selected LLM models on a set of tasks. The tasks are divided into two categories: single-turn and agentic. The single-turn tasks are: {SINGLE_TURN_TASK_NAMES}. The agentic tasks are: {AGENTIC_TASK_NAMES}."""
+This leaderboard presents the performance of selected LLM models on a set of tasks. The tasks are divided into two categories: base and agentic. The base tasks are: {SINGLE_TURN_TASK_NAMES}. The agentic tasks are: {AGENTIC_TASK_NAMES}."""

 # Which evaluations are you running? how can people reproduce what you have?
 LLM_BENCHMARKS_TEXT = f"""
 ## How it works
 The following benchmarks are included:

-Single-Turn: {SINGLE_TURN_TASK_NAMES}
+Base: {SINGLE_TURN_TASK_NAMES}

 Agentic: {AGENTIC_TASK_NAMES}

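The Task dataclass these enum members instantiate sits above this hunk; from the five positional arguments and the task.value.col_name / .type / .source accesses, it presumably looks something like the sketch below (the first two field names are assumptions):

from dataclasses import dataclass

@dataclass
class Task:
    benchmark: str   # task key in the results json (assumed field name)
    metric: str      # metric key in the results json (assumed field name)
    col_name: str    # column name displayed in the leaderboard
    type: str        # "base" or "agentic"; splits the two tabs and task lists
    source: str      # link to the inspect_evals implementation

# SINGLE_TURN_TASK_NAMES then renders to comma-separated markdown links, e.g.
# "[ARC-Easy](https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/arc), ..."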
src/display/formatting.py CHANGED

@@ -2,9 +2,8 @@ def model_hyperlink(link, model_name):
     return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'


-def make_clickable_model(model_name):
-    link = f"https://huggingface.co/{model_name}"
-    return model_hyperlink(link, model_name)
+def make_clickable_model(model_name, model_sha):
+    return model_hyperlink(model_sha, model_name)


 def styled_error(error):
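A quick usage check of the revised helper (the name and URL are placeholders); note that despite the parameter name, model_sha is passed straight through as the href, so it is expected to carry a full link rather than a commit hash:

html = make_clickable_model(
    "Llama-3.1-8B-Instruct",                                    # display name
    "https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct",  # link target used as href
)
# html == '<a target="_blank" href="https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct" ...>Llama-3.1-8B-Instruct</a>'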
src/populate.py CHANGED

@@ -66,7 +66,7 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
             with open(file_path) as fp:
                 data = json.load(fp)

-            data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
+            data[EvalQueueColumn.model.name] = make_clickable_model(data["model_name"], data["model_sha"])
             data[EvalQueueColumn.revision.name] = data.get("revision", "main")

             all_evals.append(data)

@@ -78,7 +78,7 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
                 with open(file_path) as fp:
                     data = json.load(fp)

-                data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
+                data[EvalQueueColumn.model.name] = make_clickable_model(data["model_name"], data["model_sha"])
                 data[EvalQueueColumn.revision.name] = data.get("revision", "main")
                 all_evals.append(data)

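Tying the pieces together, a dummy request record written by refactor_eval_results.py and consumed by these hunks would look roughly like the sketch below; keys not visible in the diff ("model_name" in particular) are assumed, and the values are illustrative only:

# Assumed shape of a request record as read back in get_evaluation_queue_df.
record = {
    "model": "Meta-Llama-3.1-8B-Instruct",
    "model_name": "Meta-Llama-3.1-8B-Instruct",   # key expected by populate.py (assumed)
    "model_sha": "https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct",
    "base_model": "",
    "revision": "main",
    "private": False,
}

# populate.py then builds the clickable model cell from these two fields:
model_cell = make_clickable_model(record["model_name"], record["model_sha"])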