xeon27 committed
Commit 9c55d6d · 1 Parent(s): 84a3b7a

Add model name links and change single-turn to base

app.py CHANGED
@@ -78,8 +78,8 @@ with demo:
     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
 
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
-        with gr.TabItem("Single-turn Benchmark", elem_id="llm-benchmark-tab-table", id=0):
-            leaderboard = init_leaderboard(ST_LEADERBOARD_DF, "single-turn")
+        with gr.TabItem("Base Benchmark", elem_id="llm-benchmark-tab-table", id=0):
+            leaderboard = init_leaderboard(ST_LEADERBOARD_DF, "base")
 
         with gr.TabItem("Agentic Benchmark", elem_id="llm-benchmark-tab-table", id=1):
             leaderboard = init_leaderboard(AGENTIC_LEADERBOARD_DF, "agentic")
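For orientation, the second argument to `init_leaderboard` names the benchmark family whose columns a tab displays. `init_leaderboard` itself is not part of this commit, so the following is only a sketch of how such a filter could look, assuming the `Tasks` enum from `src/about.py` and a hypothetical "Model" column:

```python
import gradio as gr

from src.about import Tasks


def init_leaderboard_sketch(dataframe, benchmark_type):
    # Hypothetical stand-in for init_leaderboard (not the actual implementation):
    # keep the model column plus the columns of the requested family
    # ("base" or "agentic"), as declared on each Task in src/about.py.
    task_cols = [t.value.col_name for t in Tasks if t.value.type == benchmark_type]
    return gr.Dataframe(value=dataframe[["Model", *task_cols]], interactive=False)
```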
refactor_eval_results.py CHANGED
@@ -106,6 +106,7 @@ def main():
         # Create dummy requests file
         requests = {
             "model": model_name,
+            "model_sha": MODEL_SHA_MAP[model_name],
             "base_model": "",
             "revision": "main",
             "private": False,
src/about.py CHANGED
@@ -15,21 +15,21 @@ class Task:
 class Tasks(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
 
-    # single-turn
-    task0 = Task("arc_easy", "accuracy", "ARC-Easy", "single-turn", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/arc")
-    task1 = Task("arc_challenge", "accuracy", "ARC-Challenge", "single-turn", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/arc")
-    task2 = Task("drop", "mean", "DROP", "single-turn", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/drop")
-    task3 = Task("winogrande", "accuracy", "WinoGrande", "single-turn", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/winogrande")
-    task4 = Task("gsm8k", "accuracy", "GSM8K", "single-turn", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gsm8k")
-    task5 = Task("hellaswag", "accuracy", "HellaSwag", "single-turn", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/hellaswag")
-    task6 = Task("humaneval", "mean", "HumanEval", "single-turn", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/humaneval")
-    task7 = Task("ifeval", "final_acc", "IFEval", "single-turn", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/ifeval")
-    task8 = Task("math", "accuracy", "MATH", "single-turn", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/mathematics")
-    task9 = Task("mmlu", "accuracy", "MMLU", "single-turn", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/mmlu")
-    task10 = Task("mmlu_pro", "accuracy", "MMLU-Pro", "single-turn", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/mmlu_pro")
-    task11 = Task("gpqa_diamond", "accuracy", "GPQA-Diamond", "single-turn", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gpqa")
-    task12 = Task("mmmu_multiple_choice", "accuracy", "MMMU-Multiple-Choice", "single-turn", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/mmmu")
-    task13 = Task("mmmu_open", "accuracy", "MMMU-Open-Ended", "single-turn", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/mmmu")
+    # base
+    task0 = Task("arc_easy", "accuracy", "ARC-Easy", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/arc")
+    task1 = Task("arc_challenge", "accuracy", "ARC-Challenge", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/arc")
+    task2 = Task("drop", "mean", "DROP", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/drop")
+    task3 = Task("winogrande", "accuracy", "WinoGrande", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/winogrande")
+    task4 = Task("gsm8k", "accuracy", "GSM8K", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gsm8k")
+    task5 = Task("hellaswag", "accuracy", "HellaSwag", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/hellaswag")
+    task6 = Task("humaneval", "mean", "HumanEval", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/humaneval")
+    task7 = Task("ifeval", "final_acc", "IFEval", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/ifeval")
+    task8 = Task("math", "accuracy", "MATH", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/mathematics")
+    task9 = Task("mmlu", "accuracy", "MMLU", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/mmlu")
+    task10 = Task("mmlu_pro", "accuracy", "MMLU-Pro", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/mmlu_pro")
+    task11 = Task("gpqa_diamond", "accuracy", "GPQA-Diamond", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gpqa")
+    task12 = Task("mmmu_multiple_choice", "accuracy", "MMMU-Multiple-Choice", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/mmmu")
+    task13 = Task("mmmu_open", "accuracy", "MMMU-Open-Ended", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/mmmu")
 
     # agentic
     task14 = Task("gaia", "mean", "GAIA", "agentic", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gaia")
@@ -44,19 +44,19 @@ NUM_FEWSHOT = 0 # Change with your few shot
 # Your leaderboard name
 TITLE = """<h1 align="center" id="space-title">Vector State of Evaluation Leaderboard</h1>"""
 
-SINGLE_TURN_TASK_NAMES = ", ".join([f"[{task.value.col_name}]({task.value.source})" for task in Tasks if task.value.type == "single-turn"])
+SINGLE_TURN_TASK_NAMES = ", ".join([f"[{task.value.col_name}]({task.value.source})" for task in Tasks if task.value.type == "base"])
 AGENTIC_TASK_NAMES = ", ".join([f"[{task.value.col_name}]({task.value.source})" for task in Tasks if task.value.type == "agentic"])
 
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = f"""
-This leaderboard presents the performance of selected LLM models on a set of tasks. The tasks are divided into two categories: single-turn and agentic. The single-turn tasks are: {SINGLE_TURN_TASK_NAMES}. The agentic tasks are: {AGENTIC_TASK_NAMES}."""
+This leaderboard presents the performance of selected LLM models on a set of tasks. The tasks are divided into two categories: base and agentic. The base tasks are: {SINGLE_TURN_TASK_NAMES}. The agentic tasks are: {AGENTIC_TASK_NAMES}."""
 
 # Which evaluations are you running? how can people reproduce what you have?
 LLM_BENCHMARKS_TEXT = f"""
 ## How it works
 The following benchmarks are included:
 
-Single-turn: {SINGLE_TURN_TASK_NAMES}
+Base: {SINGLE_TURN_TASK_NAMES}
 
 Agentic: {AGENTIC_TASK_NAMES}
 
src/display/formatting.py CHANGED
@@ -2,9 +2,8 @@ def model_hyperlink(link, model_name):
     return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
 
 
-def make_clickable_model(model_name):
-    link = f"https://huggingface.co/{model_name}"
-    return model_hyperlink(link, model_name)
+def make_clickable_model(model_name, model_sha):
+    return model_hyperlink(model_sha, model_name)
 
 
 def styled_error(error):
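With the two-argument signature, the second value is passed straight to `model_hyperlink` as the href, so the caller supplies the full link itself. A small usage sketch with made-up values:

```python
# Illustrative values only.
cell = make_clickable_model(
    "Meta-Llama-3.1-8B-Instruct",
    "https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct",
)
# cell is an <a target="_blank" href="https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct" ...>
# element whose visible text is "Meta-Llama-3.1-8B-Instruct".
```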
src/populate.py CHANGED
@@ -66,7 +66,7 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
             with open(file_path) as fp:
                 data = json.load(fp)
 
-            data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
+            data[EvalQueueColumn.model.name] = make_clickable_model(data["model_name"], data["model_sha"])
             data[EvalQueueColumn.revision.name] = data.get("revision", "main")
 
             all_evals.append(data)
@@ -78,7 +78,7 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
                 with open(file_path) as fp:
                     data = json.load(fp)
 
-                data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
+                data[EvalQueueColumn.model.name] = make_clickable_model(data["model_name"], data["model_sha"])
                 data[EvalQueueColumn.revision.name] = data.get("revision", "main")
                 all_evals.append(data)
 
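Both loops now read `data["model_name"]` and `data["model_sha"]` from each request JSON before building the clickable cell, so queue records are expected to carry both keys. An illustrative record (values made up; real files live under the evaluation-queue save path):

```python
# Illustrative request record only; field values are hypothetical.
data = {
    "model_name": "Meta-Llama-3.1-8B-Instruct",
    "model_sha": "https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct",
    "base_model": "",
    "revision": "main",
    "private": False,
}
```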