Add AlpacaEval
Browse files
app.py
CHANGED
@@ -64,6 +64,9 @@ def get_leaderboard_df(merge_values: bool = True):
|
|
64 |
# MATH reports qem
|
65 |
elif task.lower() in ["math", "math_v2", "aimo_kaggle"]:
|
66 |
value = data["results"]["all"]["qem"]
|
|
|
|
|
|
|
67 |
else:
|
68 |
first_metric_key = next(
|
69 |
iter(data["results"][first_result_key])
|
@@ -80,13 +83,15 @@ def get_leaderboard_df(merge_values: bool = True):
|
|
80 |
else:
|
81 |
df.loc[model_revision, task] = value
|
82 |
|
83 |
-
# Put IFEval / BBH / AGIEval in first columns
|
|
|
|
|
84 |
ifeval_col = df.pop("Ifeval")
|
85 |
-
df.insert(
|
86 |
bbh_col = df.pop("Bbh")
|
87 |
-
df.insert(
|
88 |
agieval_col = df.pop("Agieval")
|
89 |
-
df.insert(
|
90 |
# Drop rows where every entry is NaN
|
91 |
df = df.dropna(how="all", axis=0, subset=[c for c in df.columns if c != "Date"])
|
92 |
df.insert(loc=1, column="Average", value=df.mean(axis=1, numeric_only=True))
|
|
|
64 |
# MATH reports qem
|
65 |
elif task.lower() in ["math", "math_v2", "aimo_kaggle"]:
|
66 |
value = data["results"]["all"]["qem"]
|
67 |
+
# Report the length-controlled win rate for AlpacaEval (value is divided by 100)
|
68 |
+
elif task.lower() == "alpaca_eval":
|
69 |
+
value = data["results"][first_result_key]["length_controlled_winrate"] / 100.0
|
70 |
else:
|
71 |
first_metric_key = next(
|
72 |
iter(data["results"][first_result_key])
|
|
|
83 |
else:
|
84 |
df.loc[model_revision, task] = value
|
85 |
|
86 |
+
# Put AlpacaEval / IFEval / BBH / AGIEval in the first columns (in that order)
|
87 |
+
alpaca_col = df.pop("Alpaca_eval")
|
88 |
+
df.insert(1, "Alpaca_eval", alpaca_col)
|
89 |
ifeval_col = df.pop("Ifeval")
|
90 |
+
df.insert(2, "Ifeval", ifeval_col)
|
91 |
bbh_col = df.pop("Bbh")
|
92 |
+
df.insert(3, "Bbh", bbh_col)
|
93 |
agieval_col = df.pop("Agieval")
|
94 |
+
df.insert(4, "Agieval", agieval_col)
|
95 |
# Drop rows where every entry is NaN
|
96 |
df = df.dropna(how="all", axis=0, subset=[c for c in df.columns if c != "Date"])
|
97 |
df.insert(loc=1, column="Average", value=df.mean(axis=1, numeric_only=True))
|