Fix PoT
Browse files
app.py
CHANGED
@@ -77,6 +77,20 @@ def get_leaderboard_df(merge_values: bool = True):
|
|
77 |
level = k.split("|")[1].split(":")[-1]
|
78 |
value = v["qem"]
|
79 |
df.loc[model_revision, f"{task}_{level}"] = value
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
80 |
# For AlpacaEval we report the base win rate and the length-corrected one
|
81 |
elif task.lower() == "alpaca_eval":
|
82 |
value = data["results"][first_result_key]["win_rate"]
|
|
|
77 |
level = k.split("|")[1].split(":")[-1]
|
78 |
value = v["qem"]
|
79 |
df.loc[model_revision, f"{task}_{level}"] = value
|
80 |
+
# For kaggle_pot we report N metrics, one for each prompt, and store each one as a separate column in the dataframe
|
81 |
+
elif task.lower() in ["aimo_kaggle_medium_pot"]:
|
82 |
+
for k, v in data["results"].items():
|
83 |
+
if k != "all" and "_average" not in k:
|
84 |
+
version = k.split("|")[1].split(":")[-1]
|
85 |
+
value = v["qem"]
|
86 |
+
df.loc[model_revision, f"{task}_{version}"] = value
|
87 |
+
# For kaggle_pot we report N metrics, one for each prompt, and store each one as a separate column in the dataframe
|
88 |
+
elif task.lower() in ["aimo_kaggle_hard_pot"]:
|
89 |
+
for k, v in data["results"].items():
|
90 |
+
if k != "all" and "_average" not in k:
|
91 |
+
version = k.split("|")[1].split(":")[-1]
|
92 |
+
value = v["qem"]
|
93 |
+
df.loc[model_revision, f"{task}_{version}"] = value
|
94 |
# For AlpacaEval we report the base win rate and the length-corrected one
|
95 |
elif task.lower() == "alpaca_eval":
|
96 |
value = data["results"][first_result_key]["win_rate"]
|