Report correct IFEval score
app.py
@@ -39,6 +39,9 @@ def get_leaderboard_df():
             # TruthfulQA has two metrics, so we need to pick the `mc2` one that's reported on the leaderboard
             if task == "truthfulqa":
                 value = data["results"][first_result_key]["truthfulqa_mc2"]
+            # IFEval has several metrics but we report just the prompt-loose-acc one
+            elif task == "ifeval":
+                value = data["results"][first_result_key]["prompt_level_loose_acc"]
             else:
                 first_metric_key = next(
                     iter(data["results"][first_result_key])
@@ -46,6 +49,9 @@ def get_leaderboard_df():
                 value = data["results"][first_result_key][first_metric_key]  # gets the value of the first metric
             df.loc[model_revision, task] = value
 
+    # Put IFEval in first column
+    ifeval_col = df.pop("Ifeval")
+    df.insert(1, "Ifeval", ifeval_col)
     df.insert(loc=1, column="Average", value=df.mean(axis=1, numeric_only=True))
     df = df.sort_values(by=["Average"], ascending=False)
     df = df.reset_index().rename(columns={"index": "Model"}).round(3)
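For context, a minimal, self-contained sketch of what the two additions do. The results payload below is a hypothetical stand-in (the result key and the extra metric name are illustrative; only prompt_level_loose_acc comes from the diff), and the small DataFrame exists only to demonstrate the pop/insert column move:

import pandas as pd

# Hypothetical stand-in for one results JSON; the real app reads these from disk.
data = {
    "results": {
        "ifeval|0": {
            "prompt_level_loose_acc": 0.45,   # the metric the leaderboard reports
            "prompt_level_strict_acc": 0.41,  # other IFEval metrics are ignored
        }
    }
}
first_result_key = next(iter(data["results"]))
task = "ifeval"
if task == "ifeval":
    value = data["results"][first_result_key]["prompt_level_loose_acc"]
print(value)  # 0.45

# Moving a column: pop returns the column as a Series and drops it from the frame,
# then insert places it back at position 1.
df = pd.DataFrame({"Model": ["m1"], "Gsm8k": [0.7], "Ifeval": [value]})
ifeval_col = df.pop("Ifeval")
df.insert(1, "Ifeval", ifeval_col)
print(df.columns.tolist())  # ['Model', 'Ifeval', 'Gsm8k']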