lewtun committed on
Commit 37ce6d8 · 1 Parent(s): b6155d5

Report correct IFEval score

Files changed (1)
  1. app.py +6 -0
app.py CHANGED
@@ -39,6 +39,9 @@ def get_leaderboard_df():
         # TruthfulQA has two metrics, so we need to pick the `mc2` one that's reported on the leaderboard
         if task == "truthfulqa":
             value = data["results"][first_result_key]["truthfulqa_mc2"]
+        # IFEval has several metrics but we report just the prompt-loose-acc one
+        elif task == "ifeval":
+            value = data["results"][first_result_key]["prompt_level_loose_acc"]
         else:
             first_metric_key = next(
                 iter(data["results"][first_result_key])
@@ -46,6 +49,9 @@ def get_leaderboard_df():
             value = data["results"][first_result_key][first_metric_key]  # gets the value of the first metric
         df.loc[model_revision, task] = value
 
+    # Put IFEval in first column
+    ifeval_col = df.pop("Ifeval")
+    df.insert(1, "Ifeval", ifeval_col)
     df.insert(loc=1, column="Average", value=df.mean(axis=1, numeric_only=True))
     df = df.sort_values(by=["Average"], ascending=False)
     df = df.reset_index().rename(columns={"index": "Model"}).round(3)
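
For reference, below is a minimal, self-contained sketch of the per-task metric selection after this commit. Only the task names and metric keys come from the diff above; the nested layout of the results dict (the "custom|ifeval|0" key and the example scores) is a hypothetical illustration, not the exact schema the app reads.

# Sketch of the metric selection, assuming a hypothetical results layout.
def pick_value(data: dict, task: str) -> float:
    first_result_key = next(iter(data["results"]))
    # TruthfulQA has two metrics, so pick the `mc2` one reported on the leaderboard
    if task == "truthfulqa":
        return data["results"][first_result_key]["truthfulqa_mc2"]
    # IFEval has several metrics; report only prompt-level loose accuracy
    if task == "ifeval":
        return data["results"][first_result_key]["prompt_level_loose_acc"]
    # Fallback: whatever metric happens to come first in the results dict
    first_metric_key = next(iter(data["results"][first_result_key]))
    return data["results"][first_result_key][first_metric_key]

# Hypothetical IFEval results file: without the dedicated branch, the fallback
# would return the first key it finds instead of prompt_level_loose_acc.
data = {
    "results": {
        "custom|ifeval|0": {
            "inst_level_loose_acc": 0.61,
            "prompt_level_loose_acc": 0.55,
            "prompt_level_strict_acc": 0.48,
        }
    }
}
print(pick_value(data, "ifeval"))  # 0.55

The second hunk does not change any scores: per its in-diff comment, it only pops the "Ifeval" column and re-inserts it at the front of the task columns before the "Average" column is added.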