lewtun HF staff committed on
Commit
d48b380
·
1 Parent(s): 3f32071
Files changed (1) hide show
  1. app.py +14 -0
app.py CHANGED
@@ -77,6 +77,20 @@ def get_leaderboard_df(merge_values: bool = True):
77
  level = k.split("|")[1].split(":")[-1]
78
  value = v["qem"]
79
  df.loc[model_revision, f"{task}_{level}"] = value
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80
  # For AlpacaEval we report base winrate and length corrected one
81
  elif task.lower() == "alpaca_eval":
82
  value = data["results"][first_result_key]["win_rate"]
 
77
  level = k.split("|")[1].split(":")[-1]
78
  value = v["qem"]
79
  df.loc[model_revision, f"{task}_{level}"] = value
80
+ # For kaggle_pot we report N metrics, one for each prompt and store each one as a separate row in the dataframe
81
+ elif task.lower() in ["aimo_kaggle_medium_pot"]:
82
+ for k, v in data["results"].items():
83
+ if k != "all" and "_average" not in k:
84
+ version = k.split("|")[1].split(":")[-1]
85
+ value = v["qem"]
86
+ df.loc[model_revision, f"{task}_{version}"] = value
87
+ # For kaggle_pot we report N metrics, one for each prompt and store each one as a separate row in the dataframe
88
+ elif task.lower() in ["aimo_kaggle_hard_pot"]:
89
+ for k, v in data["results"].items():
90
+ if k != "all" and "_average" not in k:
91
+ version = k.split("|")[1].split(":")[-1]
92
+ value = v["qem"]
93
+ df.loc[model_revision, f"{task}_{version}"] = value
94
  # For AlpacaEval we report base winrate and length corrected one
95
  elif task.lower() == "alpaca_eval":
96
  value = data["results"][first_result_key]["win_rate"]