lewtun committed
Commit 7ac902b · Parent: 06f2306

Hide math and mini_math

Files changed (1): app.py (+10 -2)
app.py CHANGED
@@ -10,6 +10,8 @@ DESCRIPTION = f"""
 Evaluation of H4 and community models across a diverse range of benchmarks from [LightEval](https://github.com/huggingface/lighteval). All scores are reported as accuracy.
 """
 
+BENCHMARKS_TO_SKIP = ["math", "mini_math"]
+
 
 def get_leaderboard_df(merge_values: bool = True):
     filepaths = list(Path("eval_results").rglob("*.json"))
@@ -35,6 +37,9 @@ def get_leaderboard_df(merge_values: bool = True):
         with open(filepath, "r") as file:
             data = json.load(file)
         first_result_key = next(iter(data["results"]))  # gets the first key in 'results'
+        # Skip benchmarks that we don't want to include in the leaderboard
+        if task.lower() in BENCHMARKS_TO_SKIP:
+            continue
         # TruthfulQA has two metrics, so we need to pick the `mc2` one that's reported on the leaderboard
         if task.lower() == "truthfulqa":
             value = data["results"][first_result_key]["truthfulqa_mc2"]
@@ -66,7 +71,7 @@ def get_leaderboard_df(merge_values: bool = True):
             value = data["results"][first_result_key][first_metric_key]  # gets the value of the first metric
 
         # For mini_math we report 5 metrics, one for each level and store each one as a separate row in the dataframe
-        if task.lower() in ["mini_math", "mini_math_v2"]:
+        if task.lower() in ["mini_math_v2"]:
             for k, v in data["results"].items():
                 if k != "all":
                     level = k.split("|")[1].split(":")[-1]
@@ -98,6 +103,9 @@ def get_leaderboard_df(merge_values: bool = True):
     df = df[["Model", "Date"]].merge(merged_df, on="Model", how="left")
     df.drop_duplicates(subset=["Model"], inplace=True)
     df = df.sort_values(by=["Average"], ascending=False).round(2)
+
+    # Trim minimath column names
+    df.columns = [c.replace("_level_", "_l") for c in df.columns]
     return df
 
 
@@ -137,7 +145,7 @@ with demo:
             value=leaderboard_df,
             wrap=True,
             height=1000,
-            column_widths=[400, 110] + [155] * len(leaderboard_df.columns[2:]),
+            column_widths=[400, 110] + [(150 + len(c)) for c in leaderboard_df.columns[2:]],
         )
     with gr.Row():
         refresh_button = gr.Button("Refresh")
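For reference, a minimal sketch of what the new BENCHMARKS_TO_SKIP guard and the "_level_" → "_l" column trim do. This is not the Space's app.py: the task names, scores, and column names below are hypothetical, and it assumes the leaderboard is a pandas DataFrame (as the df calls in the diff suggest).

import pandas as pd

# Mirrors the constant introduced in this commit
BENCHMARKS_TO_SKIP = ["math", "mini_math"]

# Hypothetical (task, accuracy) pairs standing in for parsed eval_results files
rows = [
    ("math", 0.12),          # dropped: listed in BENCHMARKS_TO_SKIP
    ("mini_math", 0.30),     # dropped: listed in BENCHMARKS_TO_SKIP
    ("mini_math_v2", 0.35),  # kept, still reported per level
    ("truthfulqa", 0.48),    # kept
]

# Equivalent to the `continue` guard inside the file loop
kept = [(task, value) for task, value in rows if task.lower() not in BENCHMARKS_TO_SKIP]
print(kept)  # [('mini_math_v2', 0.35), ('truthfulqa', 0.48)]

# Column-name trim applied just before get_leaderboard_df returns
df = pd.DataFrame(columns=["Model", "Date", "mini_math_v2_level_1", "mini_math_v2_level_5"])
df.columns = [c.replace("_level_", "_l") for c in df.columns]
print(list(df.columns))  # ['Model', 'Date', 'mini_math_v2_l1', 'mini_math_v2_l5']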
 
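And a quick sketch of the column_widths change for the gr.Dataframe: the old code gave every benchmark column a fixed 155 px, while the new code scales each width with the header length so longer names are not clipped. The column names here are hypothetical.

# Hypothetical leaderboard columns: the first two are Model and Date, the rest are benchmarks
columns = ["Model", "Date", "Average", "gsm8k", "mini_math_v2_l1", "truthfulqa"]

# Old behaviour: a fixed 155 px per benchmark column
old_widths = [400, 110] + [155] * len(columns[2:])

# New behaviour: width grows with the header length
new_widths = [400, 110] + [(150 + len(c)) for c in columns[2:]]

print(old_widths)  # [400, 110, 155, 155, 155, 155]
print(new_widths)  # [400, 110, 157, 155, 165, 160]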