Hide math and mini_math
app.py
CHANGED
@@ -10,6 +10,8 @@ DESCRIPTION = f"""
 Evaluation of H4 and community models across a diverse range of benchmarks from [LightEval](https://github.com/huggingface/lighteval). All scores are reported as accuracy.
 """
 
+BENCHMARKS_TO_SKIP = ["math", "mini_math"]
+
 
 def get_leaderboard_df(merge_values: bool = True):
     filepaths = list(Path("eval_results").rglob("*.json"))
@@ -35,6 +37,9 @@ def get_leaderboard_df(merge_values: bool = True):
         with open(filepath, "r") as file:
             data = json.load(file)
         first_result_key = next(iter(data["results"]))  # gets the first key in 'results'
+        # Skip benchmarks that we don't want to include in the leaderboard
+        if task.lower() in BENCHMARKS_TO_SKIP:
+            continue
         # TruthfulQA has two metrics, so we need to pick the `mc2` one that's reported on the leaderboard
         if task.lower() == "truthfulqa":
             value = data["results"][first_result_key]["truthfulqa_mc2"]
@@ -66,7 +71,7 @@ def get_leaderboard_df(merge_values: bool = True):
             value = data["results"][first_result_key][first_metric_key]  # gets the value of the first metric
 
         # For mini_math we report 5 metrics, one for each level and store each one as a separate row in the dataframe
-        if task.lower() in ["
+        if task.lower() in ["mini_math_v2"]:
             for k, v in data["results"].items():
                 if k != "all":
                     level = k.split("|")[1].split(":")[-1]
@@ -98,6 +103,9 @@ def get_leaderboard_df(merge_values: bool = True):
     df = df[["Model", "Date"]].merge(merged_df, on="Model", how="left")
     df.drop_duplicates(subset=["Model"], inplace=True)
     df = df.sort_values(by=["Average"], ascending=False).round(2)
+
+    # Trim minimath column names
+    df.columns = [c.replace("_level_", "_l") for c in df.columns]
     return df
 
 
@@ -137,7 +145,7 @@ with demo:
             value=leaderboard_df,
             wrap=True,
             height=1000,
-            column_widths=[400, 110] + [
+            column_widths=[400, 110] + [(150 + len(c)) for c in leaderboard_df.columns[2:]],
         )
     with gr.Row():
         refresh_button = gr.Button("Refresh")
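For reviewers, a minimal, self-contained sketch of the three behaviours this commit combines: dropping the benchmarks listed in BENCHMARKS_TO_SKIP, shortening the per-level mini_math column names, and sizing the leaderboard columns from their header lengths. The scores dict, the model name, and the column labels below are illustrative stand-ins, not values taken from app.py.

import pandas as pd

# Mirrors the constant added at module level in app.py.
BENCHMARKS_TO_SKIP = ["math", "mini_math"]

# Illustrative per-task scores for one model; in app.py these come from eval_results/*.json.
scores = {
    "math": 41.2,                  # hidden by the new skip list
    "mini_math_v2_level_1": 88.0,  # kept: only bare "math"/"mini_math" are skipped
    "truthfulqa": 52.3,
}

# 1) Skip hidden benchmarks, as the new `continue` in the results loop does.
kept = {task: value for task, value in scores.items() if task.lower() not in BENCHMARKS_TO_SKIP}

df = pd.DataFrame([{"Model": "org/model-a", "Date": "2024-05-01", **kept}])

# 2) Trim the mini_math level columns, as done just before `return df`.
df.columns = [c.replace("_level_", "_l") for c in df.columns]
print(list(df.columns))  # ['Model', 'Date', 'mini_math_v2_l1', 'truthfulqa']

# 3) Column widths handed to the Gradio table: fixed widths for Model and Date,
#    header-length-based widths for every benchmark column.
column_widths = [400, 110] + [(150 + len(c)) for c in df.columns[2:]]
print(column_widths)  # [400, 110, 165, 160]

With the _level_ -> _l trimming in place, the 150 + len(c) rule keeps the per-level mini_math headers from forcing overly wide columns while still scaling with longer benchmark names.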