Set merge as default
Browse files
app.py
CHANGED
@@ -11,7 +11,7 @@ Evaluation of H4 and community models across a diverse range of benchmarks from
|
|
11 |
"""
|
12 |
|
13 |
|
14 |
-
def get_leaderboard_df(merge_values: bool = False):
|
15 |
filepaths = list(Path("eval_results").rglob("*.json"))
|
16 |
|
17 |
# Parse filepaths to get unique models
|
@@ -47,6 +47,9 @@ def get_leaderboard_df(merge_values: bool = False):
|
|
47 |
# HellaSwag and ARC reports acc_norm
|
48 |
elif task.lower() in ["hellaswag", "arc"]:
|
49 |
value = data["results"][first_result_key]["acc_norm"]
|
|
|
|
|
|
|
50 |
else:
|
51 |
first_metric_key = next(
|
52 |
iter(data["results"][first_result_key])
|
@@ -76,7 +79,7 @@ def get_leaderboard_df(merge_values: bool = False):
|
|
76 |
return df
|
77 |
|
78 |
|
79 |
-
def refresh(merge_values: bool = False):
|
80 |
return get_leaderboard_df(merge_values)
|
81 |
|
82 |
|
|
|
11 |
"""
|
12 |
|
13 |
|
14 |
+
def get_leaderboard_df(merge_values: bool = True):
|
15 |
filepaths = list(Path("eval_results").rglob("*.json"))
|
16 |
|
17 |
# Parse filepaths to get unique models
|
|
|
47 |
# HellaSwag and ARC reports acc_norm
|
48 |
elif task.lower() in ["hellaswag", "arc"]:
|
49 |
value = data["results"][first_result_key]["acc_norm"]
|
50 |
+
# BBH has several metrics but we report just the average one
|
51 |
+
elif task.lower() == "bbh":
|
52 |
+
value = [v["em"] for k, v in data["results"].items() if "_average" in k.lower()][0]
|
53 |
else:
|
54 |
first_metric_key = next(
|
55 |
iter(data["results"][first_result_key])
|
|
|
79 |
return df
|
80 |
|
81 |
|
82 |
+
def refresh(merge_values: bool = True):
|
83 |
return get_leaderboard_df(merge_values)
|
84 |
|
85 |
|