Hide math and mini_math
app.py
CHANGED
@@ -10,6 +10,8 @@ DESCRIPTION = f"""
 Evaluation of H4 and community models across a diverse range of benchmarks from [LightEval](https://github.com/huggingface/lighteval). All scores are reported as accuracy.
 """
 
+BENCHMARKS_TO_SKIP = ["math", "mini_math"]
+
 
 def get_leaderboard_df(merge_values: bool = True):
     filepaths = list(Path("eval_results").rglob("*.json"))
@@ -35,6 +37,9 @@ def get_leaderboard_df(merge_values: bool = True):
         with open(filepath, "r") as file:
             data = json.load(file)
         first_result_key = next(iter(data["results"]))  # gets the first key in 'results'
+        # Skip benchmarks that we don't want to include in the leaderboard
+        if task.lower() in BENCHMARKS_TO_SKIP:
+            continue
         # TruthfulQA has two metrics, so we need to pick the `mc2` one that's reported on the leaderboard
         if task.lower() == "truthfulqa":
             value = data["results"][first_result_key]["truthfulqa_mc2"]
@@ -66,7 +71,7 @@ def get_leaderboard_df(merge_values: bool = True):
             value = data["results"][first_result_key][first_metric_key]  # gets the value of the first metric
 
         # For mini_math we report 5 metrics, one for each level and store each one as a separate row in the dataframe
-        if task.lower() in ["
+        if task.lower() in ["mini_math_v2"]:
             for k, v in data["results"].items():
                 if k != "all":
                     level = k.split("|")[1].split(":")[-1]
@@ -98,6 +103,9 @@ def get_leaderboard_df(merge_values: bool = True):
     df = df[["Model", "Date"]].merge(merged_df, on="Model", how="left")
     df.drop_duplicates(subset=["Model"], inplace=True)
     df = df.sort_values(by=["Average"], ascending=False).round(2)
+
+    # Trim minimath column names
+    df.columns = [c.replace("_level_", "_l") for c in df.columns]
     return df
 
 
@@ -137,7 +145,7 @@ with demo:
             value=leaderboard_df,
             wrap=True,
             height=1000,
-            column_widths=[400, 110] + [
+            column_widths=[400, 110] + [(150 + len(c)) for c in leaderboard_df.columns[2:]],
         )
     with gr.Row():
         refresh_button = gr.Button("Refresh")
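For reviewers, a minimal, self-contained sketch of the three behaviours this commit combines: dropping the benchmarks listed in BENCHMARKS_TO_SKIP, shortening the per-level mini_math column names, and sizing the leaderboard columns from their header lengths. The scores dict, the model name, and the column labels below are illustrative stand-ins, not values taken from app.py.

import pandas as pd

# Mirrors the constant added at module level in app.py.
BENCHMARKS_TO_SKIP = ["math", "mini_math"]

# Illustrative per-task scores for one model; in app.py these come from eval_results/*.json.
scores = {
    "math": 41.2,                  # hidden by the new skip list
    "mini_math_v2_level_1": 88.0,  # kept: only bare "math"/"mini_math" are skipped
    "truthfulqa": 52.3,
}

# 1) Skip hidden benchmarks, as the new `continue` in the results loop does.
kept = {task: value for task, value in scores.items() if task.lower() not in BENCHMARKS_TO_SKIP}

df = pd.DataFrame([{"Model": "org/model-a", "Date": "2024-05-01", **kept}])

# 2) Trim the mini_math level columns, as done just before `return df`.
df.columns = [c.replace("_level_", "_l") for c in df.columns]
print(list(df.columns))  # ['Model', 'Date', 'mini_math_v2_l1', 'truthfulqa']

# 3) Column widths handed to the Gradio table: fixed widths for Model and Date,
#    header-length-based widths for every benchmark column.
column_widths = [400, 110] + [(150 + len(c)) for c in df.columns[2:]]
print(column_widths)  # [400, 110, 165, 160]

With the _level_ -> _l trimming in place, the 150 + len(c) rule keeps the per-level mini_math headers from forcing overly wide columns while still scaling with longer benchmark names.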