open-r1-eval-leaderboard

Running

App Files Files Community

lewtun HF Staff committed on May 5

Commit

f930f2a

1 Parent(s): 0a8a8b8

Add pass@1

Browse files

Files changed (1) hide show

app.py +27 -6

app.py CHANGED Viewed

@@ -82,12 +82,33 @@ def get_leaderboard_df():
                     df.loc[model_revision, task] = float(value)
                 # AIME24 and 25 report pass@1
                 elif task.lower() in ["aime24", "aime25"]:
-                    value = (
-                        data["results"]["all"]["math_pass@1:32_samples"]
-                        if "math_pass@1:32_samples" in data["results"]["all"]
-                        else -1
-                    )
-                    df.loc[model_revision, task] = float(value)
                 # MATH reports qem
                 elif task.lower() in ["aimo_kaggle", "math_deepseek_cot", "math_deepseek_rl_cot"]:
                     value = data["results"]["all"]["qem"]

                     df.loc[model_revision, task] = float(value)
                 # AIME24 and 25 report pass@1
                 elif task.lower() in ["aime24", "aime25"]:
+                    # Check for 32 samples
+                    if "math_pass@1:32_samples" in data["results"]["all"]:
+                        value = data["results"]["all"]["math_pass@1:32_samples"]
+                        df.loc[model_revision, f"{task} (n=32)"] = float(value)
+                    # Check for 64 samples
+                    if "math_pass@1:64_samples" in data["results"]["all"]:
+                        value = data["results"]["all"]["math_pass@1:64_samples"]
+                        df.loc[model_revision, f"{task} (n=64)"] = float(value)
+                    # For backward compatibility, also store in the original column name if any value exists
+                    if "math_pass@1:32_samples" in data["results"]["all"]:
+                        df.loc[model_revision, task] = float(data["results"]["all"]["math_pass@1:32_samples"])
+                    elif "math_pass@1:64_samples" in data["results"]["all"]:
+                        df.loc[model_revision, task] = float(data["results"]["all"]["math_pass@1:64_samples"])
+                # GPQA now reports pass@1
+                elif task.lower() == "gpqa":
+                    # Check for 8 samples
+                    if "gpqa_pass@1:8_samples" in data["results"]["all"]:
+                        value = data["results"]["all"]["gpqa_pass@1:8_samples"]
+                        df.loc[model_revision, f"{task} (n=8)"] = float(value)
+                    # For backward compatibility, also store in the original column name if any value exists
+                    if "extractive_match" in data["results"]["all"]:
+                        df.loc[model_revision, task] = float(data["results"]["all"]["extractive_match"])
+                    elif "gpqa_pass@1:8_samples" in data["results"]["all"]:
+                        df.loc[model_revision, task] = float(data["results"]["all"]["gpqa_pass@1:8_samples"])
                 # MATH reports qem
                 elif task.lower() in ["aimo_kaggle", "math_deepseek_cot", "math_deepseek_rl_cot"]:
                     value = data["results"]["all"]["qem"]