open-r1-eval-leaderboard

Running

App Files Community

edbeeching HF Staff committed on Jan 23

Commit

2633aab

verified ·

1 Parent(s): c06cbaf

Upload eval_results/Qwen/Qwen2.5-Math-1.5B-Instruct/main/mini_math/results_2025-01-23T09-04-03.274525.json with huggingface_hub

Browse files

Files changed (1) hide show

eval_results/Qwen/Qwen2.5-Math-1.5B-Instruct/main/mini_math/results_2025-01-23T09-04-03.274525.json +340 -0

eval_results/Qwen/Qwen2.5-Math-1.5B-Instruct/main/mini_math/results_2025-01-23T09-04-03.274525.json ADDED Viewed

	@@ -0,0 +1,340 @@

+{
+  "config_general": {
+    "lighteval_sha": "?",
+    "num_fewshot_seeds": 1,
+    "override_batch_size": -1,
+    "max_samples": null,
+    "job_id": 0,
+    "start_time": 466297.931738236,
+    "end_time": 466364.583520329,
+    "total_evaluation_time_secondes": "66.65178209298756",
+    "model_name": "Qwen/Qwen2.5-Math-1.5B-Instruct",
+    "model_sha": "",
+    "model_dtype": null,
+    "model_size": null
+  },
+  "results": {
+    "custom|mini_math:level_1|0": {
+      "qem": 0.8571428571428571,
+      "qem_stderr": 0.060012003601200396
+    },
+    "custom|mini_math:level_2|0": {
+      "qem": 0.8611111111111112,
+      "qem_stderr": 0.04104253469235572
+    },
+    "custom|mini_math:level_3|0": {
+      "qem": 0.8777777777777778,
+      "qem_stderr": 0.03471941311982698
+    },
+    "custom|mini_math:level_4|0": {
+      "qem": 0.711340206185567,
+      "qem_stderr": 0.04624834650754217
+    },
+    "custom|mini_math:level_5|0": {
+      "qem": 0.42452830188679247,
+      "qem_stderr": 0.04823593037243471
+    },
+    "custom|mini_math:_average|0": {
+      "qem": 0.7463800508208212,
+      "qem_stderr": 0.046051645658671994
+    },
+    "all": {
+      "qem": 0.7463800508208212,
+      "qem_stderr": 0.046051645658671994
+    }
+  },
+  "versions": {
+    "custom|mini_math:level_1|0": 0,
+    "custom|mini_math:level_2|0": 0,
+    "custom|mini_math:level_3|0": 0,
+    "custom|mini_math:level_4|0": 0,
+    "custom|mini_math:level_5|0": 0
+  },
+  "config_tasks": {
+    "custom|mini_math:level_1": {
+      "name": "mini_math:level_1",
+      "prompt_function": "math",
+      "hf_repo": "AI-MO/lighteval-mini-math",
+      "hf_subset": "Level 1",
+      "metric": [
+        {
+          "metric_name": "qem",
+          "higher_is_better": true,
+          "category": "3",
+          "use_case": "5",
+          "sample_level_fn": "compute",
+          "corpus_level_fn": "mean"
+        }
+      ],
+      "hf_revision": null,
+      "hf_filter": null,
+      "hf_avail_splits": [
+        "train",
+        "test"
+      ],
+      "trust_dataset": false,
+      "evaluation_splits": [
+        "test"
+      ],
+      "few_shots_split": null,
+      "few_shots_select": null,
+      "generation_size": 2048,
+      "generation_grammar": null,
+      "stop_sequence": [],
+      "num_samples": null,
+      "suite": [
+        "custom",
+        "mini_math"
+      ],
+      "original_num_docs": 35,
+      "effective_num_docs": 35,
+      "must_remove_duplicate_docs": false,
+      "version": 0
+    },
+    "custom|mini_math:level_2": {
+      "name": "mini_math:level_2",
+      "prompt_function": "math",
+      "hf_repo": "AI-MO/lighteval-mini-math",
+      "hf_subset": "Level 2",
+      "metric": [
+        {
+          "metric_name": "qem",
+          "higher_is_better": true,
+          "category": "3",
+          "use_case": "5",
+          "sample_level_fn": "compute",
+          "corpus_level_fn": "mean"
+        }
+      ],
+      "hf_revision": null,
+      "hf_filter": null,
+      "hf_avail_splits": [
+        "train",
+        "test"
+      ],
+      "trust_dataset": false,
+      "evaluation_splits": [
+        "test"
+      ],
+      "few_shots_split": null,
+      "few_shots_select": null,
+      "generation_size": 2048,
+      "generation_grammar": null,
+      "stop_sequence": [],
+      "num_samples": null,
+      "suite": [
+        "custom",
+        "mini_math"
+      ],
+      "original_num_docs": 72,
+      "effective_num_docs": 72,
+      "must_remove_duplicate_docs": false,
+      "version": 0
+    },
+    "custom|mini_math:level_3": {
+      "name": "mini_math:level_3",
+      "prompt_function": "math",
+      "hf_repo": "AI-MO/lighteval-mini-math",
+      "hf_subset": "Level 3",
+      "metric": [
+        {
+          "metric_name": "qem",
+          "higher_is_better": true,
+          "category": "3",
+          "use_case": "5",
+          "sample_level_fn": "compute",
+          "corpus_level_fn": "mean"
+        }
+      ],
+      "hf_revision": null,
+      "hf_filter": null,
+      "hf_avail_splits": [
+        "train",
+        "test"
+      ],
+      "trust_dataset": false,
+      "evaluation_splits": [
+        "test"
+      ],
+      "few_shots_split": null,
+      "few_shots_select": null,
+      "generation_size": 2048,
+      "generation_grammar": null,
+      "stop_sequence": [],
+      "num_samples": null,
+      "suite": [
+        "custom",
+        "mini_math"
+      ],
+      "original_num_docs": 90,
+      "effective_num_docs": 90,
+      "must_remove_duplicate_docs": false,
+      "version": 0
+    },
+    "custom|mini_math:level_4": {
+      "name": "mini_math:level_4",
+      "prompt_function": "math",
+      "hf_repo": "AI-MO/lighteval-mini-math",
+      "hf_subset": "Level 4",
+      "metric": [
+        {
+          "metric_name": "qem",
+          "higher_is_better": true,
+          "category": "3",
+          "use_case": "5",
+          "sample_level_fn": "compute",
+          "corpus_level_fn": "mean"
+        }
+      ],
+      "hf_revision": null,
+      "hf_filter": null,
+      "hf_avail_splits": [
+        "train",
+        "test"
+      ],
+      "trust_dataset": false,
+      "evaluation_splits": [
+        "test"
+      ],
+      "few_shots_split": null,
+      "few_shots_select": null,
+      "generation_size": 2048,
+      "generation_grammar": null,
+      "stop_sequence": [],
+      "num_samples": null,
+      "suite": [
+        "custom",
+        "mini_math"
+      ],
+      "original_num_docs": 97,
+      "effective_num_docs": 97,
+      "must_remove_duplicate_docs": false,
+      "version": 0
+    },
+    "custom|mini_math:level_5": {
+      "name": "mini_math:level_5",
+      "prompt_function": "math",
+      "hf_repo": "AI-MO/lighteval-mini-math",
+      "hf_subset": "Level 5",
+      "metric": [
+        {
+          "metric_name": "qem",
+          "higher_is_better": true,
+          "category": "3",
+          "use_case": "5",
+          "sample_level_fn": "compute",
+          "corpus_level_fn": "mean"
+        }
+      ],
+      "hf_revision": null,
+      "hf_filter": null,
+      "hf_avail_splits": [
+        "train",
+        "test"
+      ],
+      "trust_dataset": false,
+      "evaluation_splits": [
+        "test"
+      ],
+      "few_shots_split": null,
+      "few_shots_select": null,
+      "generation_size": 2048,
+      "generation_grammar": null,
+      "stop_sequence": [],
+      "num_samples": null,
+      "suite": [
+        "custom",
+        "mini_math"
+      ],
+      "original_num_docs": 106,
+      "effective_num_docs": 106,
+      "must_remove_duplicate_docs": false,
+      "version": 0
+    }
+  },
+  "summary_tasks": {
+    "custom|mini_math:level_1|0": {
+      "hashes": {
+        "hash_examples": "f57f08853806c023",
+        "hash_full_prompts": "9a3a20fd93db6f19",
+        "hash_input_tokens": "b870f759aaf9fa2e",
+        "hash_cont_tokens": "13340a9b824c56a8"
+      },
+      "truncated": 0,
+      "non_truncated": 35,
+      "padded": 0,
+      "non_padded": 35,
+      "effective_few_shots": 0.0,
+      "num_truncated_few_shots": 0
+    },
+    "custom|mini_math:level_2|0": {
+      "hashes": {
+        "hash_examples": "b4fc7cab14532241",
+        "hash_full_prompts": "e9cf9a7613462711",
+        "hash_input_tokens": "97a7ed80eadc63e3",
+        "hash_cont_tokens": "cdcb1b694822ca5d"
+      },
+      "truncated": 0,
+      "non_truncated": 72,
+      "padded": 0,
+      "non_padded": 72,
+      "effective_few_shots": 0.0,
+      "num_truncated_few_shots": 0
+    },
+    "custom|mini_math:level_3|0": {
+      "hashes": {
+        "hash_examples": "07877d35e91b6109",
+        "hash_full_prompts": "c1f9f7bceef1a641",
+        "hash_input_tokens": "818a30a7ae391ca5",
+        "hash_cont_tokens": "aad18f03e74d477c"
+      },
+      "truncated": 0,
+      "non_truncated": 90,
+      "padded": 0,
+      "non_padded": 90,
+      "effective_few_shots": 0.0,
+      "num_truncated_few_shots": 0
+    },
+    "custom|mini_math:level_4|0": {
+      "hashes": {
+        "hash_examples": "016981666505c34d",
+        "hash_full_prompts": "e583c079179c8459",
+        "hash_input_tokens": "4828de070bbe53be",
+        "hash_cont_tokens": "29e2954e09d16e1d"
+      },
+      "truncated": 0,
+      "non_truncated": 97,
+      "padded": 0,
+      "non_padded": 97,
+      "effective_few_shots": 0.0,
+      "num_truncated_few_shots": 0
+    },
+    "custom|mini_math:level_5|0": {
+      "hashes": {
+        "hash_examples": "80868d5b1c260707",
+        "hash_full_prompts": "d45915cec626c593",
+        "hash_input_tokens": "5fd7765f4e8a2a9c",
+        "hash_cont_tokens": "7197eeac005410bb"
+      },
+      "truncated": 0,
+      "non_truncated": 106,
+      "padded": 0,
+      "non_padded": 106,
+      "effective_few_shots": 0.0,
+      "num_truncated_few_shots": 0
+    }
+  },
+  "summary_general": {
+    "hashes": {
+      "hash_examples": "e7ee3a47ca2b0866",
+      "hash_full_prompts": "1f7e06ec8d733918",
+      "hash_input_tokens": "4bcc69b39c8416ec",
+      "hash_cont_tokens": "dd8153c97fb558fc"
+    },
+    "truncated": 0,
+    "non_truncated": 400,
+    "padded": 0,
+    "non_padded": 400,
+    "num_truncated_few_shots": 0
+  }
+}