lewtun HF Staff commited on
Commit
7ca9efe
·
verified ·
1 Parent(s): f930f2a

Upload eval_results/open-r1/R1-Distill-Qwen-7B/v01.01-step-000001300/gpqa/results_2025-05-05T09-19-57.847824.json with huggingface_hub

Browse files
eval_results/open-r1/R1-Distill-Qwen-7B/v01.01-step-000001300/gpqa/results_2025-05-05T09-19-57.847824.json ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_general": {
3
+ "lighteval_sha": "?",
4
+ "num_fewshot_seeds": 1,
5
+ "max_samples": null,
6
+ "job_id": 0,
7
+ "start_time": 1004550.475187005,
8
+ "end_time": 1006197.299452398,
9
+ "total_evaluation_time_secondes": "1646.8242653929628",
10
+ "model_name": "open-r1/R1-Distill-Qwen-7B",
11
+ "model_sha": "",
12
+ "model_dtype": null,
13
+ "model_size": null,
14
+ "generation_parameters": {
15
+ "early_stopping": null,
16
+ "repetition_penalty": null,
17
+ "frequency_penalty": null,
18
+ "length_penalty": null,
19
+ "presence_penalty": null,
20
+ "max_new_tokens": 32768,
21
+ "min_new_tokens": null,
22
+ "seed": null,
23
+ "stop_tokens": null,
24
+ "temperature": 0.6,
25
+ "top_k": null,
26
+ "min_p": null,
27
+ "top_p": 0.95,
28
+ "truncate_prompt": null,
29
+ "response_format": null
30
+ }
31
+ },
32
+ "results": {
33
+ "lighteval|gpqa:diamond|0": {
34
+ "gpqa_pass@1:1_samples": 0.3282828282828283,
35
+ "gpqa_pass@1:1_samples_stderr": 0.03345678422756776,
36
+ "gpqa_pass@1:4_samples": 0.3712121212121212,
37
+ "gpqa_pass@1:4_samples_stderr": 0.024980589855537083,
38
+ "gpqa_pass@1:8_samples": 0.3806818181818182,
39
+ "gpqa_pass@1:8_samples_stderr": 0.02203948584776927
40
+ },
41
+ "all": {
42
+ "gpqa_pass@1:1_samples": 0.3282828282828283,
43
+ "gpqa_pass@1:1_samples_stderr": 0.03345678422756776,
44
+ "gpqa_pass@1:4_samples": 0.3712121212121212,
45
+ "gpqa_pass@1:4_samples_stderr": 0.024980589855537083,
46
+ "gpqa_pass@1:8_samples": 0.3806818181818182,
47
+ "gpqa_pass@1:8_samples_stderr": 0.02203948584776927
48
+ }
49
+ },
50
+ "versions": {
51
+ "lighteval|gpqa:diamond|0": 1
52
+ },
53
+ "config_tasks": {
54
+ "lighteval|gpqa:diamond": {
55
+ "name": "gpqa:diamond",
56
+ "prompt_function": "gpqa_instruct",
57
+ "hf_repo": "Idavidrein/gpqa",
58
+ "hf_subset": "gpqa_diamond",
59
+ "metric": [
60
+ {
61
+ "metric_name": "gpqa_pass@1:1_samples",
62
+ "higher_is_better": true,
63
+ "category": "5",
64
+ "use_case": "6",
65
+ "sample_level_fn": "compute",
66
+ "corpus_level_fn": "mean"
67
+ },
68
+ {
69
+ "metric_name": "gpqa_pass@1:4_samples",
70
+ "higher_is_better": true,
71
+ "category": "5",
72
+ "use_case": "6",
73
+ "sample_level_fn": "compute",
74
+ "corpus_level_fn": "mean"
75
+ },
76
+ {
77
+ "metric_name": "gpqa_pass@1:8_samples",
78
+ "higher_is_better": true,
79
+ "category": "5",
80
+ "use_case": "6",
81
+ "sample_level_fn": "compute",
82
+ "corpus_level_fn": "mean"
83
+ }
84
+ ],
85
+ "hf_revision": null,
86
+ "hf_filter": null,
87
+ "hf_avail_splits": [
88
+ "train"
89
+ ],
90
+ "trust_dataset": true,
91
+ "evaluation_splits": [
92
+ "train"
93
+ ],
94
+ "few_shots_split": null,
95
+ "few_shots_select": null,
96
+ "generation_size": 32768,
97
+ "generation_grammar": null,
98
+ "stop_sequence": [],
99
+ "num_samples": null,
100
+ "suite": [
101
+ "lighteval"
102
+ ],
103
+ "original_num_docs": 198,
104
+ "effective_num_docs": 198,
105
+ "must_remove_duplicate_docs": false,
106
+ "version": 1
107
+ }
108
+ },
109
+ "summary_tasks": {
110
+ "lighteval|gpqa:diamond|0": {
111
+ "hashes": {
112
+ "hash_examples": "50ecb6f5d091bd95",
113
+ "hash_full_prompts": "50819126b814bd20",
114
+ "hash_input_tokens": "b3af494cd7017f80",
115
+ "hash_cont_tokens": "ece4cd6d91c030fd"
116
+ },
117
+ "truncated": 0,
118
+ "non_truncated": 198,
119
+ "padded": 0,
120
+ "non_padded": 198,
121
+ "effective_few_shots": 0.0,
122
+ "num_truncated_few_shots": 0
123
+ }
124
+ },
125
+ "summary_general": {
126
+ "hashes": {
127
+ "hash_examples": "a9318dbdd867770b",
128
+ "hash_full_prompts": "9c2b2dec42145c31",
129
+ "hash_input_tokens": "5f9c09adb003c130",
130
+ "hash_cont_tokens": "1456a607c54fab5d"
131
+ },
132
+ "truncated": 0,
133
+ "non_truncated": 198,
134
+ "padded": 0,
135
+ "non_padded": 198,
136
+ "num_truncated_few_shots": 0
137
+ }
138
+ }