lewtun HF Staff commited on
Commit
069c829
·
verified ·
1 Parent(s): 5e41202

Upload eval_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B/main/gpqa/results_2025-05-05T15-29-20.168420.json with huggingface_hub

Browse files
eval_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B/main/gpqa/results_2025-05-05T15-29-20.168420.json ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_general": {
3
+ "lighteval_sha": "?",
4
+ "num_fewshot_seeds": 1,
5
+ "max_samples": null,
6
+ "job_id": 0,
7
+ "start_time": 3620506.368588403,
8
+ "end_time": 3621489.119445614,
9
+ "total_evaluation_time_secondes": "982.7508572111838",
10
+ "model_name": "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
11
+ "model_sha": "",
12
+ "model_dtype": null,
13
+ "model_size": null,
14
+ "generation_parameters": {
15
+ "early_stopping": null,
16
+ "repetition_penalty": null,
17
+ "frequency_penalty": null,
18
+ "length_penalty": null,
19
+ "presence_penalty": null,
20
+ "max_new_tokens": 32768,
21
+ "min_new_tokens": null,
22
+ "seed": null,
23
+ "stop_tokens": null,
24
+ "temperature": 0.6,
25
+ "top_k": null,
26
+ "min_p": null,
27
+ "top_p": 0.95,
28
+ "truncate_prompt": null,
29
+ "response_format": null
30
+ }
31
+ },
32
+ "results": {
33
+ "lighteval|gpqa:diamond|0": {
34
+ "gpqa_pass@1:1_samples": 0.494949494949495,
35
+ "gpqa_pass@1:1_samples_stderr": 0.035621707606254015,
36
+ "gpqa_pass@1:4_samples": 0.494949494949495,
37
+ "gpqa_pass@1:4_samples_stderr": 0.028918511422630695,
38
+ "gpqa_pass@1:8_samples": 0.5050505050505051,
39
+ "gpqa_pass@1:8_samples_stderr": 0.027279058659552692
40
+ },
41
+ "all": {
42
+ "gpqa_pass@1:1_samples": 0.494949494949495,
43
+ "gpqa_pass@1:1_samples_stderr": 0.035621707606254015,
44
+ "gpqa_pass@1:4_samples": 0.494949494949495,
45
+ "gpqa_pass@1:4_samples_stderr": 0.028918511422630695,
46
+ "gpqa_pass@1:8_samples": 0.5050505050505051,
47
+ "gpqa_pass@1:8_samples_stderr": 0.027279058659552692
48
+ }
49
+ },
50
+ "versions": {
51
+ "lighteval|gpqa:diamond|0": 1
52
+ },
53
+ "config_tasks": {
54
+ "lighteval|gpqa:diamond": {
55
+ "name": "gpqa:diamond",
56
+ "prompt_function": "gpqa_instruct",
57
+ "hf_repo": "Idavidrein/gpqa",
58
+ "hf_subset": "gpqa_diamond",
59
+ "metric": [
60
+ {
61
+ "metric_name": "gpqa_pass@1:1_samples",
62
+ "higher_is_better": true,
63
+ "category": "5",
64
+ "use_case": "6",
65
+ "sample_level_fn": "compute",
66
+ "corpus_level_fn": "mean"
67
+ },
68
+ {
69
+ "metric_name": "gpqa_pass@1:4_samples",
70
+ "higher_is_better": true,
71
+ "category": "5",
72
+ "use_case": "6",
73
+ "sample_level_fn": "compute",
74
+ "corpus_level_fn": "mean"
75
+ },
76
+ {
77
+ "metric_name": "gpqa_pass@1:8_samples",
78
+ "higher_is_better": true,
79
+ "category": "5",
80
+ "use_case": "6",
81
+ "sample_level_fn": "compute",
82
+ "corpus_level_fn": "mean"
83
+ }
84
+ ],
85
+ "hf_revision": null,
86
+ "hf_filter": null,
87
+ "hf_avail_splits": [
88
+ "train"
89
+ ],
90
+ "trust_dataset": true,
91
+ "evaluation_splits": [
92
+ "train"
93
+ ],
94
+ "few_shots_split": null,
95
+ "few_shots_select": null,
96
+ "generation_size": 32768,
97
+ "generation_grammar": null,
98
+ "stop_sequence": [],
99
+ "num_samples": null,
100
+ "suite": [
101
+ "lighteval"
102
+ ],
103
+ "original_num_docs": 198,
104
+ "effective_num_docs": 198,
105
+ "must_remove_duplicate_docs": false,
106
+ "version": 1
107
+ }
108
+ },
109
+ "summary_tasks": {
110
+ "lighteval|gpqa:diamond|0": {
111
+ "hashes": {
112
+ "hash_examples": "50ecb6f5d091bd95",
113
+ "hash_full_prompts": "f27d3d556ab1c377",
114
+ "hash_input_tokens": "2feb29bf3801b58a",
115
+ "hash_cont_tokens": "e41f97da9d2685b2"
116
+ },
117
+ "truncated": 0,
118
+ "non_truncated": 198,
119
+ "padded": 0,
120
+ "non_padded": 198,
121
+ "effective_few_shots": 0.0,
122
+ "num_truncated_few_shots": 0
123
+ }
124
+ },
125
+ "summary_general": {
126
+ "hashes": {
127
+ "hash_examples": "a9318dbdd867770b",
128
+ "hash_full_prompts": "122c405ea50f3de4",
129
+ "hash_input_tokens": "a263b8e88b1dea74",
130
+ "hash_cont_tokens": "bb93b1c97d7cd5b0"
131
+ },
132
+ "truncated": 0,
133
+ "non_truncated": 198,
134
+ "padded": 0,
135
+ "non_padded": 198,
136
+ "num_truncated_few_shots": 0
137
+ }
138
+ }