edbeeching HF staff committed on
Commit
c3c0e98
·
verified ·
1 Parent(s): e48c0c4

Upload eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v01.16/aimo_kaggle_medium_pot/results_2024-06-06T03-56-49.956243.json with huggingface_hub

Browse files
eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v01.16/aimo_kaggle_medium_pot/results_2024-06-06T03-56-49.956243.json ADDED
@@ -0,0 +1,193 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_general": {
3
+ "lighteval_sha": "?",
4
+ "num_fewshot_seeds": 1,
5
+ "override_batch_size": 4,
6
+ "max_samples": null,
7
+ "job_id": "",
8
+ "start_time": 735639.865928377,
9
+ "end_time": 736127.784568667,
10
+ "total_evaluation_time_secondes": "487.9186402899213",
11
+ "model_name": "AI-MO/CodeLlama-13b-Python-hf-sft",
12
+ "model_sha": "6e62641c9122d764f373b27e9e21e6ca411e8dd2",
13
+ "model_dtype": "torch.bfloat16",
14
+ "model_size": "24.56 GB",
15
+ "config": null
16
+ },
17
+ "results": {
18
+ "custom|aimo_kaggle_medium_pot:v0|0": {
19
+ "qem": 0.075,
20
+ "qem_stderr": 0.04217636961434869
21
+ },
22
+ "custom|aimo_kaggle_medium_pot:v1|0": {
23
+ "qem": 0.075,
24
+ "qem_stderr": 0.04217636961434867
25
+ },
26
+ "custom|aimo_kaggle_medium_pot:v2|0": {
27
+ "qem": 0.125,
28
+ "qem_stderr": 0.05295740910852021
29
+ },
30
+ "custom|aimo_kaggle_medium_pot:_average|0": {
31
+ "qem": 0.09166666666666667,
32
+ "qem_stderr": 0.04577004944573918
33
+ },
34
+ "all": {
35
+ "qem": 0.09166666666666667,
36
+ "qem_stderr": 0.04577004944573918
37
+ }
38
+ },
39
+ "versions": {
40
+ "custom|aimo_kaggle_medium_pot:v0|0": 0,
41
+ "custom|aimo_kaggle_medium_pot:v1|0": 0,
42
+ "custom|aimo_kaggle_medium_pot:v2|0": 0
43
+ },
44
+ "config_tasks": {
45
+ "custom|aimo_kaggle_medium_pot:v0": {
46
+ "name": "aimo_kaggle_medium_pot:v0",
47
+ "prompt_function": "kaggle_medium_pot_prompt_fn_v0",
48
+ "hf_repo": "AI-MO/kaggle-validation-set-medium",
49
+ "hf_subset": "v0",
50
+ "metric": [
51
+ "quasi_exact_match_code_and_math"
52
+ ],
53
+ "hf_avail_splits": [
54
+ "train"
55
+ ],
56
+ "evaluation_splits": [
57
+ "train"
58
+ ],
59
+ "few_shots_split": null,
60
+ "few_shots_select": null,
61
+ "generation_size": 2048,
62
+ "stop_sequence": null,
63
+ "output_regex": null,
64
+ "num_samples": null,
65
+ "frozen": false,
66
+ "suite": [
67
+ "custom"
68
+ ],
69
+ "original_num_docs": 40,
70
+ "effective_num_docs": 40,
71
+ "trust_dataset": null,
72
+ "must_remove_duplicate_docs": null,
73
+ "version": 0
74
+ },
75
+ "custom|aimo_kaggle_medium_pot:v1": {
76
+ "name": "aimo_kaggle_medium_pot:v1",
77
+ "prompt_function": "kaggle_medium_pot_prompt_fn_v1",
78
+ "hf_repo": "AI-MO/kaggle-validation-set-medium",
79
+ "hf_subset": "v0",
80
+ "metric": [
81
+ "quasi_exact_match_code_and_math"
82
+ ],
83
+ "hf_avail_splits": [
84
+ "train"
85
+ ],
86
+ "evaluation_splits": [
87
+ "train"
88
+ ],
89
+ "few_shots_split": null,
90
+ "few_shots_select": null,
91
+ "generation_size": 2048,
92
+ "stop_sequence": null,
93
+ "output_regex": null,
94
+ "num_samples": null,
95
+ "frozen": false,
96
+ "suite": [
97
+ "custom"
98
+ ],
99
+ "original_num_docs": 40,
100
+ "effective_num_docs": 40,
101
+ "trust_dataset": null,
102
+ "must_remove_duplicate_docs": null,
103
+ "version": 0
104
+ },
105
+ "custom|aimo_kaggle_medium_pot:v2": {
106
+ "name": "aimo_kaggle_medium_pot:v2",
107
+ "prompt_function": "kaggle_medium_pot_prompt_fn_v2",
108
+ "hf_repo": "AI-MO/kaggle-validation-set-medium",
109
+ "hf_subset": "v0",
110
+ "metric": [
111
+ "quasi_exact_match_code_and_math"
112
+ ],
113
+ "hf_avail_splits": [
114
+ "train"
115
+ ],
116
+ "evaluation_splits": [
117
+ "train"
118
+ ],
119
+ "few_shots_split": null,
120
+ "few_shots_select": null,
121
+ "generation_size": 2048,
122
+ "stop_sequence": null,
123
+ "output_regex": null,
124
+ "num_samples": null,
125
+ "frozen": false,
126
+ "suite": [
127
+ "custom"
128
+ ],
129
+ "original_num_docs": 40,
130
+ "effective_num_docs": 40,
131
+ "trust_dataset": null,
132
+ "must_remove_duplicate_docs": null,
133
+ "version": 0
134
+ }
135
+ },
136
+ "summary_tasks": {
137
+ "custom|aimo_kaggle_medium_pot:v0|0": {
138
+ "hashes": {
139
+ "hash_examples": "2799c24461029dc3",
140
+ "hash_full_prompts": "2af864fbfc2e0a79",
141
+ "hash_input_tokens": "e868d382e92028d6",
142
+ "hash_cont_tokens": "9459d1cf1e1e019c"
143
+ },
144
+ "truncated": 40,
145
+ "non_truncated": 0,
146
+ "padded": 29,
147
+ "non_padded": 11,
148
+ "effective_few_shots": 0.0,
149
+ "num_truncated_few_shots": 0
150
+ },
151
+ "custom|aimo_kaggle_medium_pot:v1|0": {
152
+ "hashes": {
153
+ "hash_examples": "806b2e2056b41f84",
154
+ "hash_full_prompts": "8123a0d96a6ceb9d",
155
+ "hash_input_tokens": "44f80b07cbbc1202",
156
+ "hash_cont_tokens": "e206d4ab5a1cd6c9"
157
+ },
158
+ "truncated": 40,
159
+ "non_truncated": 0,
160
+ "padded": 28,
161
+ "non_padded": 12,
162
+ "effective_few_shots": 0.0,
163
+ "num_truncated_few_shots": 0
164
+ },
165
+ "custom|aimo_kaggle_medium_pot:v2|0": {
166
+ "hashes": {
167
+ "hash_examples": "d8534375acc5d427",
168
+ "hash_full_prompts": "71ba7c8172fec45a",
169
+ "hash_input_tokens": "9fe3382a1eef295c",
170
+ "hash_cont_tokens": "1c6e52e061669f20"
171
+ },
172
+ "truncated": 40,
173
+ "non_truncated": 0,
174
+ "padded": 28,
175
+ "non_padded": 12,
176
+ "effective_few_shots": 0.0,
177
+ "num_truncated_few_shots": 0
178
+ }
179
+ },
180
+ "summary_general": {
181
+ "hashes": {
182
+ "hash_examples": "623505a45a4910c2",
183
+ "hash_full_prompts": "0ee7c8ef786b9aa3",
184
+ "hash_input_tokens": "89b21512945d231a",
185
+ "hash_cont_tokens": "c3443bc5dbcaf2a3"
186
+ },
187
+ "truncated": 120,
188
+ "non_truncated": 0,
189
+ "padded": 85,
190
+ "non_padded": 35,
191
+ "num_truncated_few_shots": 0
192
+ }
193
+ }