kashif HF Staff commited on
Commit
cac94ef
·
verified ·
1 Parent(s): f055f39

Upload eval_results/kashif/ppo_aimo_vllm_python_eval_warmup_1e-6_promising/main/aimo_kaggle_medium_pot/results_2024-05-30T07-42-24.609542.json with huggingface_hub

Browse files
eval_results/kashif/ppo_aimo_vllm_python_eval_warmup_1e-6_promising/main/aimo_kaggle_medium_pot/results_2024-05-30T07-42-24.609542.json ADDED
@@ -0,0 +1,187 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_general": {
3
+ "lighteval_sha": "?",
4
+ "num_fewshot_seeds": 1,
5
+ "override_batch_size": 4,
6
+ "max_samples": null,
7
+ "job_id": "",
8
+ "start_time": 2311503.672642814,
9
+ "end_time": 2312121.703769456,
10
+ "total_evaluation_time_secondes": "618.031126642134",
11
+ "model_name": "kashif/ppo_aimo_vllm_python_eval_warmup_1e-6_promising",
12
+ "model_sha": "49f644b68c3b2d5708fdab238775ee13a8f93f19",
13
+ "model_dtype": "torch.bfloat16",
14
+ "model_size": "12.93 GB",
15
+ "config": null
16
+ },
17
+ "results": {
18
+ "custom|aimo_kaggle_medium_pot:v0|0": {
19
+ "qem": 0.025,
20
+ "qem_stderr": 0.024999999999999998
21
+ },
22
+ "custom|aimo_kaggle_medium_pot:v1|0": {
23
+ "qem": 0.0,
24
+ "qem_stderr": 0.0
25
+ },
26
+ "custom|aimo_kaggle_medium_pot:v2|0": {
27
+ "qem": 0.1,
28
+ "qem_stderr": 0.04803844614152613
29
+ },
30
+ "custom|aimo_kaggle_medium_pot:_average|0": {
31
+ "qem": 0.041666666666666664,
32
+ "qem_stderr": 0.024346148713842043
33
+ },
34
+ "all": {
35
+ "qem": 0.041666666666666664,
36
+ "qem_stderr": 0.024346148713842043
37
+ }
38
+ },
39
+ "versions": {
40
+ "custom|aimo_kaggle_medium_pot:v0|0": 0,
41
+ "custom|aimo_kaggle_medium_pot:v1|0": 0,
42
+ "custom|aimo_kaggle_medium_pot:v2|0": 0
43
+ },
44
+ "config_tasks": {
45
+ "custom|aimo_kaggle_medium_pot:v0": {
46
+ "name": "aimo_kaggle_medium_pot:v0",
47
+ "prompt_function": "kaggle_medium_pot_prompt_fn_v0",
48
+ "hf_repo": "AI-MO/kaggle-validation-set-medium",
49
+ "hf_subset": "v0",
50
+ "metric": [
51
+ "quasi_exact_match_code_and_math"
52
+ ],
53
+ "hf_avail_splits": [
54
+ "train"
55
+ ],
56
+ "evaluation_splits": [
57
+ "train"
58
+ ],
59
+ "few_shots_split": null,
60
+ "few_shots_select": null,
61
+ "generation_size": 2048,
62
+ "stop_sequence": null,
63
+ "output_regex": null,
64
+ "frozen": false,
65
+ "suite": [
66
+ "custom"
67
+ ],
68
+ "original_num_docs": 40,
69
+ "effective_num_docs": 40,
70
+ "trust_dataset": null,
71
+ "must_remove_duplicate_docs": null
72
+ },
73
+ "custom|aimo_kaggle_medium_pot:v1": {
74
+ "name": "aimo_kaggle_medium_pot:v1",
75
+ "prompt_function": "kaggle_medium_pot_prompt_fn_v1",
76
+ "hf_repo": "AI-MO/kaggle-validation-set-medium",
77
+ "hf_subset": "v0",
78
+ "metric": [
79
+ "quasi_exact_match_code_and_math"
80
+ ],
81
+ "hf_avail_splits": [
82
+ "train"
83
+ ],
84
+ "evaluation_splits": [
85
+ "train"
86
+ ],
87
+ "few_shots_split": null,
88
+ "few_shots_select": null,
89
+ "generation_size": 2048,
90
+ "stop_sequence": null,
91
+ "output_regex": null,
92
+ "frozen": false,
93
+ "suite": [
94
+ "custom"
95
+ ],
96
+ "original_num_docs": 40,
97
+ "effective_num_docs": 40,
98
+ "trust_dataset": null,
99
+ "must_remove_duplicate_docs": null
100
+ },
101
+ "custom|aimo_kaggle_medium_pot:v2": {
102
+ "name": "aimo_kaggle_medium_pot:v2",
103
+ "prompt_function": "kaggle_medium_pot_prompt_fn_v2",
104
+ "hf_repo": "AI-MO/kaggle-validation-set-medium",
105
+ "hf_subset": "v0",
106
+ "metric": [
107
+ "quasi_exact_match_code_and_math"
108
+ ],
109
+ "hf_avail_splits": [
110
+ "train"
111
+ ],
112
+ "evaluation_splits": [
113
+ "train"
114
+ ],
115
+ "few_shots_split": null,
116
+ "few_shots_select": null,
117
+ "generation_size": 2048,
118
+ "stop_sequence": null,
119
+ "output_regex": null,
120
+ "frozen": false,
121
+ "suite": [
122
+ "custom"
123
+ ],
124
+ "original_num_docs": 40,
125
+ "effective_num_docs": 40,
126
+ "trust_dataset": null,
127
+ "must_remove_duplicate_docs": null
128
+ }
129
+ },
130
+ "summary_tasks": {
131
+ "custom|aimo_kaggle_medium_pot:v0|0": {
132
+ "hashes": {
133
+ "hash_examples": "2799c24461029dc3",
134
+ "hash_full_prompts": "de6572b914d5bba6",
135
+ "hash_input_tokens": "5af82ee284ccce29",
136
+ "hash_cont_tokens": "4503ac7fda3a6ba3"
137
+ },
138
+ "truncated": 40,
139
+ "non_truncated": 0,
140
+ "padded": 29,
141
+ "non_padded": 11,
142
+ "effective_few_shots": 0.0,
143
+ "num_truncated_few_shots": 0
144
+ },
145
+ "custom|aimo_kaggle_medium_pot:v1|0": {
146
+ "hashes": {
147
+ "hash_examples": "806b2e2056b41f84",
148
+ "hash_full_prompts": "427d5de6e4d90df2",
149
+ "hash_input_tokens": "be3dcf9f8a2350d0",
150
+ "hash_cont_tokens": "b6a85af8afdce578"
151
+ },
152
+ "truncated": 40,
153
+ "non_truncated": 0,
154
+ "padded": 26,
155
+ "non_padded": 14,
156
+ "effective_few_shots": 0.0,
157
+ "num_truncated_few_shots": 0
158
+ },
159
+ "custom|aimo_kaggle_medium_pot:v2|0": {
160
+ "hashes": {
161
+ "hash_examples": "d8534375acc5d427",
162
+ "hash_full_prompts": "6bb90c129d6d6123",
163
+ "hash_input_tokens": "22eeaedbeb3f56f0",
164
+ "hash_cont_tokens": "eec12e4536533876"
165
+ },
166
+ "truncated": 40,
167
+ "non_truncated": 0,
168
+ "padded": 31,
169
+ "non_padded": 9,
170
+ "effective_few_shots": 0.0,
171
+ "num_truncated_few_shots": 0
172
+ }
173
+ },
174
+ "summary_general": {
175
+ "hashes": {
176
+ "hash_examples": "623505a45a4910c2",
177
+ "hash_full_prompts": "3aca0fad61a42921",
178
+ "hash_input_tokens": "c7d2c8bb76f8ecb1",
179
+ "hash_cont_tokens": "3c78f4620fce6fa1"
180
+ },
181
+ "truncated": 120,
182
+ "non_truncated": 0,
183
+ "padded": 86,
184
+ "non_padded": 34,
185
+ "num_truncated_few_shots": 0
186
+ }
187
+ }