Delete eval_results
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.00/aimo_kaggle/results_2024-06-01T04-33-51.272361.json +0 -91
- eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.00/aimo_kaggle_hard/results_2024-06-01T04-41-15.206832.json +0 -91
- eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.00/aimo_kaggle_medium/results_2024-06-01T04-37-13.624182.json +0 -91
- eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.10/aimo_kaggle/results_2024-06-02T19-57-15.384420.json +0 -91
- eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.10/aimo_kaggle_hard/results_2024-06-02T20-00-55.391068.json +0 -91
- eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.10/aimo_kaggle_medium/results_2024-06-02T19-58-49.662787.json +0 -91
- eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.11/aimo_kaggle/results_2024-06-01T04-50-49.856519.json +0 -91
- eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.11/aimo_kaggle_hard/results_2024-06-01T04-52-36.232279.json +0 -91
- eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.11/aimo_kaggle_medium/results_2024-06-01T04-50-32.122163.json +0 -91
- eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.12/aimo_kaggle/results_2024-06-02T11-36-26.669791.json +0 -91
- eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.12/aimo_kaggle_hard/results_2024-06-02T11-51-16.456597.json +0 -91
- eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.12/aimo_kaggle_medium/results_2024-06-02T11-43-17.517072.json +0 -91
- eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.13/aimo_kaggle/results_2024-06-02T19-00-04.484770.json +0 -91
- eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.13/aimo_kaggle_hard/results_2024-06-02T19-02-41.193644.json +0 -91
- eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.13/aimo_kaggle_medium/results_2024-06-02T18-59-17.155994.json +0 -91
- eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.14/aimo_kaggle/results_2024-06-02T19-46-44.922456.json +0 -91
- eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.14/aimo_kaggle_hard/results_2024-06-02T19-53-35.519132.json +0 -91
- eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.14/aimo_kaggle_medium/results_2024-06-02T19-47-54.814443.json +0 -91
- eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.16/aimo_kaggle/results_2024-06-01T04-42-22.110150.json +0 -91
- eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.16/aimo_kaggle_hard/results_2024-06-01T04-50-33.000483.json +0 -91
- eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.16/aimo_kaggle_medium/results_2024-06-01T04-44-46.129536.json +0 -91
- eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.17/aimo_kaggle/results_2024-06-01T05-55-00.892964.json +0 -91
- eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.17/aimo_kaggle_hard/results_2024-06-01T05-54-49.725003.json +0 -91
- eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.17/aimo_kaggle_medium/results_2024-06-01T05-53-12.777129.json +0 -91
- eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.18/aimo_kaggle/results_2024-06-02T20-03-07.489443.json +0 -91
- eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.18/aimo_kaggle_hard/results_2024-06-02T20-04-32.429315.json +0 -91
- eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.18/aimo_kaggle_medium/results_2024-06-02T20-03-59.774755.json +0 -91
- eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.19/aimo_kaggle/results_2024-06-01T21-23-46.264185.json +0 -91
- eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.19/aimo_kaggle_hard/results_2024-06-01T21-26-21.659630.json +0 -91
- eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.19/aimo_kaggle_medium/results_2024-06-01T21-24-03.954253.json +0 -91
- eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.20/aimo_kaggle/results_2024-06-01T05-35-51.309986.json +0 -91
- eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.20/aimo_kaggle_hard/results_2024-06-01T05-37-20.266435.json +0 -91
- eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.20/aimo_kaggle_medium/results_2024-06-01T05-36-05.734157.json +0 -91
- eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.21/aimo_kaggle/results_2024-06-01T05-45-43.684415.json +0 -91
- eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.21/aimo_kaggle_hard/results_2024-06-01T05-46-32.901937.json +0 -91
- eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.21/aimo_kaggle_medium/results_2024-06-01T05-46-09.737288.json +0 -91
- eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.22/aimo_kaggle/results_2024-06-02T18-58-15.446870.json +0 -91
- eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.22/aimo_kaggle_hard/results_2024-06-02T18-58-01.233506.json +0 -91
- eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.22/aimo_kaggle_medium/results_2024-06-02T18-56-51.533786.json +0 -91
- eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.23/aimo_kaggle/results_2024-06-01T21-45-15.801293.json +0 -91
- eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.23/aimo_kaggle_hard/results_2024-06-01T21-45-20.802107.json +0 -91
- eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.23/aimo_kaggle_medium/results_2024-06-01T21-44-37.236377.json +0 -91
- eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.24/aimo_kaggle/results_2024-06-01T21-19-06.909857.json +0 -91
- eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.24/aimo_kaggle_hard/results_2024-06-01T21-23-15.431062.json +0 -91
- eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.24/aimo_kaggle_medium/results_2024-06-01T21-21-15.404900.json +0 -91
- eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.25/aimo_kaggle/results_2024-06-02T18-50-16.244304.json +0 -91
- eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.25/aimo_kaggle_hard/results_2024-06-02T18-55-06.099176.json +0 -91
- eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.25/aimo_kaggle_medium/results_2024-06-02T18-53-27.832845.json +0 -91
- eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.26/aimo_kaggle/results_2024-06-01T21-39-31.404005.json +0 -91
- eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.26/aimo_kaggle_hard/results_2024-06-01T21-43-22.900752.json +0 -91
eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.00/aimo_kaggle/results_2024-06-01T04-33-51.272361.json
DELETED
@@ -1,91 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"override_batch_size": 4,
|
6 |
-
"max_samples": null,
|
7 |
-
"job_id": "",
|
8 |
-
"start_time": 7330971.82432595,
|
9 |
-
"end_time": 7331385.7388414,
|
10 |
-
"total_evaluation_time_secondes": "413.9145154496655",
|
11 |
-
"model_name": "AI-MO/CodeLlama-13b-Python-hf-sft",
|
12 |
-
"model_sha": "a04357d1fbd0c17d86ec3aff74a5f6196ce49b8a",
|
13 |
-
"model_dtype": "torch.bfloat16",
|
14 |
-
"model_size": "24.56 GB",
|
15 |
-
"config": null
|
16 |
-
},
|
17 |
-
"results": {
|
18 |
-
"custom|aimo_kaggle:v0|0": {
|
19 |
-
"qem": 0.02,
|
20 |
-
"qem_stderr": 0.01999999999999998
|
21 |
-
},
|
22 |
-
"all": {
|
23 |
-
"qem": 0.02,
|
24 |
-
"qem_stderr": 0.01999999999999998
|
25 |
-
}
|
26 |
-
},
|
27 |
-
"versions": {
|
28 |
-
"custom|aimo_kaggle:v0|0": 0
|
29 |
-
},
|
30 |
-
"config_tasks": {
|
31 |
-
"custom|aimo_kaggle:v0": {
|
32 |
-
"name": "aimo_kaggle:v0",
|
33 |
-
"prompt_function": "kaggle_prompt_fn",
|
34 |
-
"hf_repo": "AI-MO/kaggle-validation-set",
|
35 |
-
"hf_subset": "v0",
|
36 |
-
"metric": [
|
37 |
-
"quasi_exact_match_math"
|
38 |
-
],
|
39 |
-
"hf_avail_splits": [
|
40 |
-
"train"
|
41 |
-
],
|
42 |
-
"evaluation_splits": [
|
43 |
-
"train"
|
44 |
-
],
|
45 |
-
"few_shots_split": null,
|
46 |
-
"few_shots_select": null,
|
47 |
-
"generation_size": 2048,
|
48 |
-
"stop_sequence": null,
|
49 |
-
"output_regex": null,
|
50 |
-
"num_samples": null,
|
51 |
-
"frozen": false,
|
52 |
-
"suite": [
|
53 |
-
"custom"
|
54 |
-
],
|
55 |
-
"original_num_docs": 50,
|
56 |
-
"effective_num_docs": 50,
|
57 |
-
"trust_dataset": null,
|
58 |
-
"must_remove_duplicate_docs": null,
|
59 |
-
"version": 0
|
60 |
-
}
|
61 |
-
},
|
62 |
-
"summary_tasks": {
|
63 |
-
"custom|aimo_kaggle:v0|0": {
|
64 |
-
"hashes": {
|
65 |
-
"hash_examples": "2a919e6b839e921a",
|
66 |
-
"hash_full_prompts": "e0424ade9e31a0fe",
|
67 |
-
"hash_input_tokens": "437c7d81b73a6e7a",
|
68 |
-
"hash_cont_tokens": "a7d90f3464fc3428"
|
69 |
-
},
|
70 |
-
"truncated": 47,
|
71 |
-
"non_truncated": 3,
|
72 |
-
"padded": 34,
|
73 |
-
"non_padded": 16,
|
74 |
-
"effective_few_shots": 0.0,
|
75 |
-
"num_truncated_few_shots": 0
|
76 |
-
}
|
77 |
-
},
|
78 |
-
"summary_general": {
|
79 |
-
"hashes": {
|
80 |
-
"hash_examples": "f59ec6c4e24c1226",
|
81 |
-
"hash_full_prompts": "35e1e25418ccb6f7",
|
82 |
-
"hash_input_tokens": "c227167cd34ee1ce",
|
83 |
-
"hash_cont_tokens": "5834780a779d0112"
|
84 |
-
},
|
85 |
-
"truncated": 47,
|
86 |
-
"non_truncated": 3,
|
87 |
-
"padded": 34,
|
88 |
-
"non_padded": 16,
|
89 |
-
"num_truncated_few_shots": 0
|
90 |
-
}
|
91 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.00/aimo_kaggle_hard/results_2024-06-01T04-41-15.206832.json
DELETED
@@ -1,91 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"override_batch_size": 4,
|
6 |
-
"max_samples": null,
|
7 |
-
"job_id": "",
|
8 |
-
"start_time": 2914608.793842847,
|
9 |
-
"end_time": 2915055.215341104,
|
10 |
-
"total_evaluation_time_secondes": "446.42149825720116",
|
11 |
-
"model_name": "AI-MO/CodeLlama-13b-Python-hf-sft",
|
12 |
-
"model_sha": "a04357d1fbd0c17d86ec3aff74a5f6196ce49b8a",
|
13 |
-
"model_dtype": "torch.bfloat16",
|
14 |
-
"model_size": "24.56 GB",
|
15 |
-
"config": null
|
16 |
-
},
|
17 |
-
"results": {
|
18 |
-
"custom|aimo_kaggle_hard:v0|0": {
|
19 |
-
"qem": 0.0,
|
20 |
-
"qem_stderr": 0.0
|
21 |
-
},
|
22 |
-
"all": {
|
23 |
-
"qem": 0.0,
|
24 |
-
"qem_stderr": 0.0
|
25 |
-
}
|
26 |
-
},
|
27 |
-
"versions": {
|
28 |
-
"custom|aimo_kaggle_hard:v0|0": 0
|
29 |
-
},
|
30 |
-
"config_tasks": {
|
31 |
-
"custom|aimo_kaggle_hard:v0": {
|
32 |
-
"name": "aimo_kaggle_hard:v0",
|
33 |
-
"prompt_function": "kaggle_hard_prompt_fn",
|
34 |
-
"hf_repo": "AI-MO/kaggle-validation-set-hard",
|
35 |
-
"hf_subset": "v0",
|
36 |
-
"metric": [
|
37 |
-
"quasi_exact_match_math"
|
38 |
-
],
|
39 |
-
"hf_avail_splits": [
|
40 |
-
"train"
|
41 |
-
],
|
42 |
-
"evaluation_splits": [
|
43 |
-
"train"
|
44 |
-
],
|
45 |
-
"few_shots_split": null,
|
46 |
-
"few_shots_select": null,
|
47 |
-
"generation_size": 2048,
|
48 |
-
"stop_sequence": null,
|
49 |
-
"output_regex": null,
|
50 |
-
"num_samples": null,
|
51 |
-
"frozen": false,
|
52 |
-
"suite": [
|
53 |
-
"custom"
|
54 |
-
],
|
55 |
-
"original_num_docs": 50,
|
56 |
-
"effective_num_docs": 50,
|
57 |
-
"trust_dataset": null,
|
58 |
-
"must_remove_duplicate_docs": null,
|
59 |
-
"version": 0
|
60 |
-
}
|
61 |
-
},
|
62 |
-
"summary_tasks": {
|
63 |
-
"custom|aimo_kaggle_hard:v0|0": {
|
64 |
-
"hashes": {
|
65 |
-
"hash_examples": "b40b6a493a95bf77",
|
66 |
-
"hash_full_prompts": "2c0f18269c13af34",
|
67 |
-
"hash_input_tokens": "b477f3b96ee9e7ba",
|
68 |
-
"hash_cont_tokens": "5a4ca24432d580a0"
|
69 |
-
},
|
70 |
-
"truncated": 50,
|
71 |
-
"non_truncated": 0,
|
72 |
-
"padded": 37,
|
73 |
-
"non_padded": 13,
|
74 |
-
"effective_few_shots": 0.0,
|
75 |
-
"num_truncated_few_shots": 0
|
76 |
-
}
|
77 |
-
},
|
78 |
-
"summary_general": {
|
79 |
-
"hashes": {
|
80 |
-
"hash_examples": "79dbebcff6acad9e",
|
81 |
-
"hash_full_prompts": "739e6bdec3671fc6",
|
82 |
-
"hash_input_tokens": "16e2e8b5ca238d88",
|
83 |
-
"hash_cont_tokens": "cac9942ec74ecf73"
|
84 |
-
},
|
85 |
-
"truncated": 50,
|
86 |
-
"non_truncated": 0,
|
87 |
-
"padded": 37,
|
88 |
-
"non_padded": 13,
|
89 |
-
"num_truncated_few_shots": 0
|
90 |
-
}
|
91 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.00/aimo_kaggle_medium/results_2024-06-01T04-37-13.624182.json
DELETED
@@ -1,91 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"override_batch_size": 4,
|
6 |
-
"max_samples": null,
|
7 |
-
"job_id": "",
|
8 |
-
"start_time": 2020.568049556,
|
9 |
-
"end_time": 2348.699219677,
|
10 |
-
"total_evaluation_time_secondes": "328.1311701210002",
|
11 |
-
"model_name": "AI-MO/CodeLlama-13b-Python-hf-sft",
|
12 |
-
"model_sha": "a04357d1fbd0c17d86ec3aff74a5f6196ce49b8a",
|
13 |
-
"model_dtype": "torch.bfloat16",
|
14 |
-
"model_size": "24.56 GB",
|
15 |
-
"config": null
|
16 |
-
},
|
17 |
-
"results": {
|
18 |
-
"custom|aimo_kaggle_medium:v0|0": {
|
19 |
-
"qem": 0.05,
|
20 |
-
"qem_stderr": 0.03489912202260562
|
21 |
-
},
|
22 |
-
"all": {
|
23 |
-
"qem": 0.05,
|
24 |
-
"qem_stderr": 0.03489912202260562
|
25 |
-
}
|
26 |
-
},
|
27 |
-
"versions": {
|
28 |
-
"custom|aimo_kaggle_medium:v0|0": 0
|
29 |
-
},
|
30 |
-
"config_tasks": {
|
31 |
-
"custom|aimo_kaggle_medium:v0": {
|
32 |
-
"name": "aimo_kaggle_medium:v0",
|
33 |
-
"prompt_function": "kaggle_medium_prompt_fn",
|
34 |
-
"hf_repo": "AI-MO/kaggle-validation-set-medium",
|
35 |
-
"hf_subset": "v0",
|
36 |
-
"metric": [
|
37 |
-
"quasi_exact_match_math"
|
38 |
-
],
|
39 |
-
"hf_avail_splits": [
|
40 |
-
"train"
|
41 |
-
],
|
42 |
-
"evaluation_splits": [
|
43 |
-
"train"
|
44 |
-
],
|
45 |
-
"few_shots_split": null,
|
46 |
-
"few_shots_select": null,
|
47 |
-
"generation_size": 2048,
|
48 |
-
"stop_sequence": null,
|
49 |
-
"output_regex": null,
|
50 |
-
"num_samples": null,
|
51 |
-
"frozen": false,
|
52 |
-
"suite": [
|
53 |
-
"custom"
|
54 |
-
],
|
55 |
-
"original_num_docs": 40,
|
56 |
-
"effective_num_docs": 40,
|
57 |
-
"trust_dataset": null,
|
58 |
-
"must_remove_duplicate_docs": null,
|
59 |
-
"version": 0
|
60 |
-
}
|
61 |
-
},
|
62 |
-
"summary_tasks": {
|
63 |
-
"custom|aimo_kaggle_medium:v0|0": {
|
64 |
-
"hashes": {
|
65 |
-
"hash_examples": "3401efda8b0cbcb5",
|
66 |
-
"hash_full_prompts": "14cb6646c78f810c",
|
67 |
-
"hash_input_tokens": "d1d016040eb15856",
|
68 |
-
"hash_cont_tokens": "6fadc9f927a1b164"
|
69 |
-
},
|
70 |
-
"truncated": 40,
|
71 |
-
"non_truncated": 0,
|
72 |
-
"padded": 32,
|
73 |
-
"non_padded": 8,
|
74 |
-
"effective_few_shots": 0.0,
|
75 |
-
"num_truncated_few_shots": 0
|
76 |
-
}
|
77 |
-
},
|
78 |
-
"summary_general": {
|
79 |
-
"hashes": {
|
80 |
-
"hash_examples": "4c81d27cfdb9d737",
|
81 |
-
"hash_full_prompts": "0d4787c840fc98b7",
|
82 |
-
"hash_input_tokens": "7e52a8d7f3e15dcc",
|
83 |
-
"hash_cont_tokens": "132f6ddffd6c517b"
|
84 |
-
},
|
85 |
-
"truncated": 40,
|
86 |
-
"non_truncated": 0,
|
87 |
-
"padded": 32,
|
88 |
-
"non_padded": 8,
|
89 |
-
"num_truncated_few_shots": 0
|
90 |
-
}
|
91 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.10/aimo_kaggle/results_2024-06-02T19-57-15.384420.json
DELETED
@@ -1,91 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"override_batch_size": 4,
|
6 |
-
"max_samples": null,
|
7 |
-
"job_id": "",
|
8 |
-
"start_time": 280489.090456043,
|
9 |
-
"end_time": 280743.345045138,
|
10 |
-
"total_evaluation_time_secondes": "254.254589094955",
|
11 |
-
"model_name": "AI-MO/CodeLlama-13b-Python-hf-sft",
|
12 |
-
"model_sha": "476392a1119138a234ac9ebb5cb9de3ad3f2c637",
|
13 |
-
"model_dtype": "torch.bfloat16",
|
14 |
-
"model_size": "24.56 GB",
|
15 |
-
"config": null
|
16 |
-
},
|
17 |
-
"results": {
|
18 |
-
"custom|aimo_kaggle:v0|0": {
|
19 |
-
"qem": 0.14,
|
20 |
-
"qem_stderr": 0.04956957592256421
|
21 |
-
},
|
22 |
-
"all": {
|
23 |
-
"qem": 0.14,
|
24 |
-
"qem_stderr": 0.04956957592256421
|
25 |
-
}
|
26 |
-
},
|
27 |
-
"versions": {
|
28 |
-
"custom|aimo_kaggle:v0|0": 0
|
29 |
-
},
|
30 |
-
"config_tasks": {
|
31 |
-
"custom|aimo_kaggle:v0": {
|
32 |
-
"name": "aimo_kaggle:v0",
|
33 |
-
"prompt_function": "kaggle_prompt_fn",
|
34 |
-
"hf_repo": "AI-MO/kaggle-validation-set",
|
35 |
-
"hf_subset": "v0",
|
36 |
-
"metric": [
|
37 |
-
"quasi_exact_match_math"
|
38 |
-
],
|
39 |
-
"hf_avail_splits": [
|
40 |
-
"train"
|
41 |
-
],
|
42 |
-
"evaluation_splits": [
|
43 |
-
"train"
|
44 |
-
],
|
45 |
-
"few_shots_split": null,
|
46 |
-
"few_shots_select": null,
|
47 |
-
"generation_size": 2048,
|
48 |
-
"stop_sequence": null,
|
49 |
-
"output_regex": null,
|
50 |
-
"num_samples": null,
|
51 |
-
"frozen": false,
|
52 |
-
"suite": [
|
53 |
-
"custom"
|
54 |
-
],
|
55 |
-
"original_num_docs": 50,
|
56 |
-
"effective_num_docs": 50,
|
57 |
-
"trust_dataset": null,
|
58 |
-
"must_remove_duplicate_docs": null,
|
59 |
-
"version": 0
|
60 |
-
}
|
61 |
-
},
|
62 |
-
"summary_tasks": {
|
63 |
-
"custom|aimo_kaggle:v0|0": {
|
64 |
-
"hashes": {
|
65 |
-
"hash_examples": "2a919e6b839e921a",
|
66 |
-
"hash_full_prompts": "e0424ade9e31a0fe",
|
67 |
-
"hash_input_tokens": "437c7d81b73a6e7a",
|
68 |
-
"hash_cont_tokens": "bcbff7ea8a042b8e"
|
69 |
-
},
|
70 |
-
"truncated": 47,
|
71 |
-
"non_truncated": 3,
|
72 |
-
"padded": 34,
|
73 |
-
"non_padded": 16,
|
74 |
-
"effective_few_shots": 0.0,
|
75 |
-
"num_truncated_few_shots": 0
|
76 |
-
}
|
77 |
-
},
|
78 |
-
"summary_general": {
|
79 |
-
"hashes": {
|
80 |
-
"hash_examples": "f59ec6c4e24c1226",
|
81 |
-
"hash_full_prompts": "35e1e25418ccb6f7",
|
82 |
-
"hash_input_tokens": "c227167cd34ee1ce",
|
83 |
-
"hash_cont_tokens": "f4e1e6cc618093dc"
|
84 |
-
},
|
85 |
-
"truncated": 47,
|
86 |
-
"non_truncated": 3,
|
87 |
-
"padded": 34,
|
88 |
-
"non_padded": 16,
|
89 |
-
"num_truncated_few_shots": 0
|
90 |
-
}
|
91 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.10/aimo_kaggle_hard/results_2024-06-02T20-00-55.391068.json
DELETED
@@ -1,91 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"override_batch_size": 4,
|
6 |
-
"max_samples": null,
|
7 |
-
"job_id": "",
|
8 |
-
"start_time": 7728.073363246,
|
9 |
-
"end_time": 8024.43010322,
|
10 |
-
"total_evaluation_time_secondes": "296.35673997400045",
|
11 |
-
"model_name": "AI-MO/CodeLlama-13b-Python-hf-sft",
|
12 |
-
"model_sha": "476392a1119138a234ac9ebb5cb9de3ad3f2c637",
|
13 |
-
"model_dtype": "torch.bfloat16",
|
14 |
-
"model_size": "24.56 GB",
|
15 |
-
"config": null
|
16 |
-
},
|
17 |
-
"results": {
|
18 |
-
"custom|aimo_kaggle_hard:v0|0": {
|
19 |
-
"qem": 0.02,
|
20 |
-
"qem_stderr": 0.01999999999999998
|
21 |
-
},
|
22 |
-
"all": {
|
23 |
-
"qem": 0.02,
|
24 |
-
"qem_stderr": 0.01999999999999998
|
25 |
-
}
|
26 |
-
},
|
27 |
-
"versions": {
|
28 |
-
"custom|aimo_kaggle_hard:v0|0": 0
|
29 |
-
},
|
30 |
-
"config_tasks": {
|
31 |
-
"custom|aimo_kaggle_hard:v0": {
|
32 |
-
"name": "aimo_kaggle_hard:v0",
|
33 |
-
"prompt_function": "kaggle_hard_prompt_fn",
|
34 |
-
"hf_repo": "AI-MO/kaggle-validation-set-hard",
|
35 |
-
"hf_subset": "v0",
|
36 |
-
"metric": [
|
37 |
-
"quasi_exact_match_math"
|
38 |
-
],
|
39 |
-
"hf_avail_splits": [
|
40 |
-
"train"
|
41 |
-
],
|
42 |
-
"evaluation_splits": [
|
43 |
-
"train"
|
44 |
-
],
|
45 |
-
"few_shots_split": null,
|
46 |
-
"few_shots_select": null,
|
47 |
-
"generation_size": 2048,
|
48 |
-
"stop_sequence": null,
|
49 |
-
"output_regex": null,
|
50 |
-
"num_samples": null,
|
51 |
-
"frozen": false,
|
52 |
-
"suite": [
|
53 |
-
"custom"
|
54 |
-
],
|
55 |
-
"original_num_docs": 50,
|
56 |
-
"effective_num_docs": 50,
|
57 |
-
"trust_dataset": null,
|
58 |
-
"must_remove_duplicate_docs": null,
|
59 |
-
"version": 0
|
60 |
-
}
|
61 |
-
},
|
62 |
-
"summary_tasks": {
|
63 |
-
"custom|aimo_kaggle_hard:v0|0": {
|
64 |
-
"hashes": {
|
65 |
-
"hash_examples": "b40b6a493a95bf77",
|
66 |
-
"hash_full_prompts": "2c0f18269c13af34",
|
67 |
-
"hash_input_tokens": "b477f3b96ee9e7ba",
|
68 |
-
"hash_cont_tokens": "c04ec37bbda4d8b2"
|
69 |
-
},
|
70 |
-
"truncated": 50,
|
71 |
-
"non_truncated": 0,
|
72 |
-
"padded": 37,
|
73 |
-
"non_padded": 13,
|
74 |
-
"effective_few_shots": 0.0,
|
75 |
-
"num_truncated_few_shots": 0
|
76 |
-
}
|
77 |
-
},
|
78 |
-
"summary_general": {
|
79 |
-
"hashes": {
|
80 |
-
"hash_examples": "79dbebcff6acad9e",
|
81 |
-
"hash_full_prompts": "739e6bdec3671fc6",
|
82 |
-
"hash_input_tokens": "16e2e8b5ca238d88",
|
83 |
-
"hash_cont_tokens": "fad45544c423ae77"
|
84 |
-
},
|
85 |
-
"truncated": 50,
|
86 |
-
"non_truncated": 0,
|
87 |
-
"padded": 37,
|
88 |
-
"non_padded": 13,
|
89 |
-
"num_truncated_few_shots": 0
|
90 |
-
}
|
91 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.10/aimo_kaggle_medium/results_2024-06-02T19-58-49.662787.json
DELETED
@@ -1,91 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"override_batch_size": 4,
|
6 |
-
"max_samples": null,
|
7 |
-
"job_id": "",
|
8 |
-
"start_time": 812559.720312176,
|
9 |
-
"end_time": 812808.702969616,
|
10 |
-
"total_evaluation_time_secondes": "248.98265744000673",
|
11 |
-
"model_name": "AI-MO/CodeLlama-13b-Python-hf-sft",
|
12 |
-
"model_sha": "476392a1119138a234ac9ebb5cb9de3ad3f2c637",
|
13 |
-
"model_dtype": "torch.bfloat16",
|
14 |
-
"model_size": "24.56 GB",
|
15 |
-
"config": null
|
16 |
-
},
|
17 |
-
"results": {
|
18 |
-
"custom|aimo_kaggle_medium:v0|0": {
|
19 |
-
"qem": 0.15,
|
20 |
-
"qem_stderr": 0.05717718748968655
|
21 |
-
},
|
22 |
-
"all": {
|
23 |
-
"qem": 0.15,
|
24 |
-
"qem_stderr": 0.05717718748968655
|
25 |
-
}
|
26 |
-
},
|
27 |
-
"versions": {
|
28 |
-
"custom|aimo_kaggle_medium:v0|0": 0
|
29 |
-
},
|
30 |
-
"config_tasks": {
|
31 |
-
"custom|aimo_kaggle_medium:v0": {
|
32 |
-
"name": "aimo_kaggle_medium:v0",
|
33 |
-
"prompt_function": "kaggle_medium_prompt_fn",
|
34 |
-
"hf_repo": "AI-MO/kaggle-validation-set-medium",
|
35 |
-
"hf_subset": "v0",
|
36 |
-
"metric": [
|
37 |
-
"quasi_exact_match_math"
|
38 |
-
],
|
39 |
-
"hf_avail_splits": [
|
40 |
-
"train"
|
41 |
-
],
|
42 |
-
"evaluation_splits": [
|
43 |
-
"train"
|
44 |
-
],
|
45 |
-
"few_shots_split": null,
|
46 |
-
"few_shots_select": null,
|
47 |
-
"generation_size": 2048,
|
48 |
-
"stop_sequence": null,
|
49 |
-
"output_regex": null,
|
50 |
-
"num_samples": null,
|
51 |
-
"frozen": false,
|
52 |
-
"suite": [
|
53 |
-
"custom"
|
54 |
-
],
|
55 |
-
"original_num_docs": 40,
|
56 |
-
"effective_num_docs": 40,
|
57 |
-
"trust_dataset": null,
|
58 |
-
"must_remove_duplicate_docs": null,
|
59 |
-
"version": 0
|
60 |
-
}
|
61 |
-
},
|
62 |
-
"summary_tasks": {
|
63 |
-
"custom|aimo_kaggle_medium:v0|0": {
|
64 |
-
"hashes": {
|
65 |
-
"hash_examples": "3401efda8b0cbcb5",
|
66 |
-
"hash_full_prompts": "14cb6646c78f810c",
|
67 |
-
"hash_input_tokens": "d1d016040eb15856",
|
68 |
-
"hash_cont_tokens": "170d64e143cf6996"
|
69 |
-
},
|
70 |
-
"truncated": 40,
|
71 |
-
"non_truncated": 0,
|
72 |
-
"padded": 32,
|
73 |
-
"non_padded": 8,
|
74 |
-
"effective_few_shots": 0.0,
|
75 |
-
"num_truncated_few_shots": 0
|
76 |
-
}
|
77 |
-
},
|
78 |
-
"summary_general": {
|
79 |
-
"hashes": {
|
80 |
-
"hash_examples": "4c81d27cfdb9d737",
|
81 |
-
"hash_full_prompts": "0d4787c840fc98b7",
|
82 |
-
"hash_input_tokens": "7e52a8d7f3e15dcc",
|
83 |
-
"hash_cont_tokens": "da7df27fcd9b9bb1"
|
84 |
-
},
|
85 |
-
"truncated": 40,
|
86 |
-
"non_truncated": 0,
|
87 |
-
"padded": 32,
|
88 |
-
"non_padded": 8,
|
89 |
-
"num_truncated_few_shots": 0
|
90 |
-
}
|
91 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.11/aimo_kaggle/results_2024-06-01T04-50-49.856519.json
DELETED
@@ -1,91 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"override_batch_size": 4,
|
6 |
-
"max_samples": null,
|
7 |
-
"job_id": "",
|
8 |
-
"start_time": 7331945.252107517,
|
9 |
-
"end_time": 7332404.323357863,
|
10 |
-
"total_evaluation_time_secondes": "459.07125034648925",
|
11 |
-
"model_name": "AI-MO/CodeLlama-13b-Python-hf-sft",
|
12 |
-
"model_sha": "fcb7fe46d55df8d593ea5cb04e46c01949542360",
|
13 |
-
"model_dtype": "torch.bfloat16",
|
14 |
-
"model_size": "24.56 GB",
|
15 |
-
"config": null
|
16 |
-
},
|
17 |
-
"results": {
|
18 |
-
"custom|aimo_kaggle:v0|0": {
|
19 |
-
"qem": 0.08,
|
20 |
-
"qem_stderr": 0.038756171332144415
|
21 |
-
},
|
22 |
-
"all": {
|
23 |
-
"qem": 0.08,
|
24 |
-
"qem_stderr": 0.038756171332144415
|
25 |
-
}
|
26 |
-
},
|
27 |
-
"versions": {
|
28 |
-
"custom|aimo_kaggle:v0|0": 0
|
29 |
-
},
|
30 |
-
"config_tasks": {
|
31 |
-
"custom|aimo_kaggle:v0": {
|
32 |
-
"name": "aimo_kaggle:v0",
|
33 |
-
"prompt_function": "kaggle_prompt_fn",
|
34 |
-
"hf_repo": "AI-MO/kaggle-validation-set",
|
35 |
-
"hf_subset": "v0",
|
36 |
-
"metric": [
|
37 |
-
"quasi_exact_match_math"
|
38 |
-
],
|
39 |
-
"hf_avail_splits": [
|
40 |
-
"train"
|
41 |
-
],
|
42 |
-
"evaluation_splits": [
|
43 |
-
"train"
|
44 |
-
],
|
45 |
-
"few_shots_split": null,
|
46 |
-
"few_shots_select": null,
|
47 |
-
"generation_size": 2048,
|
48 |
-
"stop_sequence": null,
|
49 |
-
"output_regex": null,
|
50 |
-
"num_samples": null,
|
51 |
-
"frozen": false,
|
52 |
-
"suite": [
|
53 |
-
"custom"
|
54 |
-
],
|
55 |
-
"original_num_docs": 50,
|
56 |
-
"effective_num_docs": 50,
|
57 |
-
"trust_dataset": null,
|
58 |
-
"must_remove_duplicate_docs": null,
|
59 |
-
"version": 0
|
60 |
-
}
|
61 |
-
},
|
62 |
-
"summary_tasks": {
|
63 |
-
"custom|aimo_kaggle:v0|0": {
|
64 |
-
"hashes": {
|
65 |
-
"hash_examples": "2a919e6b839e921a",
|
66 |
-
"hash_full_prompts": "e0424ade9e31a0fe",
|
67 |
-
"hash_input_tokens": "437c7d81b73a6e7a",
|
68 |
-
"hash_cont_tokens": "75baf1c260a5957f"
|
69 |
-
},
|
70 |
-
"truncated": 47,
|
71 |
-
"non_truncated": 3,
|
72 |
-
"padded": 34,
|
73 |
-
"non_padded": 16,
|
74 |
-
"effective_few_shots": 0.0,
|
75 |
-
"num_truncated_few_shots": 0
|
76 |
-
}
|
77 |
-
},
|
78 |
-
"summary_general": {
|
79 |
-
"hashes": {
|
80 |
-
"hash_examples": "f59ec6c4e24c1226",
|
81 |
-
"hash_full_prompts": "35e1e25418ccb6f7",
|
82 |
-
"hash_input_tokens": "c227167cd34ee1ce",
|
83 |
-
"hash_cont_tokens": "e0a0c309948e7ef6"
|
84 |
-
},
|
85 |
-
"truncated": 47,
|
86 |
-
"non_truncated": 3,
|
87 |
-
"padded": 34,
|
88 |
-
"non_padded": 16,
|
89 |
-
"num_truncated_few_shots": 0
|
90 |
-
}
|
91 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.11/aimo_kaggle_hard/results_2024-06-01T04-52-36.232279.json
DELETED
@@ -1,91 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"override_batch_size": 4,
|
6 |
-
"max_samples": null,
|
7 |
-
"job_id": "",
|
8 |
-
"start_time": 2843.619818643,
|
9 |
-
"end_time": 3271.30727537,
|
10 |
-
"total_evaluation_time_secondes": "427.6874567269997",
|
11 |
-
"model_name": "AI-MO/CodeLlama-13b-Python-hf-sft",
|
12 |
-
"model_sha": "fcb7fe46d55df8d593ea5cb04e46c01949542360",
|
13 |
-
"model_dtype": "torch.bfloat16",
|
14 |
-
"model_size": "24.56 GB",
|
15 |
-
"config": null
|
16 |
-
},
|
17 |
-
"results": {
|
18 |
-
"custom|aimo_kaggle_hard:v0|0": {
|
19 |
-
"qem": 0.02,
|
20 |
-
"qem_stderr": 0.01999999999999998
|
21 |
-
},
|
22 |
-
"all": {
|
23 |
-
"qem": 0.02,
|
24 |
-
"qem_stderr": 0.01999999999999998
|
25 |
-
}
|
26 |
-
},
|
27 |
-
"versions": {
|
28 |
-
"custom|aimo_kaggle_hard:v0|0": 0
|
29 |
-
},
|
30 |
-
"config_tasks": {
|
31 |
-
"custom|aimo_kaggle_hard:v0": {
|
32 |
-
"name": "aimo_kaggle_hard:v0",
|
33 |
-
"prompt_function": "kaggle_hard_prompt_fn",
|
34 |
-
"hf_repo": "AI-MO/kaggle-validation-set-hard",
|
35 |
-
"hf_subset": "v0",
|
36 |
-
"metric": [
|
37 |
-
"quasi_exact_match_math"
|
38 |
-
],
|
39 |
-
"hf_avail_splits": [
|
40 |
-
"train"
|
41 |
-
],
|
42 |
-
"evaluation_splits": [
|
43 |
-
"train"
|
44 |
-
],
|
45 |
-
"few_shots_split": null,
|
46 |
-
"few_shots_select": null,
|
47 |
-
"generation_size": 2048,
|
48 |
-
"stop_sequence": null,
|
49 |
-
"output_regex": null,
|
50 |
-
"num_samples": null,
|
51 |
-
"frozen": false,
|
52 |
-
"suite": [
|
53 |
-
"custom"
|
54 |
-
],
|
55 |
-
"original_num_docs": 50,
|
56 |
-
"effective_num_docs": 50,
|
57 |
-
"trust_dataset": null,
|
58 |
-
"must_remove_duplicate_docs": null,
|
59 |
-
"version": 0
|
60 |
-
}
|
61 |
-
},
|
62 |
-
"summary_tasks": {
|
63 |
-
"custom|aimo_kaggle_hard:v0|0": {
|
64 |
-
"hashes": {
|
65 |
-
"hash_examples": "b40b6a493a95bf77",
|
66 |
-
"hash_full_prompts": "2c0f18269c13af34",
|
67 |
-
"hash_input_tokens": "b477f3b96ee9e7ba",
|
68 |
-
"hash_cont_tokens": "98a32e6b1fbbd588"
|
69 |
-
},
|
70 |
-
"truncated": 50,
|
71 |
-
"non_truncated": 0,
|
72 |
-
"padded": 37,
|
73 |
-
"non_padded": 13,
|
74 |
-
"effective_few_shots": 0.0,
|
75 |
-
"num_truncated_few_shots": 0
|
76 |
-
}
|
77 |
-
},
|
78 |
-
"summary_general": {
|
79 |
-
"hashes": {
|
80 |
-
"hash_examples": "79dbebcff6acad9e",
|
81 |
-
"hash_full_prompts": "739e6bdec3671fc6",
|
82 |
-
"hash_input_tokens": "16e2e8b5ca238d88",
|
83 |
-
"hash_cont_tokens": "87ead9b0afc5ce14"
|
84 |
-
},
|
85 |
-
"truncated": 50,
|
86 |
-
"non_truncated": 0,
|
87 |
-
"padded": 37,
|
88 |
-
"non_padded": 13,
|
89 |
-
"num_truncated_few_shots": 0
|
90 |
-
}
|
91 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.11/aimo_kaggle_medium/results_2024-06-01T04-50-32.122163.json
DELETED
@@ -1,91 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"override_batch_size": 4,
|
6 |
-
"max_samples": null,
|
7 |
-
"job_id": "",
|
8 |
-
"start_time": 4455054.814072875,
|
9 |
-
"end_time": 4455474.057915317,
|
10 |
-
"total_evaluation_time_secondes": "419.24384244158864",
|
11 |
-
"model_name": "AI-MO/CodeLlama-13b-Python-hf-sft",
|
12 |
-
"model_sha": "fcb7fe46d55df8d593ea5cb04e46c01949542360",
|
13 |
-
"model_dtype": "torch.bfloat16",
|
14 |
-
"model_size": "24.56 GB",
|
15 |
-
"config": null
|
16 |
-
},
|
17 |
-
"results": {
|
18 |
-
"custom|aimo_kaggle_medium:v0|0": {
|
19 |
-
"qem": 0.075,
|
20 |
-
"qem_stderr": 0.04217636961434869
|
21 |
-
},
|
22 |
-
"all": {
|
23 |
-
"qem": 0.075,
|
24 |
-
"qem_stderr": 0.04217636961434869
|
25 |
-
}
|
26 |
-
},
|
27 |
-
"versions": {
|
28 |
-
"custom|aimo_kaggle_medium:v0|0": 0
|
29 |
-
},
|
30 |
-
"config_tasks": {
|
31 |
-
"custom|aimo_kaggle_medium:v0": {
|
32 |
-
"name": "aimo_kaggle_medium:v0",
|
33 |
-
"prompt_function": "kaggle_medium_prompt_fn",
|
34 |
-
"hf_repo": "AI-MO/kaggle-validation-set-medium",
|
35 |
-
"hf_subset": "v0",
|
36 |
-
"metric": [
|
37 |
-
"quasi_exact_match_math"
|
38 |
-
],
|
39 |
-
"hf_avail_splits": [
|
40 |
-
"train"
|
41 |
-
],
|
42 |
-
"evaluation_splits": [
|
43 |
-
"train"
|
44 |
-
],
|
45 |
-
"few_shots_split": null,
|
46 |
-
"few_shots_select": null,
|
47 |
-
"generation_size": 2048,
|
48 |
-
"stop_sequence": null,
|
49 |
-
"output_regex": null,
|
50 |
-
"num_samples": null,
|
51 |
-
"frozen": false,
|
52 |
-
"suite": [
|
53 |
-
"custom"
|
54 |
-
],
|
55 |
-
"original_num_docs": 40,
|
56 |
-
"effective_num_docs": 40,
|
57 |
-
"trust_dataset": null,
|
58 |
-
"must_remove_duplicate_docs": null,
|
59 |
-
"version": 0
|
60 |
-
}
|
61 |
-
},
|
62 |
-
"summary_tasks": {
|
63 |
-
"custom|aimo_kaggle_medium:v0|0": {
|
64 |
-
"hashes": {
|
65 |
-
"hash_examples": "3401efda8b0cbcb5",
|
66 |
-
"hash_full_prompts": "14cb6646c78f810c",
|
67 |
-
"hash_input_tokens": "d1d016040eb15856",
|
68 |
-
"hash_cont_tokens": "dc0b3975a4eba9fa"
|
69 |
-
},
|
70 |
-
"truncated": 40,
|
71 |
-
"non_truncated": 0,
|
72 |
-
"padded": 32,
|
73 |
-
"non_padded": 8,
|
74 |
-
"effective_few_shots": 0.0,
|
75 |
-
"num_truncated_few_shots": 0
|
76 |
-
}
|
77 |
-
},
|
78 |
-
"summary_general": {
|
79 |
-
"hashes": {
|
80 |
-
"hash_examples": "4c81d27cfdb9d737",
|
81 |
-
"hash_full_prompts": "0d4787c840fc98b7",
|
82 |
-
"hash_input_tokens": "7e52a8d7f3e15dcc",
|
83 |
-
"hash_cont_tokens": "731eb081fd0d7aad"
|
84 |
-
},
|
85 |
-
"truncated": 40,
|
86 |
-
"non_truncated": 0,
|
87 |
-
"padded": 32,
|
88 |
-
"non_padded": 8,
|
89 |
-
"num_truncated_few_shots": 0
|
90 |
-
}
|
91 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.12/aimo_kaggle/results_2024-06-02T11-36-26.669791.json
DELETED
@@ -1,91 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"override_batch_size": 4,
|
6 |
-
"max_samples": null,
|
7 |
-
"job_id": "",
|
8 |
-
"start_time": 1863225.455479811,
|
9 |
-
"end_time": 1863567.627801796,
|
10 |
-
"total_evaluation_time_secondes": "342.17232198501006",
|
11 |
-
"model_name": "AI-MO/CodeLlama-13b-Python-hf-sft",
|
12 |
-
"model_sha": "0a2526a35e7f08f87c44b3c72e6967a772ac0df5",
|
13 |
-
"model_dtype": "torch.bfloat16",
|
14 |
-
"model_size": "24.56 GB",
|
15 |
-
"config": null
|
16 |
-
},
|
17 |
-
"results": {
|
18 |
-
"custom|aimo_kaggle:v0|0": {
|
19 |
-
"qem": 0.16,
|
20 |
-
"qem_stderr": 0.052372293656638154
|
21 |
-
},
|
22 |
-
"all": {
|
23 |
-
"qem": 0.16,
|
24 |
-
"qem_stderr": 0.052372293656638154
|
25 |
-
}
|
26 |
-
},
|
27 |
-
"versions": {
|
28 |
-
"custom|aimo_kaggle:v0|0": 0
|
29 |
-
},
|
30 |
-
"config_tasks": {
|
31 |
-
"custom|aimo_kaggle:v0": {
|
32 |
-
"name": "aimo_kaggle:v0",
|
33 |
-
"prompt_function": "kaggle_prompt_fn",
|
34 |
-
"hf_repo": "AI-MO/kaggle-validation-set",
|
35 |
-
"hf_subset": "v0",
|
36 |
-
"metric": [
|
37 |
-
"quasi_exact_match_math"
|
38 |
-
],
|
39 |
-
"hf_avail_splits": [
|
40 |
-
"train"
|
41 |
-
],
|
42 |
-
"evaluation_splits": [
|
43 |
-
"train"
|
44 |
-
],
|
45 |
-
"few_shots_split": null,
|
46 |
-
"few_shots_select": null,
|
47 |
-
"generation_size": 2048,
|
48 |
-
"stop_sequence": null,
|
49 |
-
"output_regex": null,
|
50 |
-
"num_samples": null,
|
51 |
-
"frozen": false,
|
52 |
-
"suite": [
|
53 |
-
"custom"
|
54 |
-
],
|
55 |
-
"original_num_docs": 50,
|
56 |
-
"effective_num_docs": 50,
|
57 |
-
"trust_dataset": null,
|
58 |
-
"must_remove_duplicate_docs": null,
|
59 |
-
"version": 0
|
60 |
-
}
|
61 |
-
},
|
62 |
-
"summary_tasks": {
|
63 |
-
"custom|aimo_kaggle:v0|0": {
|
64 |
-
"hashes": {
|
65 |
-
"hash_examples": "2a919e6b839e921a",
|
66 |
-
"hash_full_prompts": "e0424ade9e31a0fe",
|
67 |
-
"hash_input_tokens": "437c7d81b73a6e7a",
|
68 |
-
"hash_cont_tokens": "6278007048507ffd"
|
69 |
-
},
|
70 |
-
"truncated": 47,
|
71 |
-
"non_truncated": 3,
|
72 |
-
"padded": 34,
|
73 |
-
"non_padded": 16,
|
74 |
-
"effective_few_shots": 0.0,
|
75 |
-
"num_truncated_few_shots": 0
|
76 |
-
}
|
77 |
-
},
|
78 |
-
"summary_general": {
|
79 |
-
"hashes": {
|
80 |
-
"hash_examples": "f59ec6c4e24c1226",
|
81 |
-
"hash_full_prompts": "35e1e25418ccb6f7",
|
82 |
-
"hash_input_tokens": "c227167cd34ee1ce",
|
83 |
-
"hash_cont_tokens": "92f875343d2ff506"
|
84 |
-
},
|
85 |
-
"truncated": 47,
|
86 |
-
"non_truncated": 3,
|
87 |
-
"padded": 34,
|
88 |
-
"non_padded": 16,
|
89 |
-
"num_truncated_few_shots": 0
|
90 |
-
}
|
91 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.12/aimo_kaggle_hard/results_2024-06-02T11-51-16.456597.json
DELETED
@@ -1,91 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"override_batch_size": 4,
|
6 |
-
"max_samples": null,
|
7 |
-
"job_id": "",
|
8 |
-
"start_time": 1864038.522646221,
|
9 |
-
"end_time": 1864457.414652514,
|
10 |
-
"total_evaluation_time_secondes": "418.8920062929392",
|
11 |
-
"model_name": "AI-MO/CodeLlama-13b-Python-hf-sft",
|
12 |
-
"model_sha": "0a2526a35e7f08f87c44b3c72e6967a772ac0df5",
|
13 |
-
"model_dtype": "torch.bfloat16",
|
14 |
-
"model_size": "24.56 GB",
|
15 |
-
"config": null
|
16 |
-
},
|
17 |
-
"results": {
|
18 |
-
"custom|aimo_kaggle_hard:v0|0": {
|
19 |
-
"qem": 0.02,
|
20 |
-
"qem_stderr": 0.01999999999999998
|
21 |
-
},
|
22 |
-
"all": {
|
23 |
-
"qem": 0.02,
|
24 |
-
"qem_stderr": 0.01999999999999998
|
25 |
-
}
|
26 |
-
},
|
27 |
-
"versions": {
|
28 |
-
"custom|aimo_kaggle_hard:v0|0": 0
|
29 |
-
},
|
30 |
-
"config_tasks": {
|
31 |
-
"custom|aimo_kaggle_hard:v0": {
|
32 |
-
"name": "aimo_kaggle_hard:v0",
|
33 |
-
"prompt_function": "kaggle_hard_prompt_fn",
|
34 |
-
"hf_repo": "AI-MO/kaggle-validation-set-hard",
|
35 |
-
"hf_subset": "v0",
|
36 |
-
"metric": [
|
37 |
-
"quasi_exact_match_math"
|
38 |
-
],
|
39 |
-
"hf_avail_splits": [
|
40 |
-
"train"
|
41 |
-
],
|
42 |
-
"evaluation_splits": [
|
43 |
-
"train"
|
44 |
-
],
|
45 |
-
"few_shots_split": null,
|
46 |
-
"few_shots_select": null,
|
47 |
-
"generation_size": 2048,
|
48 |
-
"stop_sequence": null,
|
49 |
-
"output_regex": null,
|
50 |
-
"num_samples": null,
|
51 |
-
"frozen": false,
|
52 |
-
"suite": [
|
53 |
-
"custom"
|
54 |
-
],
|
55 |
-
"original_num_docs": 50,
|
56 |
-
"effective_num_docs": 50,
|
57 |
-
"trust_dataset": null,
|
58 |
-
"must_remove_duplicate_docs": null,
|
59 |
-
"version": 0
|
60 |
-
}
|
61 |
-
},
|
62 |
-
"summary_tasks": {
|
63 |
-
"custom|aimo_kaggle_hard:v0|0": {
|
64 |
-
"hashes": {
|
65 |
-
"hash_examples": "b40b6a493a95bf77",
|
66 |
-
"hash_full_prompts": "2c0f18269c13af34",
|
67 |
-
"hash_input_tokens": "b477f3b96ee9e7ba",
|
68 |
-
"hash_cont_tokens": "dbfa4de9698673b9"
|
69 |
-
},
|
70 |
-
"truncated": 50,
|
71 |
-
"non_truncated": 0,
|
72 |
-
"padded": 37,
|
73 |
-
"non_padded": 13,
|
74 |
-
"effective_few_shots": 0.0,
|
75 |
-
"num_truncated_few_shots": 0
|
76 |
-
}
|
77 |
-
},
|
78 |
-
"summary_general": {
|
79 |
-
"hashes": {
|
80 |
-
"hash_examples": "79dbebcff6acad9e",
|
81 |
-
"hash_full_prompts": "739e6bdec3671fc6",
|
82 |
-
"hash_input_tokens": "16e2e8b5ca238d88",
|
83 |
-
"hash_cont_tokens": "3a76011d7952645f"
|
84 |
-
},
|
85 |
-
"truncated": 50,
|
86 |
-
"non_truncated": 0,
|
87 |
-
"padded": 37,
|
88 |
-
"non_padded": 13,
|
89 |
-
"num_truncated_few_shots": 0
|
90 |
-
}
|
91 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.12/aimo_kaggle_medium/results_2024-06-02T11-43-17.517072.json
DELETED
@@ -1,91 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"override_batch_size": 4,
|
6 |
-
"max_samples": null,
|
7 |
-
"job_id": "",
|
8 |
-
"start_time": 1863632.549851687,
|
9 |
-
"end_time": 1863978.475093559,
|
10 |
-
"total_evaluation_time_secondes": "345.9252418719698",
|
11 |
-
"model_name": "AI-MO/CodeLlama-13b-Python-hf-sft",
|
12 |
-
"model_sha": "0a2526a35e7f08f87c44b3c72e6967a772ac0df5",
|
13 |
-
"model_dtype": "torch.bfloat16",
|
14 |
-
"model_size": "24.56 GB",
|
15 |
-
"config": null
|
16 |
-
},
|
17 |
-
"results": {
|
18 |
-
"custom|aimo_kaggle_medium:v0|0": {
|
19 |
-
"qem": 0.125,
|
20 |
-
"qem_stderr": 0.05295740910852021
|
21 |
-
},
|
22 |
-
"all": {
|
23 |
-
"qem": 0.125,
|
24 |
-
"qem_stderr": 0.05295740910852021
|
25 |
-
}
|
26 |
-
},
|
27 |
-
"versions": {
|
28 |
-
"custom|aimo_kaggle_medium:v0|0": 0
|
29 |
-
},
|
30 |
-
"config_tasks": {
|
31 |
-
"custom|aimo_kaggle_medium:v0": {
|
32 |
-
"name": "aimo_kaggle_medium:v0",
|
33 |
-
"prompt_function": "kaggle_medium_prompt_fn",
|
34 |
-
"hf_repo": "AI-MO/kaggle-validation-set-medium",
|
35 |
-
"hf_subset": "v0",
|
36 |
-
"metric": [
|
37 |
-
"quasi_exact_match_math"
|
38 |
-
],
|
39 |
-
"hf_avail_splits": [
|
40 |
-
"train"
|
41 |
-
],
|
42 |
-
"evaluation_splits": [
|
43 |
-
"train"
|
44 |
-
],
|
45 |
-
"few_shots_split": null,
|
46 |
-
"few_shots_select": null,
|
47 |
-
"generation_size": 2048,
|
48 |
-
"stop_sequence": null,
|
49 |
-
"output_regex": null,
|
50 |
-
"num_samples": null,
|
51 |
-
"frozen": false,
|
52 |
-
"suite": [
|
53 |
-
"custom"
|
54 |
-
],
|
55 |
-
"original_num_docs": 40,
|
56 |
-
"effective_num_docs": 40,
|
57 |
-
"trust_dataset": null,
|
58 |
-
"must_remove_duplicate_docs": null,
|
59 |
-
"version": 0
|
60 |
-
}
|
61 |
-
},
|
62 |
-
"summary_tasks": {
|
63 |
-
"custom|aimo_kaggle_medium:v0|0": {
|
64 |
-
"hashes": {
|
65 |
-
"hash_examples": "3401efda8b0cbcb5",
|
66 |
-
"hash_full_prompts": "14cb6646c78f810c",
|
67 |
-
"hash_input_tokens": "d1d016040eb15856",
|
68 |
-
"hash_cont_tokens": "b150afe6ebecd8a4"
|
69 |
-
},
|
70 |
-
"truncated": 40,
|
71 |
-
"non_truncated": 0,
|
72 |
-
"padded": 32,
|
73 |
-
"non_padded": 8,
|
74 |
-
"effective_few_shots": 0.0,
|
75 |
-
"num_truncated_few_shots": 0
|
76 |
-
}
|
77 |
-
},
|
78 |
-
"summary_general": {
|
79 |
-
"hashes": {
|
80 |
-
"hash_examples": "4c81d27cfdb9d737",
|
81 |
-
"hash_full_prompts": "0d4787c840fc98b7",
|
82 |
-
"hash_input_tokens": "7e52a8d7f3e15dcc",
|
83 |
-
"hash_cont_tokens": "559308231098c15a"
|
84 |
-
},
|
85 |
-
"truncated": 40,
|
86 |
-
"non_truncated": 0,
|
87 |
-
"padded": 32,
|
88 |
-
"non_padded": 8,
|
89 |
-
"num_truncated_few_shots": 0
|
90 |
-
}
|
91 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.13/aimo_kaggle/results_2024-06-02T19-00-04.484770.json
DELETED
@@ -1,91 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"override_batch_size": 4,
|
6 |
-
"max_samples": null,
|
7 |
-
"job_id": "",
|
8 |
-
"start_time": 808881.785721693,
|
9 |
-
"end_time": 809283.524944443,
|
10 |
-
"total_evaluation_time_secondes": "401.7392227500677",
|
11 |
-
"model_name": "AI-MO/CodeLlama-13b-Python-hf-sft",
|
12 |
-
"model_sha": "543fbff304cd3d8870b73625c4df56d0bfc62625",
|
13 |
-
"model_dtype": "torch.bfloat16",
|
14 |
-
"model_size": "24.56 GB",
|
15 |
-
"config": null
|
16 |
-
},
|
17 |
-
"results": {
|
18 |
-
"custom|aimo_kaggle:v0|0": {
|
19 |
-
"qem": 0.14,
|
20 |
-
"qem_stderr": 0.0495695759225642
|
21 |
-
},
|
22 |
-
"all": {
|
23 |
-
"qem": 0.14,
|
24 |
-
"qem_stderr": 0.0495695759225642
|
25 |
-
}
|
26 |
-
},
|
27 |
-
"versions": {
|
28 |
-
"custom|aimo_kaggle:v0|0": 0
|
29 |
-
},
|
30 |
-
"config_tasks": {
|
31 |
-
"custom|aimo_kaggle:v0": {
|
32 |
-
"name": "aimo_kaggle:v0",
|
33 |
-
"prompt_function": "kaggle_prompt_fn",
|
34 |
-
"hf_repo": "AI-MO/kaggle-validation-set",
|
35 |
-
"hf_subset": "v0",
|
36 |
-
"metric": [
|
37 |
-
"quasi_exact_match_math"
|
38 |
-
],
|
39 |
-
"hf_avail_splits": [
|
40 |
-
"train"
|
41 |
-
],
|
42 |
-
"evaluation_splits": [
|
43 |
-
"train"
|
44 |
-
],
|
45 |
-
"few_shots_split": null,
|
46 |
-
"few_shots_select": null,
|
47 |
-
"generation_size": 2048,
|
48 |
-
"stop_sequence": null,
|
49 |
-
"output_regex": null,
|
50 |
-
"num_samples": null,
|
51 |
-
"frozen": false,
|
52 |
-
"suite": [
|
53 |
-
"custom"
|
54 |
-
],
|
55 |
-
"original_num_docs": 50,
|
56 |
-
"effective_num_docs": 50,
|
57 |
-
"trust_dataset": null,
|
58 |
-
"must_remove_duplicate_docs": null,
|
59 |
-
"version": 0
|
60 |
-
}
|
61 |
-
},
|
62 |
-
"summary_tasks": {
|
63 |
-
"custom|aimo_kaggle:v0|0": {
|
64 |
-
"hashes": {
|
65 |
-
"hash_examples": "2a919e6b839e921a",
|
66 |
-
"hash_full_prompts": "e0424ade9e31a0fe",
|
67 |
-
"hash_input_tokens": "437c7d81b73a6e7a",
|
68 |
-
"hash_cont_tokens": "dbdda2820e4c8275"
|
69 |
-
},
|
70 |
-
"truncated": 47,
|
71 |
-
"non_truncated": 3,
|
72 |
-
"padded": 34,
|
73 |
-
"non_padded": 16,
|
74 |
-
"effective_few_shots": 0.0,
|
75 |
-
"num_truncated_few_shots": 0
|
76 |
-
}
|
77 |
-
},
|
78 |
-
"summary_general": {
|
79 |
-
"hashes": {
|
80 |
-
"hash_examples": "f59ec6c4e24c1226",
|
81 |
-
"hash_full_prompts": "35e1e25418ccb6f7",
|
82 |
-
"hash_input_tokens": "c227167cd34ee1ce",
|
83 |
-
"hash_cont_tokens": "f9eb2e1b31a88440"
|
84 |
-
},
|
85 |
-
"truncated": 47,
|
86 |
-
"non_truncated": 3,
|
87 |
-
"padded": 34,
|
88 |
-
"non_padded": 16,
|
89 |
-
"num_truncated_few_shots": 0
|
90 |
-
}
|
91 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.13/aimo_kaggle_hard/results_2024-06-02T19-02-41.193644.json
DELETED
@@ -1,91 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"override_batch_size": 4,
|
6 |
-
"max_samples": null,
|
7 |
-
"job_id": "",
|
8 |
-
"start_time": 681239.591017876,
|
9 |
-
"end_time": 681639.172371238,
|
10 |
-
"total_evaluation_time_secondes": "399.5813533619512",
|
11 |
-
"model_name": "AI-MO/CodeLlama-13b-Python-hf-sft",
|
12 |
-
"model_sha": "543fbff304cd3d8870b73625c4df56d0bfc62625",
|
13 |
-
"model_dtype": "torch.bfloat16",
|
14 |
-
"model_size": "24.56 GB",
|
15 |
-
"config": null
|
16 |
-
},
|
17 |
-
"results": {
|
18 |
-
"custom|aimo_kaggle_hard:v0|0": {
|
19 |
-
"qem": 0.0,
|
20 |
-
"qem_stderr": 0.0
|
21 |
-
},
|
22 |
-
"all": {
|
23 |
-
"qem": 0.0,
|
24 |
-
"qem_stderr": 0.0
|
25 |
-
}
|
26 |
-
},
|
27 |
-
"versions": {
|
28 |
-
"custom|aimo_kaggle_hard:v0|0": 0
|
29 |
-
},
|
30 |
-
"config_tasks": {
|
31 |
-
"custom|aimo_kaggle_hard:v0": {
|
32 |
-
"name": "aimo_kaggle_hard:v0",
|
33 |
-
"prompt_function": "kaggle_hard_prompt_fn",
|
34 |
-
"hf_repo": "AI-MO/kaggle-validation-set-hard",
|
35 |
-
"hf_subset": "v0",
|
36 |
-
"metric": [
|
37 |
-
"quasi_exact_match_math"
|
38 |
-
],
|
39 |
-
"hf_avail_splits": [
|
40 |
-
"train"
|
41 |
-
],
|
42 |
-
"evaluation_splits": [
|
43 |
-
"train"
|
44 |
-
],
|
45 |
-
"few_shots_split": null,
|
46 |
-
"few_shots_select": null,
|
47 |
-
"generation_size": 2048,
|
48 |
-
"stop_sequence": null,
|
49 |
-
"output_regex": null,
|
50 |
-
"num_samples": null,
|
51 |
-
"frozen": false,
|
52 |
-
"suite": [
|
53 |
-
"custom"
|
54 |
-
],
|
55 |
-
"original_num_docs": 50,
|
56 |
-
"effective_num_docs": 50,
|
57 |
-
"trust_dataset": null,
|
58 |
-
"must_remove_duplicate_docs": null,
|
59 |
-
"version": 0
|
60 |
-
}
|
61 |
-
},
|
62 |
-
"summary_tasks": {
|
63 |
-
"custom|aimo_kaggle_hard:v0|0": {
|
64 |
-
"hashes": {
|
65 |
-
"hash_examples": "b40b6a493a95bf77",
|
66 |
-
"hash_full_prompts": "2c0f18269c13af34",
|
67 |
-
"hash_input_tokens": "b477f3b96ee9e7ba",
|
68 |
-
"hash_cont_tokens": "6070b3faf401b19a"
|
69 |
-
},
|
70 |
-
"truncated": 50,
|
71 |
-
"non_truncated": 0,
|
72 |
-
"padded": 37,
|
73 |
-
"non_padded": 13,
|
74 |
-
"effective_few_shots": 0.0,
|
75 |
-
"num_truncated_few_shots": 0
|
76 |
-
}
|
77 |
-
},
|
78 |
-
"summary_general": {
|
79 |
-
"hashes": {
|
80 |
-
"hash_examples": "79dbebcff6acad9e",
|
81 |
-
"hash_full_prompts": "739e6bdec3671fc6",
|
82 |
-
"hash_input_tokens": "16e2e8b5ca238d88",
|
83 |
-
"hash_cont_tokens": "85f58fce9bbc4ee6"
|
84 |
-
},
|
85 |
-
"truncated": 50,
|
86 |
-
"non_truncated": 0,
|
87 |
-
"padded": 37,
|
88 |
-
"non_padded": 13,
|
89 |
-
"num_truncated_few_shots": 0
|
90 |
-
}
|
91 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.13/aimo_kaggle_medium/results_2024-06-02T18-59-17.155994.json
DELETED
@@ -1,91 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"override_batch_size": 4,
|
6 |
-
"max_samples": null,
|
7 |
-
"job_id": "",
|
8 |
-
"start_time": 406614.728272359,
|
9 |
-
"end_time": 406877.13152056,
|
10 |
-
"total_evaluation_time_secondes": "262.40324820100795",
|
11 |
-
"model_name": "AI-MO/CodeLlama-13b-Python-hf-sft",
|
12 |
-
"model_sha": "543fbff304cd3d8870b73625c4df56d0bfc62625",
|
13 |
-
"model_dtype": "torch.bfloat16",
|
14 |
-
"model_size": "24.56 GB",
|
15 |
-
"config": null
|
16 |
-
},
|
17 |
-
"results": {
|
18 |
-
"custom|aimo_kaggle_medium:v0|0": {
|
19 |
-
"qem": 0.175,
|
20 |
-
"qem_stderr": 0.060843430844447564
|
21 |
-
},
|
22 |
-
"all": {
|
23 |
-
"qem": 0.175,
|
24 |
-
"qem_stderr": 0.060843430844447564
|
25 |
-
}
|
26 |
-
},
|
27 |
-
"versions": {
|
28 |
-
"custom|aimo_kaggle_medium:v0|0": 0
|
29 |
-
},
|
30 |
-
"config_tasks": {
|
31 |
-
"custom|aimo_kaggle_medium:v0": {
|
32 |
-
"name": "aimo_kaggle_medium:v0",
|
33 |
-
"prompt_function": "kaggle_medium_prompt_fn",
|
34 |
-
"hf_repo": "AI-MO/kaggle-validation-set-medium",
|
35 |
-
"hf_subset": "v0",
|
36 |
-
"metric": [
|
37 |
-
"quasi_exact_match_math"
|
38 |
-
],
|
39 |
-
"hf_avail_splits": [
|
40 |
-
"train"
|
41 |
-
],
|
42 |
-
"evaluation_splits": [
|
43 |
-
"train"
|
44 |
-
],
|
45 |
-
"few_shots_split": null,
|
46 |
-
"few_shots_select": null,
|
47 |
-
"generation_size": 2048,
|
48 |
-
"stop_sequence": null,
|
49 |
-
"output_regex": null,
|
50 |
-
"num_samples": null,
|
51 |
-
"frozen": false,
|
52 |
-
"suite": [
|
53 |
-
"custom"
|
54 |
-
],
|
55 |
-
"original_num_docs": 40,
|
56 |
-
"effective_num_docs": 40,
|
57 |
-
"trust_dataset": null,
|
58 |
-
"must_remove_duplicate_docs": null,
|
59 |
-
"version": 0
|
60 |
-
}
|
61 |
-
},
|
62 |
-
"summary_tasks": {
|
63 |
-
"custom|aimo_kaggle_medium:v0|0": {
|
64 |
-
"hashes": {
|
65 |
-
"hash_examples": "3401efda8b0cbcb5",
|
66 |
-
"hash_full_prompts": "14cb6646c78f810c",
|
67 |
-
"hash_input_tokens": "d1d016040eb15856",
|
68 |
-
"hash_cont_tokens": "26524b9433a76f5c"
|
69 |
-
},
|
70 |
-
"truncated": 40,
|
71 |
-
"non_truncated": 0,
|
72 |
-
"padded": 32,
|
73 |
-
"non_padded": 8,
|
74 |
-
"effective_few_shots": 0.0,
|
75 |
-
"num_truncated_few_shots": 0
|
76 |
-
}
|
77 |
-
},
|
78 |
-
"summary_general": {
|
79 |
-
"hashes": {
|
80 |
-
"hash_examples": "4c81d27cfdb9d737",
|
81 |
-
"hash_full_prompts": "0d4787c840fc98b7",
|
82 |
-
"hash_input_tokens": "7e52a8d7f3e15dcc",
|
83 |
-
"hash_cont_tokens": "dfb9879d03922d24"
|
84 |
-
},
|
85 |
-
"truncated": 40,
|
86 |
-
"non_truncated": 0,
|
87 |
-
"padded": 32,
|
88 |
-
"non_padded": 8,
|
89 |
-
"num_truncated_few_shots": 0
|
90 |
-
}
|
91 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.14/aimo_kaggle/results_2024-06-02T19-46-44.922456.json
DELETED
@@ -1,91 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"override_batch_size": 4,
|
6 |
-
"max_samples": null,
|
7 |
-
"job_id": "",
|
8 |
-
"start_time": 811758.65911061,
|
9 |
-
"end_time": 812083.962635848,
|
10 |
-
"total_evaluation_time_secondes": "325.30352523794863",
|
11 |
-
"model_name": "AI-MO/CodeLlama-13b-Python-hf-sft",
|
12 |
-
"model_sha": "a29e4a89008bd02e08db5ccc1ee78b2d6876156c",
|
13 |
-
"model_dtype": "torch.bfloat16",
|
14 |
-
"model_size": "24.56 GB",
|
15 |
-
"config": null
|
16 |
-
},
|
17 |
-
"results": {
|
18 |
-
"custom|aimo_kaggle:v0|0": {
|
19 |
-
"qem": 0.16,
|
20 |
-
"qem_stderr": 0.05237229365663814
|
21 |
-
},
|
22 |
-
"all": {
|
23 |
-
"qem": 0.16,
|
24 |
-
"qem_stderr": 0.05237229365663814
|
25 |
-
}
|
26 |
-
},
|
27 |
-
"versions": {
|
28 |
-
"custom|aimo_kaggle:v0|0": 0
|
29 |
-
},
|
30 |
-
"config_tasks": {
|
31 |
-
"custom|aimo_kaggle:v0": {
|
32 |
-
"name": "aimo_kaggle:v0",
|
33 |
-
"prompt_function": "kaggle_prompt_fn",
|
34 |
-
"hf_repo": "AI-MO/kaggle-validation-set",
|
35 |
-
"hf_subset": "v0",
|
36 |
-
"metric": [
|
37 |
-
"quasi_exact_match_math"
|
38 |
-
],
|
39 |
-
"hf_avail_splits": [
|
40 |
-
"train"
|
41 |
-
],
|
42 |
-
"evaluation_splits": [
|
43 |
-
"train"
|
44 |
-
],
|
45 |
-
"few_shots_split": null,
|
46 |
-
"few_shots_select": null,
|
47 |
-
"generation_size": 2048,
|
48 |
-
"stop_sequence": null,
|
49 |
-
"output_regex": null,
|
50 |
-
"num_samples": null,
|
51 |
-
"frozen": false,
|
52 |
-
"suite": [
|
53 |
-
"custom"
|
54 |
-
],
|
55 |
-
"original_num_docs": 50,
|
56 |
-
"effective_num_docs": 50,
|
57 |
-
"trust_dataset": null,
|
58 |
-
"must_remove_duplicate_docs": null,
|
59 |
-
"version": 0
|
60 |
-
}
|
61 |
-
},
|
62 |
-
"summary_tasks": {
|
63 |
-
"custom|aimo_kaggle:v0|0": {
|
64 |
-
"hashes": {
|
65 |
-
"hash_examples": "2a919e6b839e921a",
|
66 |
-
"hash_full_prompts": "e0424ade9e31a0fe",
|
67 |
-
"hash_input_tokens": "437c7d81b73a6e7a",
|
68 |
-
"hash_cont_tokens": "e7249b9ccabf4cbb"
|
69 |
-
},
|
70 |
-
"truncated": 47,
|
71 |
-
"non_truncated": 3,
|
72 |
-
"padded": 34,
|
73 |
-
"non_padded": 16,
|
74 |
-
"effective_few_shots": 0.0,
|
75 |
-
"num_truncated_few_shots": 0
|
76 |
-
}
|
77 |
-
},
|
78 |
-
"summary_general": {
|
79 |
-
"hashes": {
|
80 |
-
"hash_examples": "f59ec6c4e24c1226",
|
81 |
-
"hash_full_prompts": "35e1e25418ccb6f7",
|
82 |
-
"hash_input_tokens": "c227167cd34ee1ce",
|
83 |
-
"hash_cont_tokens": "a806236f114af8f4"
|
84 |
-
},
|
85 |
-
"truncated": 47,
|
86 |
-
"non_truncated": 3,
|
87 |
-
"padded": 34,
|
88 |
-
"non_padded": 16,
|
89 |
-
"num_truncated_few_shots": 0
|
90 |
-
}
|
91 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.14/aimo_kaggle_hard/results_2024-06-02T19-53-35.519132.json
DELETED
@@ -1,91 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"override_batch_size": 4,
|
6 |
-
"max_samples": null,
|
7 |
-
"job_id": "",
|
8 |
-
"start_time": 812134.32876156,
|
9 |
-
"end_time": 812494.559315884,
|
10 |
-
"total_evaluation_time_secondes": "360.2305543239927",
|
11 |
-
"model_name": "AI-MO/CodeLlama-13b-Python-hf-sft",
|
12 |
-
"model_sha": "a29e4a89008bd02e08db5ccc1ee78b2d6876156c",
|
13 |
-
"model_dtype": "torch.bfloat16",
|
14 |
-
"model_size": "24.56 GB",
|
15 |
-
"config": null
|
16 |
-
},
|
17 |
-
"results": {
|
18 |
-
"custom|aimo_kaggle_hard:v0|0": {
|
19 |
-
"qem": 0.02,
|
20 |
-
"qem_stderr": 0.01999999999999999
|
21 |
-
},
|
22 |
-
"all": {
|
23 |
-
"qem": 0.02,
|
24 |
-
"qem_stderr": 0.01999999999999999
|
25 |
-
}
|
26 |
-
},
|
27 |
-
"versions": {
|
28 |
-
"custom|aimo_kaggle_hard:v0|0": 0
|
29 |
-
},
|
30 |
-
"config_tasks": {
|
31 |
-
"custom|aimo_kaggle_hard:v0": {
|
32 |
-
"name": "aimo_kaggle_hard:v0",
|
33 |
-
"prompt_function": "kaggle_hard_prompt_fn",
|
34 |
-
"hf_repo": "AI-MO/kaggle-validation-set-hard",
|
35 |
-
"hf_subset": "v0",
|
36 |
-
"metric": [
|
37 |
-
"quasi_exact_match_math"
|
38 |
-
],
|
39 |
-
"hf_avail_splits": [
|
40 |
-
"train"
|
41 |
-
],
|
42 |
-
"evaluation_splits": [
|
43 |
-
"train"
|
44 |
-
],
|
45 |
-
"few_shots_split": null,
|
46 |
-
"few_shots_select": null,
|
47 |
-
"generation_size": 2048,
|
48 |
-
"stop_sequence": null,
|
49 |
-
"output_regex": null,
|
50 |
-
"num_samples": null,
|
51 |
-
"frozen": false,
|
52 |
-
"suite": [
|
53 |
-
"custom"
|
54 |
-
],
|
55 |
-
"original_num_docs": 50,
|
56 |
-
"effective_num_docs": 50,
|
57 |
-
"trust_dataset": null,
|
58 |
-
"must_remove_duplicate_docs": null,
|
59 |
-
"version": 0
|
60 |
-
}
|
61 |
-
},
|
62 |
-
"summary_tasks": {
|
63 |
-
"custom|aimo_kaggle_hard:v0|0": {
|
64 |
-
"hashes": {
|
65 |
-
"hash_examples": "b40b6a493a95bf77",
|
66 |
-
"hash_full_prompts": "2c0f18269c13af34",
|
67 |
-
"hash_input_tokens": "b477f3b96ee9e7ba",
|
68 |
-
"hash_cont_tokens": "5612d03fc385210d"
|
69 |
-
},
|
70 |
-
"truncated": 50,
|
71 |
-
"non_truncated": 0,
|
72 |
-
"padded": 37,
|
73 |
-
"non_padded": 13,
|
74 |
-
"effective_few_shots": 0.0,
|
75 |
-
"num_truncated_few_shots": 0
|
76 |
-
}
|
77 |
-
},
|
78 |
-
"summary_general": {
|
79 |
-
"hashes": {
|
80 |
-
"hash_examples": "79dbebcff6acad9e",
|
81 |
-
"hash_full_prompts": "739e6bdec3671fc6",
|
82 |
-
"hash_input_tokens": "16e2e8b5ca238d88",
|
83 |
-
"hash_cont_tokens": "fd9f7a9cdfcdbe06"
|
84 |
-
},
|
85 |
-
"truncated": 50,
|
86 |
-
"non_truncated": 0,
|
87 |
-
"padded": 37,
|
88 |
-
"non_padded": 13,
|
89 |
-
"num_truncated_few_shots": 0
|
90 |
-
}
|
91 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.14/aimo_kaggle_medium/results_2024-06-02T19-47-54.814443.json
DELETED
@@ -1,91 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"override_batch_size": 4,
|
6 |
-
"max_samples": null,
|
7 |
-
"job_id": "",
|
8 |
-
"start_time": 409522.916265881,
|
9 |
-
"end_time": 409794.789955631,
|
10 |
-
"total_evaluation_time_secondes": "271.8736897500348",
|
11 |
-
"model_name": "AI-MO/CodeLlama-13b-Python-hf-sft",
|
12 |
-
"model_sha": "a29e4a89008bd02e08db5ccc1ee78b2d6876156c",
|
13 |
-
"model_dtype": "torch.bfloat16",
|
14 |
-
"model_size": "24.56 GB",
|
15 |
-
"config": null
|
16 |
-
},
|
17 |
-
"results": {
|
18 |
-
"custom|aimo_kaggle_medium:v0|0": {
|
19 |
-
"qem": 0.1,
|
20 |
-
"qem_stderr": 0.04803844614152611
|
21 |
-
},
|
22 |
-
"all": {
|
23 |
-
"qem": 0.1,
|
24 |
-
"qem_stderr": 0.04803844614152611
|
25 |
-
}
|
26 |
-
},
|
27 |
-
"versions": {
|
28 |
-
"custom|aimo_kaggle_medium:v0|0": 0
|
29 |
-
},
|
30 |
-
"config_tasks": {
|
31 |
-
"custom|aimo_kaggle_medium:v0": {
|
32 |
-
"name": "aimo_kaggle_medium:v0",
|
33 |
-
"prompt_function": "kaggle_medium_prompt_fn",
|
34 |
-
"hf_repo": "AI-MO/kaggle-validation-set-medium",
|
35 |
-
"hf_subset": "v0",
|
36 |
-
"metric": [
|
37 |
-
"quasi_exact_match_math"
|
38 |
-
],
|
39 |
-
"hf_avail_splits": [
|
40 |
-
"train"
|
41 |
-
],
|
42 |
-
"evaluation_splits": [
|
43 |
-
"train"
|
44 |
-
],
|
45 |
-
"few_shots_split": null,
|
46 |
-
"few_shots_select": null,
|
47 |
-
"generation_size": 2048,
|
48 |
-
"stop_sequence": null,
|
49 |
-
"output_regex": null,
|
50 |
-
"num_samples": null,
|
51 |
-
"frozen": false,
|
52 |
-
"suite": [
|
53 |
-
"custom"
|
54 |
-
],
|
55 |
-
"original_num_docs": 40,
|
56 |
-
"effective_num_docs": 40,
|
57 |
-
"trust_dataset": null,
|
58 |
-
"must_remove_duplicate_docs": null,
|
59 |
-
"version": 0
|
60 |
-
}
|
61 |
-
},
|
62 |
-
"summary_tasks": {
|
63 |
-
"custom|aimo_kaggle_medium:v0|0": {
|
64 |
-
"hashes": {
|
65 |
-
"hash_examples": "3401efda8b0cbcb5",
|
66 |
-
"hash_full_prompts": "14cb6646c78f810c",
|
67 |
-
"hash_input_tokens": "d1d016040eb15856",
|
68 |
-
"hash_cont_tokens": "cc1bb393cf773014"
|
69 |
-
},
|
70 |
-
"truncated": 40,
|
71 |
-
"non_truncated": 0,
|
72 |
-
"padded": 32,
|
73 |
-
"non_padded": 8,
|
74 |
-
"effective_few_shots": 0.0,
|
75 |
-
"num_truncated_few_shots": 0
|
76 |
-
}
|
77 |
-
},
|
78 |
-
"summary_general": {
|
79 |
-
"hashes": {
|
80 |
-
"hash_examples": "4c81d27cfdb9d737",
|
81 |
-
"hash_full_prompts": "0d4787c840fc98b7",
|
82 |
-
"hash_input_tokens": "7e52a8d7f3e15dcc",
|
83 |
-
"hash_cont_tokens": "9562fbfdfdcab3c5"
|
84 |
-
},
|
85 |
-
"truncated": 40,
|
86 |
-
"non_truncated": 0,
|
87 |
-
"padded": 32,
|
88 |
-
"non_padded": 8,
|
89 |
-
"num_truncated_few_shots": 0
|
90 |
-
}
|
91 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.16/aimo_kaggle/results_2024-06-01T04-42-22.110150.json
DELETED
@@ -1,91 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"override_batch_size": 4,
|
6 |
-
"max_samples": null,
|
7 |
-
"job_id": "",
|
8 |
-
"start_time": 7331438.041597982,
|
9 |
-
"end_time": 7331896.576992026,
|
10 |
-
"total_evaluation_time_secondes": "458.53539404366165",
|
11 |
-
"model_name": "AI-MO/CodeLlama-13b-Python-hf-sft",
|
12 |
-
"model_sha": "2c45107722078c8a46415bef66119a706c9faa79",
|
13 |
-
"model_dtype": "torch.bfloat16",
|
14 |
-
"model_size": "24.56 GB",
|
15 |
-
"config": null
|
16 |
-
},
|
17 |
-
"results": {
|
18 |
-
"custom|aimo_kaggle:v0|0": {
|
19 |
-
"qem": 0.18,
|
20 |
-
"qem_stderr": 0.054883922035138706
|
21 |
-
},
|
22 |
-
"all": {
|
23 |
-
"qem": 0.18,
|
24 |
-
"qem_stderr": 0.054883922035138706
|
25 |
-
}
|
26 |
-
},
|
27 |
-
"versions": {
|
28 |
-
"custom|aimo_kaggle:v0|0": 0
|
29 |
-
},
|
30 |
-
"config_tasks": {
|
31 |
-
"custom|aimo_kaggle:v0": {
|
32 |
-
"name": "aimo_kaggle:v0",
|
33 |
-
"prompt_function": "kaggle_prompt_fn",
|
34 |
-
"hf_repo": "AI-MO/kaggle-validation-set",
|
35 |
-
"hf_subset": "v0",
|
36 |
-
"metric": [
|
37 |
-
"quasi_exact_match_math"
|
38 |
-
],
|
39 |
-
"hf_avail_splits": [
|
40 |
-
"train"
|
41 |
-
],
|
42 |
-
"evaluation_splits": [
|
43 |
-
"train"
|
44 |
-
],
|
45 |
-
"few_shots_split": null,
|
46 |
-
"few_shots_select": null,
|
47 |
-
"generation_size": 2048,
|
48 |
-
"stop_sequence": null,
|
49 |
-
"output_regex": null,
|
50 |
-
"num_samples": null,
|
51 |
-
"frozen": false,
|
52 |
-
"suite": [
|
53 |
-
"custom"
|
54 |
-
],
|
55 |
-
"original_num_docs": 50,
|
56 |
-
"effective_num_docs": 50,
|
57 |
-
"trust_dataset": null,
|
58 |
-
"must_remove_duplicate_docs": null,
|
59 |
-
"version": 0
|
60 |
-
}
|
61 |
-
},
|
62 |
-
"summary_tasks": {
|
63 |
-
"custom|aimo_kaggle:v0|0": {
|
64 |
-
"hashes": {
|
65 |
-
"hash_examples": "2a919e6b839e921a",
|
66 |
-
"hash_full_prompts": "e0424ade9e31a0fe",
|
67 |
-
"hash_input_tokens": "437c7d81b73a6e7a",
|
68 |
-
"hash_cont_tokens": "bb851051e63db43e"
|
69 |
-
},
|
70 |
-
"truncated": 47,
|
71 |
-
"non_truncated": 3,
|
72 |
-
"padded": 34,
|
73 |
-
"non_padded": 16,
|
74 |
-
"effective_few_shots": 0.0,
|
75 |
-
"num_truncated_few_shots": 0
|
76 |
-
}
|
77 |
-
},
|
78 |
-
"summary_general": {
|
79 |
-
"hashes": {
|
80 |
-
"hash_examples": "f59ec6c4e24c1226",
|
81 |
-
"hash_full_prompts": "35e1e25418ccb6f7",
|
82 |
-
"hash_input_tokens": "c227167cd34ee1ce",
|
83 |
-
"hash_cont_tokens": "f4957475e545e721"
|
84 |
-
},
|
85 |
-
"truncated": 47,
|
86 |
-
"non_truncated": 3,
|
87 |
-
"padded": 34,
|
88 |
-
"non_padded": 16,
|
89 |
-
"num_truncated_few_shots": 0
|
90 |
-
}
|
91 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.16/aimo_kaggle_hard/results_2024-06-01T04-50-33.000483.json
DELETED
@@ -1,91 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"override_batch_size": 4,
|
6 |
-
"max_samples": null,
|
7 |
-
"job_id": "",
|
8 |
-
"start_time": 2627042.726022085,
|
9 |
-
"end_time": 2627480.073880186,
|
10 |
-
"total_evaluation_time_secondes": "437.34785810066387",
|
11 |
-
"model_name": "AI-MO/CodeLlama-13b-Python-hf-sft",
|
12 |
-
"model_sha": "2c45107722078c8a46415bef66119a706c9faa79",
|
13 |
-
"model_dtype": "torch.bfloat16",
|
14 |
-
"model_size": "24.56 GB",
|
15 |
-
"config": null
|
16 |
-
},
|
17 |
-
"results": {
|
18 |
-
"custom|aimo_kaggle_hard:v0|0": {
|
19 |
-
"qem": 0.0,
|
20 |
-
"qem_stderr": 0.0
|
21 |
-
},
|
22 |
-
"all": {
|
23 |
-
"qem": 0.0,
|
24 |
-
"qem_stderr": 0.0
|
25 |
-
}
|
26 |
-
},
|
27 |
-
"versions": {
|
28 |
-
"custom|aimo_kaggle_hard:v0|0": 0
|
29 |
-
},
|
30 |
-
"config_tasks": {
|
31 |
-
"custom|aimo_kaggle_hard:v0": {
|
32 |
-
"name": "aimo_kaggle_hard:v0",
|
33 |
-
"prompt_function": "kaggle_hard_prompt_fn",
|
34 |
-
"hf_repo": "AI-MO/kaggle-validation-set-hard",
|
35 |
-
"hf_subset": "v0",
|
36 |
-
"metric": [
|
37 |
-
"quasi_exact_match_math"
|
38 |
-
],
|
39 |
-
"hf_avail_splits": [
|
40 |
-
"train"
|
41 |
-
],
|
42 |
-
"evaluation_splits": [
|
43 |
-
"train"
|
44 |
-
],
|
45 |
-
"few_shots_split": null,
|
46 |
-
"few_shots_select": null,
|
47 |
-
"generation_size": 2048,
|
48 |
-
"stop_sequence": null,
|
49 |
-
"output_regex": null,
|
50 |
-
"num_samples": null,
|
51 |
-
"frozen": false,
|
52 |
-
"suite": [
|
53 |
-
"custom"
|
54 |
-
],
|
55 |
-
"original_num_docs": 50,
|
56 |
-
"effective_num_docs": 50,
|
57 |
-
"trust_dataset": null,
|
58 |
-
"must_remove_duplicate_docs": null,
|
59 |
-
"version": 0
|
60 |
-
}
|
61 |
-
},
|
62 |
-
"summary_tasks": {
|
63 |
-
"custom|aimo_kaggle_hard:v0|0": {
|
64 |
-
"hashes": {
|
65 |
-
"hash_examples": "b40b6a493a95bf77",
|
66 |
-
"hash_full_prompts": "2c0f18269c13af34",
|
67 |
-
"hash_input_tokens": "b477f3b96ee9e7ba",
|
68 |
-
"hash_cont_tokens": "38c1f1b38edace2d"
|
69 |
-
},
|
70 |
-
"truncated": 50,
|
71 |
-
"non_truncated": 0,
|
72 |
-
"padded": 37,
|
73 |
-
"non_padded": 13,
|
74 |
-
"effective_few_shots": 0.0,
|
75 |
-
"num_truncated_few_shots": 0
|
76 |
-
}
|
77 |
-
},
|
78 |
-
"summary_general": {
|
79 |
-
"hashes": {
|
80 |
-
"hash_examples": "79dbebcff6acad9e",
|
81 |
-
"hash_full_prompts": "739e6bdec3671fc6",
|
82 |
-
"hash_input_tokens": "16e2e8b5ca238d88",
|
83 |
-
"hash_cont_tokens": "7d20c45af46524f5"
|
84 |
-
},
|
85 |
-
"truncated": 50,
|
86 |
-
"non_truncated": 0,
|
87 |
-
"padded": 37,
|
88 |
-
"non_padded": 13,
|
89 |
-
"num_truncated_few_shots": 0
|
90 |
-
}
|
91 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.16/aimo_kaggle_medium/results_2024-06-01T04-44-46.129536.json
DELETED
@@ -1,91 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"override_batch_size": 4,
|
6 |
-
"max_samples": null,
|
7 |
-
"job_id": "",
|
8 |
-
"start_time": 2435.203182629,
|
9 |
-
"end_time": 2801.204239883,
|
10 |
-
"total_evaluation_time_secondes": "366.001057254",
|
11 |
-
"model_name": "AI-MO/CodeLlama-13b-Python-hf-sft",
|
12 |
-
"model_sha": "2c45107722078c8a46415bef66119a706c9faa79",
|
13 |
-
"model_dtype": "torch.bfloat16",
|
14 |
-
"model_size": "24.56 GB",
|
15 |
-
"config": null
|
16 |
-
},
|
17 |
-
"results": {
|
18 |
-
"custom|aimo_kaggle_medium:v0|0": {
|
19 |
-
"qem": 0.1,
|
20 |
-
"qem_stderr": 0.04803844614152612
|
21 |
-
},
|
22 |
-
"all": {
|
23 |
-
"qem": 0.1,
|
24 |
-
"qem_stderr": 0.04803844614152612
|
25 |
-
}
|
26 |
-
},
|
27 |
-
"versions": {
|
28 |
-
"custom|aimo_kaggle_medium:v0|0": 0
|
29 |
-
},
|
30 |
-
"config_tasks": {
|
31 |
-
"custom|aimo_kaggle_medium:v0": {
|
32 |
-
"name": "aimo_kaggle_medium:v0",
|
33 |
-
"prompt_function": "kaggle_medium_prompt_fn",
|
34 |
-
"hf_repo": "AI-MO/kaggle-validation-set-medium",
|
35 |
-
"hf_subset": "v0",
|
36 |
-
"metric": [
|
37 |
-
"quasi_exact_match_math"
|
38 |
-
],
|
39 |
-
"hf_avail_splits": [
|
40 |
-
"train"
|
41 |
-
],
|
42 |
-
"evaluation_splits": [
|
43 |
-
"train"
|
44 |
-
],
|
45 |
-
"few_shots_split": null,
|
46 |
-
"few_shots_select": null,
|
47 |
-
"generation_size": 2048,
|
48 |
-
"stop_sequence": null,
|
49 |
-
"output_regex": null,
|
50 |
-
"num_samples": null,
|
51 |
-
"frozen": false,
|
52 |
-
"suite": [
|
53 |
-
"custom"
|
54 |
-
],
|
55 |
-
"original_num_docs": 40,
|
56 |
-
"effective_num_docs": 40,
|
57 |
-
"trust_dataset": null,
|
58 |
-
"must_remove_duplicate_docs": null,
|
59 |
-
"version": 0
|
60 |
-
}
|
61 |
-
},
|
62 |
-
"summary_tasks": {
|
63 |
-
"custom|aimo_kaggle_medium:v0|0": {
|
64 |
-
"hashes": {
|
65 |
-
"hash_examples": "3401efda8b0cbcb5",
|
66 |
-
"hash_full_prompts": "14cb6646c78f810c",
|
67 |
-
"hash_input_tokens": "d1d016040eb15856",
|
68 |
-
"hash_cont_tokens": "001ea5cfe3a48cb3"
|
69 |
-
},
|
70 |
-
"truncated": 40,
|
71 |
-
"non_truncated": 0,
|
72 |
-
"padded": 32,
|
73 |
-
"non_padded": 8,
|
74 |
-
"effective_few_shots": 0.0,
|
75 |
-
"num_truncated_few_shots": 0
|
76 |
-
}
|
77 |
-
},
|
78 |
-
"summary_general": {
|
79 |
-
"hashes": {
|
80 |
-
"hash_examples": "4c81d27cfdb9d737",
|
81 |
-
"hash_full_prompts": "0d4787c840fc98b7",
|
82 |
-
"hash_input_tokens": "7e52a8d7f3e15dcc",
|
83 |
-
"hash_cont_tokens": "0e69fba7a84b68a4"
|
84 |
-
},
|
85 |
-
"truncated": 40,
|
86 |
-
"non_truncated": 0,
|
87 |
-
"padded": 32,
|
88 |
-
"non_padded": 8,
|
89 |
-
"num_truncated_few_shots": 0
|
90 |
-
}
|
91 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.17/aimo_kaggle/results_2024-06-01T05-55-00.892964.json
DELETED
@@ -1,91 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"override_batch_size": 4,
|
6 |
-
"max_samples": null,
|
7 |
-
"job_id": "",
|
8 |
-
"start_time": 364786.344403821,
|
9 |
-
"end_time": 365283.098496925,
|
10 |
-
"total_evaluation_time_secondes": "496.7540931040421",
|
11 |
-
"model_name": "AI-MO/CodeLlama-13b-Python-hf-sft",
|
12 |
-
"model_sha": "f22517dd2168b7ff4f7cfe3e1d60f70c344649a6",
|
13 |
-
"model_dtype": "torch.bfloat16",
|
14 |
-
"model_size": "24.56 GB",
|
15 |
-
"config": null
|
16 |
-
},
|
17 |
-
"results": {
|
18 |
-
"custom|aimo_kaggle:v0|0": {
|
19 |
-
"qem": 0.14,
|
20 |
-
"qem_stderr": 0.0495695759225642
|
21 |
-
},
|
22 |
-
"all": {
|
23 |
-
"qem": 0.14,
|
24 |
-
"qem_stderr": 0.0495695759225642
|
25 |
-
}
|
26 |
-
},
|
27 |
-
"versions": {
|
28 |
-
"custom|aimo_kaggle:v0|0": 0
|
29 |
-
},
|
30 |
-
"config_tasks": {
|
31 |
-
"custom|aimo_kaggle:v0": {
|
32 |
-
"name": "aimo_kaggle:v0",
|
33 |
-
"prompt_function": "kaggle_prompt_fn",
|
34 |
-
"hf_repo": "AI-MO/kaggle-validation-set",
|
35 |
-
"hf_subset": "v0",
|
36 |
-
"metric": [
|
37 |
-
"quasi_exact_match_math"
|
38 |
-
],
|
39 |
-
"hf_avail_splits": [
|
40 |
-
"train"
|
41 |
-
],
|
42 |
-
"evaluation_splits": [
|
43 |
-
"train"
|
44 |
-
],
|
45 |
-
"few_shots_split": null,
|
46 |
-
"few_shots_select": null,
|
47 |
-
"generation_size": 2048,
|
48 |
-
"stop_sequence": null,
|
49 |
-
"output_regex": null,
|
50 |
-
"num_samples": null,
|
51 |
-
"frozen": false,
|
52 |
-
"suite": [
|
53 |
-
"custom"
|
54 |
-
],
|
55 |
-
"original_num_docs": 50,
|
56 |
-
"effective_num_docs": 50,
|
57 |
-
"trust_dataset": null,
|
58 |
-
"must_remove_duplicate_docs": null,
|
59 |
-
"version": 0
|
60 |
-
}
|
61 |
-
},
|
62 |
-
"summary_tasks": {
|
63 |
-
"custom|aimo_kaggle:v0|0": {
|
64 |
-
"hashes": {
|
65 |
-
"hash_examples": "2a919e6b839e921a",
|
66 |
-
"hash_full_prompts": "e0424ade9e31a0fe",
|
67 |
-
"hash_input_tokens": "437c7d81b73a6e7a",
|
68 |
-
"hash_cont_tokens": "51505d8a5b5d26ac"
|
69 |
-
},
|
70 |
-
"truncated": 47,
|
71 |
-
"non_truncated": 3,
|
72 |
-
"padded": 34,
|
73 |
-
"non_padded": 16,
|
74 |
-
"effective_few_shots": 0.0,
|
75 |
-
"num_truncated_few_shots": 0
|
76 |
-
}
|
77 |
-
},
|
78 |
-
"summary_general": {
|
79 |
-
"hashes": {
|
80 |
-
"hash_examples": "f59ec6c4e24c1226",
|
81 |
-
"hash_full_prompts": "35e1e25418ccb6f7",
|
82 |
-
"hash_input_tokens": "c227167cd34ee1ce",
|
83 |
-
"hash_cont_tokens": "06c20ce03294da85"
|
84 |
-
},
|
85 |
-
"truncated": 47,
|
86 |
-
"non_truncated": 3,
|
87 |
-
"padded": 34,
|
88 |
-
"non_padded": 16,
|
89 |
-
"num_truncated_few_shots": 0
|
90 |
-
}
|
91 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.17/aimo_kaggle_hard/results_2024-06-01T05-54-49.725003.json
DELETED
@@ -1,91 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"override_batch_size": 4,
|
6 |
-
"max_samples": null,
|
7 |
-
"job_id": "",
|
8 |
-
"start_time": 513317.423479873,
|
9 |
-
"end_time": 513769.887346333,
|
10 |
-
"total_evaluation_time_secondes": "452.46386646002065",
|
11 |
-
"model_name": "AI-MO/CodeLlama-13b-Python-hf-sft",
|
12 |
-
"model_sha": "f22517dd2168b7ff4f7cfe3e1d60f70c344649a6",
|
13 |
-
"model_dtype": "torch.bfloat16",
|
14 |
-
"model_size": "24.56 GB",
|
15 |
-
"config": null
|
16 |
-
},
|
17 |
-
"results": {
|
18 |
-
"custom|aimo_kaggle_hard:v0|0": {
|
19 |
-
"qem": 0.0,
|
20 |
-
"qem_stderr": 0.0
|
21 |
-
},
|
22 |
-
"all": {
|
23 |
-
"qem": 0.0,
|
24 |
-
"qem_stderr": 0.0
|
25 |
-
}
|
26 |
-
},
|
27 |
-
"versions": {
|
28 |
-
"custom|aimo_kaggle_hard:v0|0": 0
|
29 |
-
},
|
30 |
-
"config_tasks": {
|
31 |
-
"custom|aimo_kaggle_hard:v0": {
|
32 |
-
"name": "aimo_kaggle_hard:v0",
|
33 |
-
"prompt_function": "kaggle_hard_prompt_fn",
|
34 |
-
"hf_repo": "AI-MO/kaggle-validation-set-hard",
|
35 |
-
"hf_subset": "v0",
|
36 |
-
"metric": [
|
37 |
-
"quasi_exact_match_math"
|
38 |
-
],
|
39 |
-
"hf_avail_splits": [
|
40 |
-
"train"
|
41 |
-
],
|
42 |
-
"evaluation_splits": [
|
43 |
-
"train"
|
44 |
-
],
|
45 |
-
"few_shots_split": null,
|
46 |
-
"few_shots_select": null,
|
47 |
-
"generation_size": 2048,
|
48 |
-
"stop_sequence": null,
|
49 |
-
"output_regex": null,
|
50 |
-
"num_samples": null,
|
51 |
-
"frozen": false,
|
52 |
-
"suite": [
|
53 |
-
"custom"
|
54 |
-
],
|
55 |
-
"original_num_docs": 50,
|
56 |
-
"effective_num_docs": 50,
|
57 |
-
"trust_dataset": null,
|
58 |
-
"must_remove_duplicate_docs": null,
|
59 |
-
"version": 0
|
60 |
-
}
|
61 |
-
},
|
62 |
-
"summary_tasks": {
|
63 |
-
"custom|aimo_kaggle_hard:v0|0": {
|
64 |
-
"hashes": {
|
65 |
-
"hash_examples": "b40b6a493a95bf77",
|
66 |
-
"hash_full_prompts": "2c0f18269c13af34",
|
67 |
-
"hash_input_tokens": "b477f3b96ee9e7ba",
|
68 |
-
"hash_cont_tokens": "4fefad91d499245e"
|
69 |
-
},
|
70 |
-
"truncated": 50,
|
71 |
-
"non_truncated": 0,
|
72 |
-
"padded": 37,
|
73 |
-
"non_padded": 13,
|
74 |
-
"effective_few_shots": 0.0,
|
75 |
-
"num_truncated_few_shots": 0
|
76 |
-
}
|
77 |
-
},
|
78 |
-
"summary_general": {
|
79 |
-
"hashes": {
|
80 |
-
"hash_examples": "79dbebcff6acad9e",
|
81 |
-
"hash_full_prompts": "739e6bdec3671fc6",
|
82 |
-
"hash_input_tokens": "16e2e8b5ca238d88",
|
83 |
-
"hash_cont_tokens": "01b8e231916449d9"
|
84 |
-
},
|
85 |
-
"truncated": 50,
|
86 |
-
"non_truncated": 0,
|
87 |
-
"padded": 37,
|
88 |
-
"non_padded": 13,
|
89 |
-
"num_truncated_few_shots": 0
|
90 |
-
}
|
91 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.17/aimo_kaggle_medium/results_2024-06-01T05-53-12.777129.json
DELETED
@@ -1,91 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"override_batch_size": 4,
|
6 |
-
"max_samples": null,
|
7 |
-
"job_id": "",
|
8 |
-
"start_time": 383692.500491469,
|
9 |
-
"end_time": 384068.635694825,
|
10 |
-
"total_evaluation_time_secondes": "376.1352033559815",
|
11 |
-
"model_name": "AI-MO/CodeLlama-13b-Python-hf-sft",
|
12 |
-
"model_sha": "f22517dd2168b7ff4f7cfe3e1d60f70c344649a6",
|
13 |
-
"model_dtype": "torch.bfloat16",
|
14 |
-
"model_size": "24.56 GB",
|
15 |
-
"config": null
|
16 |
-
},
|
17 |
-
"results": {
|
18 |
-
"custom|aimo_kaggle_medium:v0|0": {
|
19 |
-
"qem": 0.175,
|
20 |
-
"qem_stderr": 0.060843430844447564
|
21 |
-
},
|
22 |
-
"all": {
|
23 |
-
"qem": 0.175,
|
24 |
-
"qem_stderr": 0.060843430844447564
|
25 |
-
}
|
26 |
-
},
|
27 |
-
"versions": {
|
28 |
-
"custom|aimo_kaggle_medium:v0|0": 0
|
29 |
-
},
|
30 |
-
"config_tasks": {
|
31 |
-
"custom|aimo_kaggle_medium:v0": {
|
32 |
-
"name": "aimo_kaggle_medium:v0",
|
33 |
-
"prompt_function": "kaggle_medium_prompt_fn",
|
34 |
-
"hf_repo": "AI-MO/kaggle-validation-set-medium",
|
35 |
-
"hf_subset": "v0",
|
36 |
-
"metric": [
|
37 |
-
"quasi_exact_match_math"
|
38 |
-
],
|
39 |
-
"hf_avail_splits": [
|
40 |
-
"train"
|
41 |
-
],
|
42 |
-
"evaluation_splits": [
|
43 |
-
"train"
|
44 |
-
],
|
45 |
-
"few_shots_split": null,
|
46 |
-
"few_shots_select": null,
|
47 |
-
"generation_size": 2048,
|
48 |
-
"stop_sequence": null,
|
49 |
-
"output_regex": null,
|
50 |
-
"num_samples": null,
|
51 |
-
"frozen": false,
|
52 |
-
"suite": [
|
53 |
-
"custom"
|
54 |
-
],
|
55 |
-
"original_num_docs": 40,
|
56 |
-
"effective_num_docs": 40,
|
57 |
-
"trust_dataset": null,
|
58 |
-
"must_remove_duplicate_docs": null,
|
59 |
-
"version": 0
|
60 |
-
}
|
61 |
-
},
|
62 |
-
"summary_tasks": {
|
63 |
-
"custom|aimo_kaggle_medium:v0|0": {
|
64 |
-
"hashes": {
|
65 |
-
"hash_examples": "3401efda8b0cbcb5",
|
66 |
-
"hash_full_prompts": "14cb6646c78f810c",
|
67 |
-
"hash_input_tokens": "d1d016040eb15856",
|
68 |
-
"hash_cont_tokens": "c4694693bd1eb39c"
|
69 |
-
},
|
70 |
-
"truncated": 40,
|
71 |
-
"non_truncated": 0,
|
72 |
-
"padded": 32,
|
73 |
-
"non_padded": 8,
|
74 |
-
"effective_few_shots": 0.0,
|
75 |
-
"num_truncated_few_shots": 0
|
76 |
-
}
|
77 |
-
},
|
78 |
-
"summary_general": {
|
79 |
-
"hashes": {
|
80 |
-
"hash_examples": "4c81d27cfdb9d737",
|
81 |
-
"hash_full_prompts": "0d4787c840fc98b7",
|
82 |
-
"hash_input_tokens": "7e52a8d7f3e15dcc",
|
83 |
-
"hash_cont_tokens": "8968ca60b87b47be"
|
84 |
-
},
|
85 |
-
"truncated": 40,
|
86 |
-
"non_truncated": 0,
|
87 |
-
"padded": 32,
|
88 |
-
"non_padded": 8,
|
89 |
-
"num_truncated_few_shots": 0
|
90 |
-
}
|
91 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.18/aimo_kaggle/results_2024-06-02T20-03-07.489443.json
DELETED
@@ -1,91 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"override_batch_size": 4,
|
6 |
-
"max_samples": null,
|
7 |
-
"job_id": "",
|
8 |
-
"start_time": 280800.730639582,
|
9 |
-
"end_time": 281095.450109517,
|
10 |
-
"total_evaluation_time_secondes": "294.71946993505117",
|
11 |
-
"model_name": "AI-MO/CodeLlama-13b-Python-hf-sft",
|
12 |
-
"model_sha": "0e90404f514275e6bb51f04091fed122fd736403",
|
13 |
-
"model_dtype": "torch.bfloat16",
|
14 |
-
"model_size": "24.56 GB",
|
15 |
-
"config": null
|
16 |
-
},
|
17 |
-
"results": {
|
18 |
-
"custom|aimo_kaggle:v0|0": {
|
19 |
-
"qem": 0.2,
|
20 |
-
"qem_stderr": 0.057142857142857155
|
21 |
-
},
|
22 |
-
"all": {
|
23 |
-
"qem": 0.2,
|
24 |
-
"qem_stderr": 0.057142857142857155
|
25 |
-
}
|
26 |
-
},
|
27 |
-
"versions": {
|
28 |
-
"custom|aimo_kaggle:v0|0": 0
|
29 |
-
},
|
30 |
-
"config_tasks": {
|
31 |
-
"custom|aimo_kaggle:v0": {
|
32 |
-
"name": "aimo_kaggle:v0",
|
33 |
-
"prompt_function": "kaggle_prompt_fn",
|
34 |
-
"hf_repo": "AI-MO/kaggle-validation-set",
|
35 |
-
"hf_subset": "v0",
|
36 |
-
"metric": [
|
37 |
-
"quasi_exact_match_math"
|
38 |
-
],
|
39 |
-
"hf_avail_splits": [
|
40 |
-
"train"
|
41 |
-
],
|
42 |
-
"evaluation_splits": [
|
43 |
-
"train"
|
44 |
-
],
|
45 |
-
"few_shots_split": null,
|
46 |
-
"few_shots_select": null,
|
47 |
-
"generation_size": 2048,
|
48 |
-
"stop_sequence": null,
|
49 |
-
"output_regex": null,
|
50 |
-
"num_samples": null,
|
51 |
-
"frozen": false,
|
52 |
-
"suite": [
|
53 |
-
"custom"
|
54 |
-
],
|
55 |
-
"original_num_docs": 50,
|
56 |
-
"effective_num_docs": 50,
|
57 |
-
"trust_dataset": null,
|
58 |
-
"must_remove_duplicate_docs": null,
|
59 |
-
"version": 0
|
60 |
-
}
|
61 |
-
},
|
62 |
-
"summary_tasks": {
|
63 |
-
"custom|aimo_kaggle:v0|0": {
|
64 |
-
"hashes": {
|
65 |
-
"hash_examples": "2a919e6b839e921a",
|
66 |
-
"hash_full_prompts": "e0424ade9e31a0fe",
|
67 |
-
"hash_input_tokens": "437c7d81b73a6e7a",
|
68 |
-
"hash_cont_tokens": "85692fa117057e43"
|
69 |
-
},
|
70 |
-
"truncated": 47,
|
71 |
-
"non_truncated": 3,
|
72 |
-
"padded": 34,
|
73 |
-
"non_padded": 16,
|
74 |
-
"effective_few_shots": 0.0,
|
75 |
-
"num_truncated_few_shots": 0
|
76 |
-
}
|
77 |
-
},
|
78 |
-
"summary_general": {
|
79 |
-
"hashes": {
|
80 |
-
"hash_examples": "f59ec6c4e24c1226",
|
81 |
-
"hash_full_prompts": "35e1e25418ccb6f7",
|
82 |
-
"hash_input_tokens": "c227167cd34ee1ce",
|
83 |
-
"hash_cont_tokens": "05e8e5f051938395"
|
84 |
-
},
|
85 |
-
"truncated": 47,
|
86 |
-
"non_truncated": 3,
|
87 |
-
"padded": 34,
|
88 |
-
"non_padded": 16,
|
89 |
-
"num_truncated_few_shots": 0
|
90 |
-
}
|
91 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.18/aimo_kaggle_hard/results_2024-06-02T20-04-32.429315.json
DELETED
@@ -1,91 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"override_batch_size": 4,
|
6 |
-
"max_samples": null,
|
7 |
-
"job_id": "",
|
8 |
-
"start_time": 812862.374495077,
|
9 |
-
"end_time": 813151.469469919,
|
10 |
-
"total_evaluation_time_secondes": "289.09497484203894",
|
11 |
-
"model_name": "AI-MO/CodeLlama-13b-Python-hf-sft",
|
12 |
-
"model_sha": "0e90404f514275e6bb51f04091fed122fd736403",
|
13 |
-
"model_dtype": "torch.bfloat16",
|
14 |
-
"model_size": "24.56 GB",
|
15 |
-
"config": null
|
16 |
-
},
|
17 |
-
"results": {
|
18 |
-
"custom|aimo_kaggle_hard:v0|0": {
|
19 |
-
"qem": 0.0,
|
20 |
-
"qem_stderr": 0.0
|
21 |
-
},
|
22 |
-
"all": {
|
23 |
-
"qem": 0.0,
|
24 |
-
"qem_stderr": 0.0
|
25 |
-
}
|
26 |
-
},
|
27 |
-
"versions": {
|
28 |
-
"custom|aimo_kaggle_hard:v0|0": 0
|
29 |
-
},
|
30 |
-
"config_tasks": {
|
31 |
-
"custom|aimo_kaggle_hard:v0": {
|
32 |
-
"name": "aimo_kaggle_hard:v0",
|
33 |
-
"prompt_function": "kaggle_hard_prompt_fn",
|
34 |
-
"hf_repo": "AI-MO/kaggle-validation-set-hard",
|
35 |
-
"hf_subset": "v0",
|
36 |
-
"metric": [
|
37 |
-
"quasi_exact_match_math"
|
38 |
-
],
|
39 |
-
"hf_avail_splits": [
|
40 |
-
"train"
|
41 |
-
],
|
42 |
-
"evaluation_splits": [
|
43 |
-
"train"
|
44 |
-
],
|
45 |
-
"few_shots_split": null,
|
46 |
-
"few_shots_select": null,
|
47 |
-
"generation_size": 2048,
|
48 |
-
"stop_sequence": null,
|
49 |
-
"output_regex": null,
|
50 |
-
"num_samples": null,
|
51 |
-
"frozen": false,
|
52 |
-
"suite": [
|
53 |
-
"custom"
|
54 |
-
],
|
55 |
-
"original_num_docs": 50,
|
56 |
-
"effective_num_docs": 50,
|
57 |
-
"trust_dataset": null,
|
58 |
-
"must_remove_duplicate_docs": null,
|
59 |
-
"version": 0
|
60 |
-
}
|
61 |
-
},
|
62 |
-
"summary_tasks": {
|
63 |
-
"custom|aimo_kaggle_hard:v0|0": {
|
64 |
-
"hashes": {
|
65 |
-
"hash_examples": "b40b6a493a95bf77",
|
66 |
-
"hash_full_prompts": "2c0f18269c13af34",
|
67 |
-
"hash_input_tokens": "b477f3b96ee9e7ba",
|
68 |
-
"hash_cont_tokens": "7d75f5cff084d1f5"
|
69 |
-
},
|
70 |
-
"truncated": 50,
|
71 |
-
"non_truncated": 0,
|
72 |
-
"padded": 37,
|
73 |
-
"non_padded": 13,
|
74 |
-
"effective_few_shots": 0.0,
|
75 |
-
"num_truncated_few_shots": 0
|
76 |
-
}
|
77 |
-
},
|
78 |
-
"summary_general": {
|
79 |
-
"hashes": {
|
80 |
-
"hash_examples": "79dbebcff6acad9e",
|
81 |
-
"hash_full_prompts": "739e6bdec3671fc6",
|
82 |
-
"hash_input_tokens": "16e2e8b5ca238d88",
|
83 |
-
"hash_cont_tokens": "32adb50ac6b6a7ff"
|
84 |
-
},
|
85 |
-
"truncated": 50,
|
86 |
-
"non_truncated": 0,
|
87 |
-
"padded": 37,
|
88 |
-
"non_padded": 13,
|
89 |
-
"num_truncated_few_shots": 0
|
90 |
-
}
|
91 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.18/aimo_kaggle_medium/results_2024-06-02T20-03-59.774755.json
DELETED
@@ -1,91 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"override_batch_size": 4,
|
6 |
-
"max_samples": null,
|
7 |
-
"job_id": "",
|
8 |
-
"start_time": 12628.21789308,
|
9 |
-
"end_time": 12893.039860276,
|
10 |
-
"total_evaluation_time_secondes": "264.8219671959996",
|
11 |
-
"model_name": "AI-MO/CodeLlama-13b-Python-hf-sft",
|
12 |
-
"model_sha": "0e90404f514275e6bb51f04091fed122fd736403",
|
13 |
-
"model_dtype": "torch.bfloat16",
|
14 |
-
"model_size": "24.56 GB",
|
15 |
-
"config": null
|
16 |
-
},
|
17 |
-
"results": {
|
18 |
-
"custom|aimo_kaggle_medium:v0|0": {
|
19 |
-
"qem": 0.1,
|
20 |
-
"qem_stderr": 0.04803844614152612
|
21 |
-
},
|
22 |
-
"all": {
|
23 |
-
"qem": 0.1,
|
24 |
-
"qem_stderr": 0.04803844614152612
|
25 |
-
}
|
26 |
-
},
|
27 |
-
"versions": {
|
28 |
-
"custom|aimo_kaggle_medium:v0|0": 0
|
29 |
-
},
|
30 |
-
"config_tasks": {
|
31 |
-
"custom|aimo_kaggle_medium:v0": {
|
32 |
-
"name": "aimo_kaggle_medium:v0",
|
33 |
-
"prompt_function": "kaggle_medium_prompt_fn",
|
34 |
-
"hf_repo": "AI-MO/kaggle-validation-set-medium",
|
35 |
-
"hf_subset": "v0",
|
36 |
-
"metric": [
|
37 |
-
"quasi_exact_match_math"
|
38 |
-
],
|
39 |
-
"hf_avail_splits": [
|
40 |
-
"train"
|
41 |
-
],
|
42 |
-
"evaluation_splits": [
|
43 |
-
"train"
|
44 |
-
],
|
45 |
-
"few_shots_split": null,
|
46 |
-
"few_shots_select": null,
|
47 |
-
"generation_size": 2048,
|
48 |
-
"stop_sequence": null,
|
49 |
-
"output_regex": null,
|
50 |
-
"num_samples": null,
|
51 |
-
"frozen": false,
|
52 |
-
"suite": [
|
53 |
-
"custom"
|
54 |
-
],
|
55 |
-
"original_num_docs": 40,
|
56 |
-
"effective_num_docs": 40,
|
57 |
-
"trust_dataset": null,
|
58 |
-
"must_remove_duplicate_docs": null,
|
59 |
-
"version": 0
|
60 |
-
}
|
61 |
-
},
|
62 |
-
"summary_tasks": {
|
63 |
-
"custom|aimo_kaggle_medium:v0|0": {
|
64 |
-
"hashes": {
|
65 |
-
"hash_examples": "3401efda8b0cbcb5",
|
66 |
-
"hash_full_prompts": "14cb6646c78f810c",
|
67 |
-
"hash_input_tokens": "d1d016040eb15856",
|
68 |
-
"hash_cont_tokens": "f9f0938dc6f85f61"
|
69 |
-
},
|
70 |
-
"truncated": 40,
|
71 |
-
"non_truncated": 0,
|
72 |
-
"padded": 32,
|
73 |
-
"non_padded": 8,
|
74 |
-
"effective_few_shots": 0.0,
|
75 |
-
"num_truncated_few_shots": 0
|
76 |
-
}
|
77 |
-
},
|
78 |
-
"summary_general": {
|
79 |
-
"hashes": {
|
80 |
-
"hash_examples": "4c81d27cfdb9d737",
|
81 |
-
"hash_full_prompts": "0d4787c840fc98b7",
|
82 |
-
"hash_input_tokens": "7e52a8d7f3e15dcc",
|
83 |
-
"hash_cont_tokens": "c28efa0385adfa41"
|
84 |
-
},
|
85 |
-
"truncated": 40,
|
86 |
-
"non_truncated": 0,
|
87 |
-
"padded": 32,
|
88 |
-
"non_padded": 8,
|
89 |
-
"num_truncated_few_shots": 0
|
90 |
-
}
|
91 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.19/aimo_kaggle/results_2024-06-01T21-23-46.264185.json
DELETED
@@ -1,91 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"override_batch_size": 4,
|
6 |
-
"max_samples": null,
|
7 |
-
"job_id": "",
|
8 |
-
"start_time": 455783.289288206,
|
9 |
-
"end_time": 456110.027675772,
|
10 |
-
"total_evaluation_time_secondes": "326.7383875660016",
|
11 |
-
"model_name": "AI-MO/CodeLlama-13b-Python-hf-sft",
|
12 |
-
"model_sha": "8ec05aac45adfb5a9df7fabe87e4de5c1c4fa16d",
|
13 |
-
"model_dtype": "torch.bfloat16",
|
14 |
-
"model_size": "24.56 GB",
|
15 |
-
"config": null
|
16 |
-
},
|
17 |
-
"results": {
|
18 |
-
"custom|aimo_kaggle:v0|0": {
|
19 |
-
"qem": 0.16,
|
20 |
-
"qem_stderr": 0.052372293656638154
|
21 |
-
},
|
22 |
-
"all": {
|
23 |
-
"qem": 0.16,
|
24 |
-
"qem_stderr": 0.052372293656638154
|
25 |
-
}
|
26 |
-
},
|
27 |
-
"versions": {
|
28 |
-
"custom|aimo_kaggle:v0|0": 0
|
29 |
-
},
|
30 |
-
"config_tasks": {
|
31 |
-
"custom|aimo_kaggle:v0": {
|
32 |
-
"name": "aimo_kaggle:v0",
|
33 |
-
"prompt_function": "kaggle_prompt_fn",
|
34 |
-
"hf_repo": "AI-MO/kaggle-validation-set",
|
35 |
-
"hf_subset": "v0",
|
36 |
-
"metric": [
|
37 |
-
"quasi_exact_match_math"
|
38 |
-
],
|
39 |
-
"hf_avail_splits": [
|
40 |
-
"train"
|
41 |
-
],
|
42 |
-
"evaluation_splits": [
|
43 |
-
"train"
|
44 |
-
],
|
45 |
-
"few_shots_split": null,
|
46 |
-
"few_shots_select": null,
|
47 |
-
"generation_size": 2048,
|
48 |
-
"stop_sequence": null,
|
49 |
-
"output_regex": null,
|
50 |
-
"num_samples": null,
|
51 |
-
"frozen": false,
|
52 |
-
"suite": [
|
53 |
-
"custom"
|
54 |
-
],
|
55 |
-
"original_num_docs": 50,
|
56 |
-
"effective_num_docs": 50,
|
57 |
-
"trust_dataset": null,
|
58 |
-
"must_remove_duplicate_docs": null,
|
59 |
-
"version": 0
|
60 |
-
}
|
61 |
-
},
|
62 |
-
"summary_tasks": {
|
63 |
-
"custom|aimo_kaggle:v0|0": {
|
64 |
-
"hashes": {
|
65 |
-
"hash_examples": "2a919e6b839e921a",
|
66 |
-
"hash_full_prompts": "e0424ade9e31a0fe",
|
67 |
-
"hash_input_tokens": "437c7d81b73a6e7a",
|
68 |
-
"hash_cont_tokens": "05638f479f269c23"
|
69 |
-
},
|
70 |
-
"truncated": 47,
|
71 |
-
"non_truncated": 3,
|
72 |
-
"padded": 34,
|
73 |
-
"non_padded": 16,
|
74 |
-
"effective_few_shots": 0.0,
|
75 |
-
"num_truncated_few_shots": 0
|
76 |
-
}
|
77 |
-
},
|
78 |
-
"summary_general": {
|
79 |
-
"hashes": {
|
80 |
-
"hash_examples": "f59ec6c4e24c1226",
|
81 |
-
"hash_full_prompts": "35e1e25418ccb6f7",
|
82 |
-
"hash_input_tokens": "c227167cd34ee1ce",
|
83 |
-
"hash_cont_tokens": "5d00f919d8ec0d12"
|
84 |
-
},
|
85 |
-
"truncated": 47,
|
86 |
-
"non_truncated": 3,
|
87 |
-
"padded": 34,
|
88 |
-
"non_padded": 16,
|
89 |
-
"num_truncated_few_shots": 0
|
90 |
-
}
|
91 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.19/aimo_kaggle_hard/results_2024-06-01T21-26-21.659630.json
DELETED
@@ -1,91 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"override_batch_size": 4,
|
6 |
-
"max_samples": null,
|
7 |
-
"job_id": "",
|
8 |
-
"start_time": 1600669.946796879,
|
9 |
-
"end_time": 1601058.697372188,
|
10 |
-
"total_evaluation_time_secondes": "388.75057530915365",
|
11 |
-
"model_name": "AI-MO/CodeLlama-13b-Python-hf-sft",
|
12 |
-
"model_sha": "8ec05aac45adfb5a9df7fabe87e4de5c1c4fa16d",
|
13 |
-
"model_dtype": "torch.bfloat16",
|
14 |
-
"model_size": "24.56 GB",
|
15 |
-
"config": null
|
16 |
-
},
|
17 |
-
"results": {
|
18 |
-
"custom|aimo_kaggle_hard:v0|0": {
|
19 |
-
"qem": 0.0,
|
20 |
-
"qem_stderr": 0.0
|
21 |
-
},
|
22 |
-
"all": {
|
23 |
-
"qem": 0.0,
|
24 |
-
"qem_stderr": 0.0
|
25 |
-
}
|
26 |
-
},
|
27 |
-
"versions": {
|
28 |
-
"custom|aimo_kaggle_hard:v0|0": 0
|
29 |
-
},
|
30 |
-
"config_tasks": {
|
31 |
-
"custom|aimo_kaggle_hard:v0": {
|
32 |
-
"name": "aimo_kaggle_hard:v0",
|
33 |
-
"prompt_function": "kaggle_hard_prompt_fn",
|
34 |
-
"hf_repo": "AI-MO/kaggle-validation-set-hard",
|
35 |
-
"hf_subset": "v0",
|
36 |
-
"metric": [
|
37 |
-
"quasi_exact_match_math"
|
38 |
-
],
|
39 |
-
"hf_avail_splits": [
|
40 |
-
"train"
|
41 |
-
],
|
42 |
-
"evaluation_splits": [
|
43 |
-
"train"
|
44 |
-
],
|
45 |
-
"few_shots_split": null,
|
46 |
-
"few_shots_select": null,
|
47 |
-
"generation_size": 2048,
|
48 |
-
"stop_sequence": null,
|
49 |
-
"output_regex": null,
|
50 |
-
"num_samples": null,
|
51 |
-
"frozen": false,
|
52 |
-
"suite": [
|
53 |
-
"custom"
|
54 |
-
],
|
55 |
-
"original_num_docs": 50,
|
56 |
-
"effective_num_docs": 50,
|
57 |
-
"trust_dataset": null,
|
58 |
-
"must_remove_duplicate_docs": null,
|
59 |
-
"version": 0
|
60 |
-
}
|
61 |
-
},
|
62 |
-
"summary_tasks": {
|
63 |
-
"custom|aimo_kaggle_hard:v0|0": {
|
64 |
-
"hashes": {
|
65 |
-
"hash_examples": "b40b6a493a95bf77",
|
66 |
-
"hash_full_prompts": "2c0f18269c13af34",
|
67 |
-
"hash_input_tokens": "b477f3b96ee9e7ba",
|
68 |
-
"hash_cont_tokens": "81a96368049ca333"
|
69 |
-
},
|
70 |
-
"truncated": 50,
|
71 |
-
"non_truncated": 0,
|
72 |
-
"padded": 37,
|
73 |
-
"non_padded": 13,
|
74 |
-
"effective_few_shots": 0.0,
|
75 |
-
"num_truncated_few_shots": 0
|
76 |
-
}
|
77 |
-
},
|
78 |
-
"summary_general": {
|
79 |
-
"hashes": {
|
80 |
-
"hash_examples": "79dbebcff6acad9e",
|
81 |
-
"hash_full_prompts": "739e6bdec3671fc6",
|
82 |
-
"hash_input_tokens": "16e2e8b5ca238d88",
|
83 |
-
"hash_cont_tokens": "48b4acdb2c0f3cfb"
|
84 |
-
},
|
85 |
-
"truncated": 50,
|
86 |
-
"non_truncated": 0,
|
87 |
-
"padded": 37,
|
88 |
-
"non_padded": 13,
|
89 |
-
"num_truncated_few_shots": 0
|
90 |
-
}
|
91 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.19/aimo_kaggle_medium/results_2024-06-01T21-24-03.954253.json
DELETED
@@ -1,91 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"override_batch_size": 4,
|
6 |
-
"max_samples": null,
|
7 |
-
"job_id": "",
|
8 |
-
"start_time": 968414.597008877,
|
9 |
-
"end_time": 968705.871528091,
|
10 |
-
"total_evaluation_time_secondes": "291.27451921405736",
|
11 |
-
"model_name": "AI-MO/CodeLlama-13b-Python-hf-sft",
|
12 |
-
"model_sha": "8ec05aac45adfb5a9df7fabe87e4de5c1c4fa16d",
|
13 |
-
"model_dtype": "torch.bfloat16",
|
14 |
-
"model_size": "24.56 GB",
|
15 |
-
"config": null
|
16 |
-
},
|
17 |
-
"results": {
|
18 |
-
"custom|aimo_kaggle_medium:v0|0": {
|
19 |
-
"qem": 0.1,
|
20 |
-
"qem_stderr": 0.04803844614152612
|
21 |
-
},
|
22 |
-
"all": {
|
23 |
-
"qem": 0.1,
|
24 |
-
"qem_stderr": 0.04803844614152612
|
25 |
-
}
|
26 |
-
},
|
27 |
-
"versions": {
|
28 |
-
"custom|aimo_kaggle_medium:v0|0": 0
|
29 |
-
},
|
30 |
-
"config_tasks": {
|
31 |
-
"custom|aimo_kaggle_medium:v0": {
|
32 |
-
"name": "aimo_kaggle_medium:v0",
|
33 |
-
"prompt_function": "kaggle_medium_prompt_fn",
|
34 |
-
"hf_repo": "AI-MO/kaggle-validation-set-medium",
|
35 |
-
"hf_subset": "v0",
|
36 |
-
"metric": [
|
37 |
-
"quasi_exact_match_math"
|
38 |
-
],
|
39 |
-
"hf_avail_splits": [
|
40 |
-
"train"
|
41 |
-
],
|
42 |
-
"evaluation_splits": [
|
43 |
-
"train"
|
44 |
-
],
|
45 |
-
"few_shots_split": null,
|
46 |
-
"few_shots_select": null,
|
47 |
-
"generation_size": 2048,
|
48 |
-
"stop_sequence": null,
|
49 |
-
"output_regex": null,
|
50 |
-
"num_samples": null,
|
51 |
-
"frozen": false,
|
52 |
-
"suite": [
|
53 |
-
"custom"
|
54 |
-
],
|
55 |
-
"original_num_docs": 40,
|
56 |
-
"effective_num_docs": 40,
|
57 |
-
"trust_dataset": null,
|
58 |
-
"must_remove_duplicate_docs": null,
|
59 |
-
"version": 0
|
60 |
-
}
|
61 |
-
},
|
62 |
-
"summary_tasks": {
|
63 |
-
"custom|aimo_kaggle_medium:v0|0": {
|
64 |
-
"hashes": {
|
65 |
-
"hash_examples": "3401efda8b0cbcb5",
|
66 |
-
"hash_full_prompts": "14cb6646c78f810c",
|
67 |
-
"hash_input_tokens": "d1d016040eb15856",
|
68 |
-
"hash_cont_tokens": "6655a6d5b1dc0ae9"
|
69 |
-
},
|
70 |
-
"truncated": 40,
|
71 |
-
"non_truncated": 0,
|
72 |
-
"padded": 32,
|
73 |
-
"non_padded": 8,
|
74 |
-
"effective_few_shots": 0.0,
|
75 |
-
"num_truncated_few_shots": 0
|
76 |
-
}
|
77 |
-
},
|
78 |
-
"summary_general": {
|
79 |
-
"hashes": {
|
80 |
-
"hash_examples": "4c81d27cfdb9d737",
|
81 |
-
"hash_full_prompts": "0d4787c840fc98b7",
|
82 |
-
"hash_input_tokens": "7e52a8d7f3e15dcc",
|
83 |
-
"hash_cont_tokens": "853ddf635f0c19a1"
|
84 |
-
},
|
85 |
-
"truncated": 40,
|
86 |
-
"non_truncated": 0,
|
87 |
-
"padded": 32,
|
88 |
-
"non_padded": 8,
|
89 |
-
"num_truncated_few_shots": 0
|
90 |
-
}
|
91 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.20/aimo_kaggle/results_2024-06-01T05-35-51.309986.json
DELETED
@@ -1,91 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"override_batch_size": 4,
|
6 |
-
"max_samples": null,
|
7 |
-
"job_id": "",
|
8 |
-
"start_time": 512226.496990626,
|
9 |
-
"end_time": 512631.472327661,
|
10 |
-
"total_evaluation_time_secondes": "404.9753370350227",
|
11 |
-
"model_name": "AI-MO/CodeLlama-13b-Python-hf-sft",
|
12 |
-
"model_sha": "dd7593bfb7012234d702164f013ec23279f82a09",
|
13 |
-
"model_dtype": "torch.bfloat16",
|
14 |
-
"model_size": "24.56 GB",
|
15 |
-
"config": null
|
16 |
-
},
|
17 |
-
"results": {
|
18 |
-
"custom|aimo_kaggle:v0|0": {
|
19 |
-
"qem": 0.1,
|
20 |
-
"qem_stderr": 0.042857142857142844
|
21 |
-
},
|
22 |
-
"all": {
|
23 |
-
"qem": 0.1,
|
24 |
-
"qem_stderr": 0.042857142857142844
|
25 |
-
}
|
26 |
-
},
|
27 |
-
"versions": {
|
28 |
-
"custom|aimo_kaggle:v0|0": 0
|
29 |
-
},
|
30 |
-
"config_tasks": {
|
31 |
-
"custom|aimo_kaggle:v0": {
|
32 |
-
"name": "aimo_kaggle:v0",
|
33 |
-
"prompt_function": "kaggle_prompt_fn",
|
34 |
-
"hf_repo": "AI-MO/kaggle-validation-set",
|
35 |
-
"hf_subset": "v0",
|
36 |
-
"metric": [
|
37 |
-
"quasi_exact_match_math"
|
38 |
-
],
|
39 |
-
"hf_avail_splits": [
|
40 |
-
"train"
|
41 |
-
],
|
42 |
-
"evaluation_splits": [
|
43 |
-
"train"
|
44 |
-
],
|
45 |
-
"few_shots_split": null,
|
46 |
-
"few_shots_select": null,
|
47 |
-
"generation_size": 2048,
|
48 |
-
"stop_sequence": null,
|
49 |
-
"output_regex": null,
|
50 |
-
"num_samples": null,
|
51 |
-
"frozen": false,
|
52 |
-
"suite": [
|
53 |
-
"custom"
|
54 |
-
],
|
55 |
-
"original_num_docs": 50,
|
56 |
-
"effective_num_docs": 50,
|
57 |
-
"trust_dataset": null,
|
58 |
-
"must_remove_duplicate_docs": null,
|
59 |
-
"version": 0
|
60 |
-
}
|
61 |
-
},
|
62 |
-
"summary_tasks": {
|
63 |
-
"custom|aimo_kaggle:v0|0": {
|
64 |
-
"hashes": {
|
65 |
-
"hash_examples": "2a919e6b839e921a",
|
66 |
-
"hash_full_prompts": "e0424ade9e31a0fe",
|
67 |
-
"hash_input_tokens": "437c7d81b73a6e7a",
|
68 |
-
"hash_cont_tokens": "7e7a71073ccafef1"
|
69 |
-
},
|
70 |
-
"truncated": 47,
|
71 |
-
"non_truncated": 3,
|
72 |
-
"padded": 34,
|
73 |
-
"non_padded": 16,
|
74 |
-
"effective_few_shots": 0.0,
|
75 |
-
"num_truncated_few_shots": 0
|
76 |
-
}
|
77 |
-
},
|
78 |
-
"summary_general": {
|
79 |
-
"hashes": {
|
80 |
-
"hash_examples": "f59ec6c4e24c1226",
|
81 |
-
"hash_full_prompts": "35e1e25418ccb6f7",
|
82 |
-
"hash_input_tokens": "c227167cd34ee1ce",
|
83 |
-
"hash_cont_tokens": "7d094d3c18407805"
|
84 |
-
},
|
85 |
-
"truncated": 47,
|
86 |
-
"non_truncated": 3,
|
87 |
-
"padded": 34,
|
88 |
-
"non_padded": 16,
|
89 |
-
"num_truncated_few_shots": 0
|
90 |
-
}
|
91 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.20/aimo_kaggle_hard/results_2024-06-01T05-37-20.266435.json
DELETED
@@ -1,91 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"override_batch_size": 4,
|
6 |
-
"max_samples": null,
|
7 |
-
"job_id": "",
|
8 |
-
"start_time": 363736.36712355,
|
9 |
-
"end_time": 364222.472140237,
|
10 |
-
"total_evaluation_time_secondes": "486.1050166870118",
|
11 |
-
"model_name": "AI-MO/CodeLlama-13b-Python-hf-sft",
|
12 |
-
"model_sha": "dd7593bfb7012234d702164f013ec23279f82a09",
|
13 |
-
"model_dtype": "torch.bfloat16",
|
14 |
-
"model_size": "24.56 GB",
|
15 |
-
"config": null
|
16 |
-
},
|
17 |
-
"results": {
|
18 |
-
"custom|aimo_kaggle_hard:v0|0": {
|
19 |
-
"qem": 0.0,
|
20 |
-
"qem_stderr": 0.0
|
21 |
-
},
|
22 |
-
"all": {
|
23 |
-
"qem": 0.0,
|
24 |
-
"qem_stderr": 0.0
|
25 |
-
}
|
26 |
-
},
|
27 |
-
"versions": {
|
28 |
-
"custom|aimo_kaggle_hard:v0|0": 0
|
29 |
-
},
|
30 |
-
"config_tasks": {
|
31 |
-
"custom|aimo_kaggle_hard:v0": {
|
32 |
-
"name": "aimo_kaggle_hard:v0",
|
33 |
-
"prompt_function": "kaggle_hard_prompt_fn",
|
34 |
-
"hf_repo": "AI-MO/kaggle-validation-set-hard",
|
35 |
-
"hf_subset": "v0",
|
36 |
-
"metric": [
|
37 |
-
"quasi_exact_match_math"
|
38 |
-
],
|
39 |
-
"hf_avail_splits": [
|
40 |
-
"train"
|
41 |
-
],
|
42 |
-
"evaluation_splits": [
|
43 |
-
"train"
|
44 |
-
],
|
45 |
-
"few_shots_split": null,
|
46 |
-
"few_shots_select": null,
|
47 |
-
"generation_size": 2048,
|
48 |
-
"stop_sequence": null,
|
49 |
-
"output_regex": null,
|
50 |
-
"num_samples": null,
|
51 |
-
"frozen": false,
|
52 |
-
"suite": [
|
53 |
-
"custom"
|
54 |
-
],
|
55 |
-
"original_num_docs": 50,
|
56 |
-
"effective_num_docs": 50,
|
57 |
-
"trust_dataset": null,
|
58 |
-
"must_remove_duplicate_docs": null,
|
59 |
-
"version": 0
|
60 |
-
}
|
61 |
-
},
|
62 |
-
"summary_tasks": {
|
63 |
-
"custom|aimo_kaggle_hard:v0|0": {
|
64 |
-
"hashes": {
|
65 |
-
"hash_examples": "b40b6a493a95bf77",
|
66 |
-
"hash_full_prompts": "2c0f18269c13af34",
|
67 |
-
"hash_input_tokens": "b477f3b96ee9e7ba",
|
68 |
-
"hash_cont_tokens": "bbc5187940f6d8b5"
|
69 |
-
},
|
70 |
-
"truncated": 50,
|
71 |
-
"non_truncated": 0,
|
72 |
-
"padded": 37,
|
73 |
-
"non_padded": 13,
|
74 |
-
"effective_few_shots": 0.0,
|
75 |
-
"num_truncated_few_shots": 0
|
76 |
-
}
|
77 |
-
},
|
78 |
-
"summary_general": {
|
79 |
-
"hashes": {
|
80 |
-
"hash_examples": "79dbebcff6acad9e",
|
81 |
-
"hash_full_prompts": "739e6bdec3671fc6",
|
82 |
-
"hash_input_tokens": "16e2e8b5ca238d88",
|
83 |
-
"hash_cont_tokens": "4c4cb8f8a03ab820"
|
84 |
-
},
|
85 |
-
"truncated": 50,
|
86 |
-
"non_truncated": 0,
|
87 |
-
"padded": 37,
|
88 |
-
"non_padded": 13,
|
89 |
-
"num_truncated_few_shots": 0
|
90 |
-
}
|
91 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.20/aimo_kaggle_medium/results_2024-06-01T05-36-05.734157.json
DELETED
@@ -1,91 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"override_batch_size": 4,
|
6 |
-
"max_samples": null,
|
7 |
-
"job_id": "",
|
8 |
-
"start_time": 1358146.927236688,
|
9 |
-
"end_time": 1358579.924716579,
|
10 |
-
"total_evaluation_time_secondes": "432.99747989093885",
|
11 |
-
"model_name": "AI-MO/CodeLlama-13b-Python-hf-sft",
|
12 |
-
"model_sha": "dd7593bfb7012234d702164f013ec23279f82a09",
|
13 |
-
"model_dtype": "torch.bfloat16",
|
14 |
-
"model_size": "24.56 GB",
|
15 |
-
"config": null
|
16 |
-
},
|
17 |
-
"results": {
|
18 |
-
"custom|aimo_kaggle_medium:v0|0": {
|
19 |
-
"qem": 0.1,
|
20 |
-
"qem_stderr": 0.04803844614152612
|
21 |
-
},
|
22 |
-
"all": {
|
23 |
-
"qem": 0.1,
|
24 |
-
"qem_stderr": 0.04803844614152612
|
25 |
-
}
|
26 |
-
},
|
27 |
-
"versions": {
|
28 |
-
"custom|aimo_kaggle_medium:v0|0": 0
|
29 |
-
},
|
30 |
-
"config_tasks": {
|
31 |
-
"custom|aimo_kaggle_medium:v0": {
|
32 |
-
"name": "aimo_kaggle_medium:v0",
|
33 |
-
"prompt_function": "kaggle_medium_prompt_fn",
|
34 |
-
"hf_repo": "AI-MO/kaggle-validation-set-medium",
|
35 |
-
"hf_subset": "v0",
|
36 |
-
"metric": [
|
37 |
-
"quasi_exact_match_math"
|
38 |
-
],
|
39 |
-
"hf_avail_splits": [
|
40 |
-
"train"
|
41 |
-
],
|
42 |
-
"evaluation_splits": [
|
43 |
-
"train"
|
44 |
-
],
|
45 |
-
"few_shots_split": null,
|
46 |
-
"few_shots_select": null,
|
47 |
-
"generation_size": 2048,
|
48 |
-
"stop_sequence": null,
|
49 |
-
"output_regex": null,
|
50 |
-
"num_samples": null,
|
51 |
-
"frozen": false,
|
52 |
-
"suite": [
|
53 |
-
"custom"
|
54 |
-
],
|
55 |
-
"original_num_docs": 40,
|
56 |
-
"effective_num_docs": 40,
|
57 |
-
"trust_dataset": null,
|
58 |
-
"must_remove_duplicate_docs": null,
|
59 |
-
"version": 0
|
60 |
-
}
|
61 |
-
},
|
62 |
-
"summary_tasks": {
|
63 |
-
"custom|aimo_kaggle_medium:v0|0": {
|
64 |
-
"hashes": {
|
65 |
-
"hash_examples": "3401efda8b0cbcb5",
|
66 |
-
"hash_full_prompts": "14cb6646c78f810c",
|
67 |
-
"hash_input_tokens": "d1d016040eb15856",
|
68 |
-
"hash_cont_tokens": "3b39c49ddc8a8f83"
|
69 |
-
},
|
70 |
-
"truncated": 40,
|
71 |
-
"non_truncated": 0,
|
72 |
-
"padded": 32,
|
73 |
-
"non_padded": 8,
|
74 |
-
"effective_few_shots": 0.0,
|
75 |
-
"num_truncated_few_shots": 0
|
76 |
-
}
|
77 |
-
},
|
78 |
-
"summary_general": {
|
79 |
-
"hashes": {
|
80 |
-
"hash_examples": "4c81d27cfdb9d737",
|
81 |
-
"hash_full_prompts": "0d4787c840fc98b7",
|
82 |
-
"hash_input_tokens": "7e52a8d7f3e15dcc",
|
83 |
-
"hash_cont_tokens": "837e53a7921a9d8a"
|
84 |
-
},
|
85 |
-
"truncated": 40,
|
86 |
-
"non_truncated": 0,
|
87 |
-
"padded": 32,
|
88 |
-
"non_padded": 8,
|
89 |
-
"num_truncated_few_shots": 0
|
90 |
-
}
|
91 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.21/aimo_kaggle/results_2024-06-01T05-45-43.684415.json
DELETED
@@ -1,91 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"override_batch_size": 4,
|
6 |
-
"max_samples": null,
|
7 |
-
"job_id": "",
|
8 |
-
"start_time": 4458377.923526119,
|
9 |
-
"end_time": 4458785.620523886,
|
10 |
-
"total_evaluation_time_secondes": "407.696997766383",
|
11 |
-
"model_name": "AI-MO/CodeLlama-13b-Python-hf-sft",
|
12 |
-
"model_sha": "4e6a802bd21a15ad20ebf5bfb708cf43238d3ae9",
|
13 |
-
"model_dtype": "torch.bfloat16",
|
14 |
-
"model_size": "24.56 GB",
|
15 |
-
"config": null
|
16 |
-
},
|
17 |
-
"results": {
|
18 |
-
"custom|aimo_kaggle:v0|0": {
|
19 |
-
"qem": 0.24,
|
20 |
-
"qem_stderr": 0.0610118757258932
|
21 |
-
},
|
22 |
-
"all": {
|
23 |
-
"qem": 0.24,
|
24 |
-
"qem_stderr": 0.0610118757258932
|
25 |
-
}
|
26 |
-
},
|
27 |
-
"versions": {
|
28 |
-
"custom|aimo_kaggle:v0|0": 0
|
29 |
-
},
|
30 |
-
"config_tasks": {
|
31 |
-
"custom|aimo_kaggle:v0": {
|
32 |
-
"name": "aimo_kaggle:v0",
|
33 |
-
"prompt_function": "kaggle_prompt_fn",
|
34 |
-
"hf_repo": "AI-MO/kaggle-validation-set",
|
35 |
-
"hf_subset": "v0",
|
36 |
-
"metric": [
|
37 |
-
"quasi_exact_match_math"
|
38 |
-
],
|
39 |
-
"hf_avail_splits": [
|
40 |
-
"train"
|
41 |
-
],
|
42 |
-
"evaluation_splits": [
|
43 |
-
"train"
|
44 |
-
],
|
45 |
-
"few_shots_split": null,
|
46 |
-
"few_shots_select": null,
|
47 |
-
"generation_size": 2048,
|
48 |
-
"stop_sequence": null,
|
49 |
-
"output_regex": null,
|
50 |
-
"num_samples": null,
|
51 |
-
"frozen": false,
|
52 |
-
"suite": [
|
53 |
-
"custom"
|
54 |
-
],
|
55 |
-
"original_num_docs": 50,
|
56 |
-
"effective_num_docs": 50,
|
57 |
-
"trust_dataset": null,
|
58 |
-
"must_remove_duplicate_docs": null,
|
59 |
-
"version": 0
|
60 |
-
}
|
61 |
-
},
|
62 |
-
"summary_tasks": {
|
63 |
-
"custom|aimo_kaggle:v0|0": {
|
64 |
-
"hashes": {
|
65 |
-
"hash_examples": "2a919e6b839e921a",
|
66 |
-
"hash_full_prompts": "e0424ade9e31a0fe",
|
67 |
-
"hash_input_tokens": "437c7d81b73a6e7a",
|
68 |
-
"hash_cont_tokens": "98bba437c30d84b1"
|
69 |
-
},
|
70 |
-
"truncated": 47,
|
71 |
-
"non_truncated": 3,
|
72 |
-
"padded": 34,
|
73 |
-
"non_padded": 16,
|
74 |
-
"effective_few_shots": 0.0,
|
75 |
-
"num_truncated_few_shots": 0
|
76 |
-
}
|
77 |
-
},
|
78 |
-
"summary_general": {
|
79 |
-
"hashes": {
|
80 |
-
"hash_examples": "f59ec6c4e24c1226",
|
81 |
-
"hash_full_prompts": "35e1e25418ccb6f7",
|
82 |
-
"hash_input_tokens": "c227167cd34ee1ce",
|
83 |
-
"hash_cont_tokens": "d476b6dd1d0400d9"
|
84 |
-
},
|
85 |
-
"truncated": 47,
|
86 |
-
"non_truncated": 3,
|
87 |
-
"padded": 34,
|
88 |
-
"non_padded": 16,
|
89 |
-
"num_truncated_few_shots": 0
|
90 |
-
}
|
91 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.21/aimo_kaggle_hard/results_2024-06-01T05-46-32.901937.json
DELETED
@@ -1,91 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"override_batch_size": 4,
|
6 |
-
"max_samples": null,
|
7 |
-
"job_id": "",
|
8 |
-
"start_time": 512933.009355462,
|
9 |
-
"end_time": 513273.064260089,
|
10 |
-
"total_evaluation_time_secondes": "340.05490462703165",
|
11 |
-
"model_name": "AI-MO/CodeLlama-13b-Python-hf-sft",
|
12 |
-
"model_sha": "4e6a802bd21a15ad20ebf5bfb708cf43238d3ae9",
|
13 |
-
"model_dtype": "torch.bfloat16",
|
14 |
-
"model_size": "24.56 GB",
|
15 |
-
"config": null
|
16 |
-
},
|
17 |
-
"results": {
|
18 |
-
"custom|aimo_kaggle_hard:v0|0": {
|
19 |
-
"qem": 0.0,
|
20 |
-
"qem_stderr": 0.0
|
21 |
-
},
|
22 |
-
"all": {
|
23 |
-
"qem": 0.0,
|
24 |
-
"qem_stderr": 0.0
|
25 |
-
}
|
26 |
-
},
|
27 |
-
"versions": {
|
28 |
-
"custom|aimo_kaggle_hard:v0|0": 0
|
29 |
-
},
|
30 |
-
"config_tasks": {
|
31 |
-
"custom|aimo_kaggle_hard:v0": {
|
32 |
-
"name": "aimo_kaggle_hard:v0",
|
33 |
-
"prompt_function": "kaggle_hard_prompt_fn",
|
34 |
-
"hf_repo": "AI-MO/kaggle-validation-set-hard",
|
35 |
-
"hf_subset": "v0",
|
36 |
-
"metric": [
|
37 |
-
"quasi_exact_match_math"
|
38 |
-
],
|
39 |
-
"hf_avail_splits": [
|
40 |
-
"train"
|
41 |
-
],
|
42 |
-
"evaluation_splits": [
|
43 |
-
"train"
|
44 |
-
],
|
45 |
-
"few_shots_split": null,
|
46 |
-
"few_shots_select": null,
|
47 |
-
"generation_size": 2048,
|
48 |
-
"stop_sequence": null,
|
49 |
-
"output_regex": null,
|
50 |
-
"num_samples": null,
|
51 |
-
"frozen": false,
|
52 |
-
"suite": [
|
53 |
-
"custom"
|
54 |
-
],
|
55 |
-
"original_num_docs": 50,
|
56 |
-
"effective_num_docs": 50,
|
57 |
-
"trust_dataset": null,
|
58 |
-
"must_remove_duplicate_docs": null,
|
59 |
-
"version": 0
|
60 |
-
}
|
61 |
-
},
|
62 |
-
"summary_tasks": {
|
63 |
-
"custom|aimo_kaggle_hard:v0|0": {
|
64 |
-
"hashes": {
|
65 |
-
"hash_examples": "b40b6a493a95bf77",
|
66 |
-
"hash_full_prompts": "2c0f18269c13af34",
|
67 |
-
"hash_input_tokens": "b477f3b96ee9e7ba",
|
68 |
-
"hash_cont_tokens": "9d3d9bdbf7d51ea0"
|
69 |
-
},
|
70 |
-
"truncated": 50,
|
71 |
-
"non_truncated": 0,
|
72 |
-
"padded": 37,
|
73 |
-
"non_padded": 13,
|
74 |
-
"effective_few_shots": 0.0,
|
75 |
-
"num_truncated_few_shots": 0
|
76 |
-
}
|
77 |
-
},
|
78 |
-
"summary_general": {
|
79 |
-
"hashes": {
|
80 |
-
"hash_examples": "79dbebcff6acad9e",
|
81 |
-
"hash_full_prompts": "739e6bdec3671fc6",
|
82 |
-
"hash_input_tokens": "16e2e8b5ca238d88",
|
83 |
-
"hash_cont_tokens": "8bc399ff0429c9c8"
|
84 |
-
},
|
85 |
-
"truncated": 50,
|
86 |
-
"non_truncated": 0,
|
87 |
-
"padded": 37,
|
88 |
-
"non_padded": 13,
|
89 |
-
"num_truncated_few_shots": 0
|
90 |
-
}
|
91 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.21/aimo_kaggle_medium/results_2024-06-01T05-46-09.737288.json
DELETED
@@ -1,91 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"override_batch_size": 4,
|
6 |
-
"max_samples": null,
|
7 |
-
"job_id": "",
|
8 |
-
"start_time": 383326.713567976,
|
9 |
-
"end_time": 383645.595892471,
|
10 |
-
"total_evaluation_time_secondes": "318.88232449500356",
|
11 |
-
"model_name": "AI-MO/CodeLlama-13b-Python-hf-sft",
|
12 |
-
"model_sha": "4e6a802bd21a15ad20ebf5bfb708cf43238d3ae9",
|
13 |
-
"model_dtype": "torch.bfloat16",
|
14 |
-
"model_size": "24.56 GB",
|
15 |
-
"config": null
|
16 |
-
},
|
17 |
-
"results": {
|
18 |
-
"custom|aimo_kaggle_medium:v0|0": {
|
19 |
-
"qem": 0.175,
|
20 |
-
"qem_stderr": 0.06084343084444758
|
21 |
-
},
|
22 |
-
"all": {
|
23 |
-
"qem": 0.175,
|
24 |
-
"qem_stderr": 0.06084343084444758
|
25 |
-
}
|
26 |
-
},
|
27 |
-
"versions": {
|
28 |
-
"custom|aimo_kaggle_medium:v0|0": 0
|
29 |
-
},
|
30 |
-
"config_tasks": {
|
31 |
-
"custom|aimo_kaggle_medium:v0": {
|
32 |
-
"name": "aimo_kaggle_medium:v0",
|
33 |
-
"prompt_function": "kaggle_medium_prompt_fn",
|
34 |
-
"hf_repo": "AI-MO/kaggle-validation-set-medium",
|
35 |
-
"hf_subset": "v0",
|
36 |
-
"metric": [
|
37 |
-
"quasi_exact_match_math"
|
38 |
-
],
|
39 |
-
"hf_avail_splits": [
|
40 |
-
"train"
|
41 |
-
],
|
42 |
-
"evaluation_splits": [
|
43 |
-
"train"
|
44 |
-
],
|
45 |
-
"few_shots_split": null,
|
46 |
-
"few_shots_select": null,
|
47 |
-
"generation_size": 2048,
|
48 |
-
"stop_sequence": null,
|
49 |
-
"output_regex": null,
|
50 |
-
"num_samples": null,
|
51 |
-
"frozen": false,
|
52 |
-
"suite": [
|
53 |
-
"custom"
|
54 |
-
],
|
55 |
-
"original_num_docs": 40,
|
56 |
-
"effective_num_docs": 40,
|
57 |
-
"trust_dataset": null,
|
58 |
-
"must_remove_duplicate_docs": null,
|
59 |
-
"version": 0
|
60 |
-
}
|
61 |
-
},
|
62 |
-
"summary_tasks": {
|
63 |
-
"custom|aimo_kaggle_medium:v0|0": {
|
64 |
-
"hashes": {
|
65 |
-
"hash_examples": "3401efda8b0cbcb5",
|
66 |
-
"hash_full_prompts": "14cb6646c78f810c",
|
67 |
-
"hash_input_tokens": "d1d016040eb15856",
|
68 |
-
"hash_cont_tokens": "3770ddda90e0f72b"
|
69 |
-
},
|
70 |
-
"truncated": 40,
|
71 |
-
"non_truncated": 0,
|
72 |
-
"padded": 32,
|
73 |
-
"non_padded": 8,
|
74 |
-
"effective_few_shots": 0.0,
|
75 |
-
"num_truncated_few_shots": 0
|
76 |
-
}
|
77 |
-
},
|
78 |
-
"summary_general": {
|
79 |
-
"hashes": {
|
80 |
-
"hash_examples": "4c81d27cfdb9d737",
|
81 |
-
"hash_full_prompts": "0d4787c840fc98b7",
|
82 |
-
"hash_input_tokens": "7e52a8d7f3e15dcc",
|
83 |
-
"hash_cont_tokens": "f9ab46bead8a1de0"
|
84 |
-
},
|
85 |
-
"truncated": 40,
|
86 |
-
"non_truncated": 0,
|
87 |
-
"padded": 32,
|
88 |
-
"non_padded": 8,
|
89 |
-
"num_truncated_few_shots": 0
|
90 |
-
}
|
91 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.22/aimo_kaggle/results_2024-06-02T18-58-15.446870.json
DELETED
@@ -1,91 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"override_batch_size": 4,
|
6 |
-
"max_samples": null,
|
7 |
-
"job_id": "",
|
8 |
-
"start_time": 3825.7584337,
|
9 |
-
"end_time": 4264.485878867,
|
10 |
-
"total_evaluation_time_secondes": "438.72744516700004",
|
11 |
-
"model_name": "AI-MO/CodeLlama-13b-Python-hf-sft",
|
12 |
-
"model_sha": "811ca415e400b134946a0dbd631c76052d9c3209",
|
13 |
-
"model_dtype": "torch.bfloat16",
|
14 |
-
"model_size": "24.56 GB",
|
15 |
-
"config": null
|
16 |
-
},
|
17 |
-
"results": {
|
18 |
-
"custom|aimo_kaggle:v0|0": {
|
19 |
-
"qem": 0.22,
|
20 |
-
"qem_stderr": 0.05917804336345138
|
21 |
-
},
|
22 |
-
"all": {
|
23 |
-
"qem": 0.22,
|
24 |
-
"qem_stderr": 0.05917804336345138
|
25 |
-
}
|
26 |
-
},
|
27 |
-
"versions": {
|
28 |
-
"custom|aimo_kaggle:v0|0": 0
|
29 |
-
},
|
30 |
-
"config_tasks": {
|
31 |
-
"custom|aimo_kaggle:v0": {
|
32 |
-
"name": "aimo_kaggle:v0",
|
33 |
-
"prompt_function": "kaggle_prompt_fn",
|
34 |
-
"hf_repo": "AI-MO/kaggle-validation-set",
|
35 |
-
"hf_subset": "v0",
|
36 |
-
"metric": [
|
37 |
-
"quasi_exact_match_math"
|
38 |
-
],
|
39 |
-
"hf_avail_splits": [
|
40 |
-
"train"
|
41 |
-
],
|
42 |
-
"evaluation_splits": [
|
43 |
-
"train"
|
44 |
-
],
|
45 |
-
"few_shots_split": null,
|
46 |
-
"few_shots_select": null,
|
47 |
-
"generation_size": 2048,
|
48 |
-
"stop_sequence": null,
|
49 |
-
"output_regex": null,
|
50 |
-
"num_samples": null,
|
51 |
-
"frozen": false,
|
52 |
-
"suite": [
|
53 |
-
"custom"
|
54 |
-
],
|
55 |
-
"original_num_docs": 50,
|
56 |
-
"effective_num_docs": 50,
|
57 |
-
"trust_dataset": null,
|
58 |
-
"must_remove_duplicate_docs": null,
|
59 |
-
"version": 0
|
60 |
-
}
|
61 |
-
},
|
62 |
-
"summary_tasks": {
|
63 |
-
"custom|aimo_kaggle:v0|0": {
|
64 |
-
"hashes": {
|
65 |
-
"hash_examples": "2a919e6b839e921a",
|
66 |
-
"hash_full_prompts": "e0424ade9e31a0fe",
|
67 |
-
"hash_input_tokens": "437c7d81b73a6e7a",
|
68 |
-
"hash_cont_tokens": "df51d50f82a946ab"
|
69 |
-
},
|
70 |
-
"truncated": 47,
|
71 |
-
"non_truncated": 3,
|
72 |
-
"padded": 34,
|
73 |
-
"non_padded": 16,
|
74 |
-
"effective_few_shots": 0.0,
|
75 |
-
"num_truncated_few_shots": 0
|
76 |
-
}
|
77 |
-
},
|
78 |
-
"summary_general": {
|
79 |
-
"hashes": {
|
80 |
-
"hash_examples": "f59ec6c4e24c1226",
|
81 |
-
"hash_full_prompts": "35e1e25418ccb6f7",
|
82 |
-
"hash_input_tokens": "c227167cd34ee1ce",
|
83 |
-
"hash_cont_tokens": "8d755a623d43cb16"
|
84 |
-
},
|
85 |
-
"truncated": 47,
|
86 |
-
"non_truncated": 3,
|
87 |
-
"padded": 34,
|
88 |
-
"non_padded": 16,
|
89 |
-
"num_truncated_few_shots": 0
|
90 |
-
}
|
91 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.22/aimo_kaggle_hard/results_2024-06-02T18-58-01.233506.json
DELETED
@@ -1,91 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"override_batch_size": 4,
|
6 |
-
"max_samples": null,
|
7 |
-
"job_id": "",
|
8 |
-
"start_time": 435064.413235468,
|
9 |
-
"end_time": 435372.093186668,
|
10 |
-
"total_evaluation_time_secondes": "307.6799512000289",
|
11 |
-
"model_name": "AI-MO/CodeLlama-13b-Python-hf-sft",
|
12 |
-
"model_sha": "811ca415e400b134946a0dbd631c76052d9c3209",
|
13 |
-
"model_dtype": "torch.bfloat16",
|
14 |
-
"model_size": "24.56 GB",
|
15 |
-
"config": null
|
16 |
-
},
|
17 |
-
"results": {
|
18 |
-
"custom|aimo_kaggle_hard:v0|0": {
|
19 |
-
"qem": 0.02,
|
20 |
-
"qem_stderr": 0.01999999999999998
|
21 |
-
},
|
22 |
-
"all": {
|
23 |
-
"qem": 0.02,
|
24 |
-
"qem_stderr": 0.01999999999999998
|
25 |
-
}
|
26 |
-
},
|
27 |
-
"versions": {
|
28 |
-
"custom|aimo_kaggle_hard:v0|0": 0
|
29 |
-
},
|
30 |
-
"config_tasks": {
|
31 |
-
"custom|aimo_kaggle_hard:v0": {
|
32 |
-
"name": "aimo_kaggle_hard:v0",
|
33 |
-
"prompt_function": "kaggle_hard_prompt_fn",
|
34 |
-
"hf_repo": "AI-MO/kaggle-validation-set-hard",
|
35 |
-
"hf_subset": "v0",
|
36 |
-
"metric": [
|
37 |
-
"quasi_exact_match_math"
|
38 |
-
],
|
39 |
-
"hf_avail_splits": [
|
40 |
-
"train"
|
41 |
-
],
|
42 |
-
"evaluation_splits": [
|
43 |
-
"train"
|
44 |
-
],
|
45 |
-
"few_shots_split": null,
|
46 |
-
"few_shots_select": null,
|
47 |
-
"generation_size": 2048,
|
48 |
-
"stop_sequence": null,
|
49 |
-
"output_regex": null,
|
50 |
-
"num_samples": null,
|
51 |
-
"frozen": false,
|
52 |
-
"suite": [
|
53 |
-
"custom"
|
54 |
-
],
|
55 |
-
"original_num_docs": 50,
|
56 |
-
"effective_num_docs": 50,
|
57 |
-
"trust_dataset": null,
|
58 |
-
"must_remove_duplicate_docs": null,
|
59 |
-
"version": 0
|
60 |
-
}
|
61 |
-
},
|
62 |
-
"summary_tasks": {
|
63 |
-
"custom|aimo_kaggle_hard:v0|0": {
|
64 |
-
"hashes": {
|
65 |
-
"hash_examples": "b40b6a493a95bf77",
|
66 |
-
"hash_full_prompts": "2c0f18269c13af34",
|
67 |
-
"hash_input_tokens": "b477f3b96ee9e7ba",
|
68 |
-
"hash_cont_tokens": "d2799d15d04aa177"
|
69 |
-
},
|
70 |
-
"truncated": 50,
|
71 |
-
"non_truncated": 0,
|
72 |
-
"padded": 37,
|
73 |
-
"non_padded": 13,
|
74 |
-
"effective_few_shots": 0.0,
|
75 |
-
"num_truncated_few_shots": 0
|
76 |
-
}
|
77 |
-
},
|
78 |
-
"summary_general": {
|
79 |
-
"hashes": {
|
80 |
-
"hash_examples": "79dbebcff6acad9e",
|
81 |
-
"hash_full_prompts": "739e6bdec3671fc6",
|
82 |
-
"hash_input_tokens": "16e2e8b5ca238d88",
|
83 |
-
"hash_cont_tokens": "7173dd2dbcb47987"
|
84 |
-
},
|
85 |
-
"truncated": 50,
|
86 |
-
"non_truncated": 0,
|
87 |
-
"padded": 37,
|
88 |
-
"non_padded": 13,
|
89 |
-
"num_truncated_few_shots": 0
|
90 |
-
}
|
91 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.22/aimo_kaggle_medium/results_2024-06-02T18-56-51.533786.json
DELETED
@@ -1,91 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"override_batch_size": 4,
|
6 |
-
"max_samples": null,
|
7 |
-
"job_id": "",
|
8 |
-
"start_time": 276787.286176622,
|
9 |
-
"end_time": 277119.494513706,
|
10 |
-
"total_evaluation_time_secondes": "332.208337084041",
|
11 |
-
"model_name": "AI-MO/CodeLlama-13b-Python-hf-sft",
|
12 |
-
"model_sha": "811ca415e400b134946a0dbd631c76052d9c3209",
|
13 |
-
"model_dtype": "torch.bfloat16",
|
14 |
-
"model_size": "24.56 GB",
|
15 |
-
"config": null
|
16 |
-
},
|
17 |
-
"results": {
|
18 |
-
"custom|aimo_kaggle_medium:v0|0": {
|
19 |
-
"qem": 0.1,
|
20 |
-
"qem_stderr": 0.04803844614152612
|
21 |
-
},
|
22 |
-
"all": {
|
23 |
-
"qem": 0.1,
|
24 |
-
"qem_stderr": 0.04803844614152612
|
25 |
-
}
|
26 |
-
},
|
27 |
-
"versions": {
|
28 |
-
"custom|aimo_kaggle_medium:v0|0": 0
|
29 |
-
},
|
30 |
-
"config_tasks": {
|
31 |
-
"custom|aimo_kaggle_medium:v0": {
|
32 |
-
"name": "aimo_kaggle_medium:v0",
|
33 |
-
"prompt_function": "kaggle_medium_prompt_fn",
|
34 |
-
"hf_repo": "AI-MO/kaggle-validation-set-medium",
|
35 |
-
"hf_subset": "v0",
|
36 |
-
"metric": [
|
37 |
-
"quasi_exact_match_math"
|
38 |
-
],
|
39 |
-
"hf_avail_splits": [
|
40 |
-
"train"
|
41 |
-
],
|
42 |
-
"evaluation_splits": [
|
43 |
-
"train"
|
44 |
-
],
|
45 |
-
"few_shots_split": null,
|
46 |
-
"few_shots_select": null,
|
47 |
-
"generation_size": 2048,
|
48 |
-
"stop_sequence": null,
|
49 |
-
"output_regex": null,
|
50 |
-
"num_samples": null,
|
51 |
-
"frozen": false,
|
52 |
-
"suite": [
|
53 |
-
"custom"
|
54 |
-
],
|
55 |
-
"original_num_docs": 40,
|
56 |
-
"effective_num_docs": 40,
|
57 |
-
"trust_dataset": null,
|
58 |
-
"must_remove_duplicate_docs": null,
|
59 |
-
"version": 0
|
60 |
-
}
|
61 |
-
},
|
62 |
-
"summary_tasks": {
|
63 |
-
"custom|aimo_kaggle_medium:v0|0": {
|
64 |
-
"hashes": {
|
65 |
-
"hash_examples": "3401efda8b0cbcb5",
|
66 |
-
"hash_full_prompts": "14cb6646c78f810c",
|
67 |
-
"hash_input_tokens": "d1d016040eb15856",
|
68 |
-
"hash_cont_tokens": "12549c8b6c88e722"
|
69 |
-
},
|
70 |
-
"truncated": 40,
|
71 |
-
"non_truncated": 0,
|
72 |
-
"padded": 32,
|
73 |
-
"non_padded": 8,
|
74 |
-
"effective_few_shots": 0.0,
|
75 |
-
"num_truncated_few_shots": 0
|
76 |
-
}
|
77 |
-
},
|
78 |
-
"summary_general": {
|
79 |
-
"hashes": {
|
80 |
-
"hash_examples": "4c81d27cfdb9d737",
|
81 |
-
"hash_full_prompts": "0d4787c840fc98b7",
|
82 |
-
"hash_input_tokens": "7e52a8d7f3e15dcc",
|
83 |
-
"hash_cont_tokens": "4afda0e2c7f0affc"
|
84 |
-
},
|
85 |
-
"truncated": 40,
|
86 |
-
"non_truncated": 0,
|
87 |
-
"padded": 32,
|
88 |
-
"non_padded": 8,
|
89 |
-
"num_truncated_few_shots": 0
|
90 |
-
}
|
91 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.23/aimo_kaggle/results_2024-06-01T21-45-15.801293.json
DELETED
@@ -1,91 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"override_batch_size": 4,
|
6 |
-
"max_samples": null,
|
7 |
-
"job_id": "",
|
8 |
-
"start_time": 2687950.292268706,
|
9 |
-
"end_time": 2688362.874668186,
|
10 |
-
"total_evaluation_time_secondes": "412.58239948004484",
|
11 |
-
"model_name": "AI-MO/CodeLlama-13b-Python-hf-sft",
|
12 |
-
"model_sha": "da6be403dc96aa7d82ac72f7807a2317d25c956d",
|
13 |
-
"model_dtype": "torch.bfloat16",
|
14 |
-
"model_size": "24.56 GB",
|
15 |
-
"config": null
|
16 |
-
},
|
17 |
-
"results": {
|
18 |
-
"custom|aimo_kaggle:v0|0": {
|
19 |
-
"qem": 0.16,
|
20 |
-
"qem_stderr": 0.052372293656638154
|
21 |
-
},
|
22 |
-
"all": {
|
23 |
-
"qem": 0.16,
|
24 |
-
"qem_stderr": 0.052372293656638154
|
25 |
-
}
|
26 |
-
},
|
27 |
-
"versions": {
|
28 |
-
"custom|aimo_kaggle:v0|0": 0
|
29 |
-
},
|
30 |
-
"config_tasks": {
|
31 |
-
"custom|aimo_kaggle:v0": {
|
32 |
-
"name": "aimo_kaggle:v0",
|
33 |
-
"prompt_function": "kaggle_prompt_fn",
|
34 |
-
"hf_repo": "AI-MO/kaggle-validation-set",
|
35 |
-
"hf_subset": "v0",
|
36 |
-
"metric": [
|
37 |
-
"quasi_exact_match_math"
|
38 |
-
],
|
39 |
-
"hf_avail_splits": [
|
40 |
-
"train"
|
41 |
-
],
|
42 |
-
"evaluation_splits": [
|
43 |
-
"train"
|
44 |
-
],
|
45 |
-
"few_shots_split": null,
|
46 |
-
"few_shots_select": null,
|
47 |
-
"generation_size": 2048,
|
48 |
-
"stop_sequence": null,
|
49 |
-
"output_regex": null,
|
50 |
-
"num_samples": null,
|
51 |
-
"frozen": false,
|
52 |
-
"suite": [
|
53 |
-
"custom"
|
54 |
-
],
|
55 |
-
"original_num_docs": 50,
|
56 |
-
"effective_num_docs": 50,
|
57 |
-
"trust_dataset": null,
|
58 |
-
"must_remove_duplicate_docs": null,
|
59 |
-
"version": 0
|
60 |
-
}
|
61 |
-
},
|
62 |
-
"summary_tasks": {
|
63 |
-
"custom|aimo_kaggle:v0|0": {
|
64 |
-
"hashes": {
|
65 |
-
"hash_examples": "2a919e6b839e921a",
|
66 |
-
"hash_full_prompts": "e0424ade9e31a0fe",
|
67 |
-
"hash_input_tokens": "437c7d81b73a6e7a",
|
68 |
-
"hash_cont_tokens": "8ea917b45f93244d"
|
69 |
-
},
|
70 |
-
"truncated": 47,
|
71 |
-
"non_truncated": 3,
|
72 |
-
"padded": 34,
|
73 |
-
"non_padded": 16,
|
74 |
-
"effective_few_shots": 0.0,
|
75 |
-
"num_truncated_few_shots": 0
|
76 |
-
}
|
77 |
-
},
|
78 |
-
"summary_general": {
|
79 |
-
"hashes": {
|
80 |
-
"hash_examples": "f59ec6c4e24c1226",
|
81 |
-
"hash_full_prompts": "35e1e25418ccb6f7",
|
82 |
-
"hash_input_tokens": "c227167cd34ee1ce",
|
83 |
-
"hash_cont_tokens": "0922e6daa54d396e"
|
84 |
-
},
|
85 |
-
"truncated": 47,
|
86 |
-
"non_truncated": 3,
|
87 |
-
"padded": 34,
|
88 |
-
"non_padded": 16,
|
89 |
-
"num_truncated_few_shots": 0
|
90 |
-
}
|
91 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.23/aimo_kaggle_hard/results_2024-06-01T21-45-20.802107.json
DELETED
@@ -1,91 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"override_batch_size": 4,
|
6 |
-
"max_samples": null,
|
7 |
-
"job_id": "",
|
8 |
-
"start_time": 440779.073690891,
|
9 |
-
"end_time": 441196.660713896,
|
10 |
-
"total_evaluation_time_secondes": "417.5870230050059",
|
11 |
-
"model_name": "AI-MO/CodeLlama-13b-Python-hf-sft",
|
12 |
-
"model_sha": "da6be403dc96aa7d82ac72f7807a2317d25c956d",
|
13 |
-
"model_dtype": "torch.bfloat16",
|
14 |
-
"model_size": "24.56 GB",
|
15 |
-
"config": null
|
16 |
-
},
|
17 |
-
"results": {
|
18 |
-
"custom|aimo_kaggle_hard:v0|0": {
|
19 |
-
"qem": 0.02,
|
20 |
-
"qem_stderr": 0.019999999999999987
|
21 |
-
},
|
22 |
-
"all": {
|
23 |
-
"qem": 0.02,
|
24 |
-
"qem_stderr": 0.019999999999999987
|
25 |
-
}
|
26 |
-
},
|
27 |
-
"versions": {
|
28 |
-
"custom|aimo_kaggle_hard:v0|0": 0
|
29 |
-
},
|
30 |
-
"config_tasks": {
|
31 |
-
"custom|aimo_kaggle_hard:v0": {
|
32 |
-
"name": "aimo_kaggle_hard:v0",
|
33 |
-
"prompt_function": "kaggle_hard_prompt_fn",
|
34 |
-
"hf_repo": "AI-MO/kaggle-validation-set-hard",
|
35 |
-
"hf_subset": "v0",
|
36 |
-
"metric": [
|
37 |
-
"quasi_exact_match_math"
|
38 |
-
],
|
39 |
-
"hf_avail_splits": [
|
40 |
-
"train"
|
41 |
-
],
|
42 |
-
"evaluation_splits": [
|
43 |
-
"train"
|
44 |
-
],
|
45 |
-
"few_shots_split": null,
|
46 |
-
"few_shots_select": null,
|
47 |
-
"generation_size": 2048,
|
48 |
-
"stop_sequence": null,
|
49 |
-
"output_regex": null,
|
50 |
-
"num_samples": null,
|
51 |
-
"frozen": false,
|
52 |
-
"suite": [
|
53 |
-
"custom"
|
54 |
-
],
|
55 |
-
"original_num_docs": 50,
|
56 |
-
"effective_num_docs": 50,
|
57 |
-
"trust_dataset": null,
|
58 |
-
"must_remove_duplicate_docs": null,
|
59 |
-
"version": 0
|
60 |
-
}
|
61 |
-
},
|
62 |
-
"summary_tasks": {
|
63 |
-
"custom|aimo_kaggle_hard:v0|0": {
|
64 |
-
"hashes": {
|
65 |
-
"hash_examples": "b40b6a493a95bf77",
|
66 |
-
"hash_full_prompts": "2c0f18269c13af34",
|
67 |
-
"hash_input_tokens": "b477f3b96ee9e7ba",
|
68 |
-
"hash_cont_tokens": "33444874bc125b34"
|
69 |
-
},
|
70 |
-
"truncated": 50,
|
71 |
-
"non_truncated": 0,
|
72 |
-
"padded": 37,
|
73 |
-
"non_padded": 13,
|
74 |
-
"effective_few_shots": 0.0,
|
75 |
-
"num_truncated_few_shots": 0
|
76 |
-
}
|
77 |
-
},
|
78 |
-
"summary_general": {
|
79 |
-
"hashes": {
|
80 |
-
"hash_examples": "79dbebcff6acad9e",
|
81 |
-
"hash_full_prompts": "739e6bdec3671fc6",
|
82 |
-
"hash_input_tokens": "16e2e8b5ca238d88",
|
83 |
-
"hash_cont_tokens": "b912019317de1727"
|
84 |
-
},
|
85 |
-
"truncated": 50,
|
86 |
-
"non_truncated": 0,
|
87 |
-
"padded": 37,
|
88 |
-
"non_padded": 13,
|
89 |
-
"num_truncated_few_shots": 0
|
90 |
-
}
|
91 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.23/aimo_kaggle_medium/results_2024-06-01T21-44-37.236377.json
DELETED
@@ -1,91 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"override_batch_size": 4,
|
6 |
-
"max_samples": null,
|
7 |
-
"job_id": "",
|
8 |
-
"start_time": 894361.517640607,
|
9 |
-
"end_time": 894757.469297242,
|
10 |
-
"total_evaluation_time_secondes": "395.95165663503576",
|
11 |
-
"model_name": "AI-MO/CodeLlama-13b-Python-hf-sft",
|
12 |
-
"model_sha": "da6be403dc96aa7d82ac72f7807a2317d25c956d",
|
13 |
-
"model_dtype": "torch.bfloat16",
|
14 |
-
"model_size": "24.56 GB",
|
15 |
-
"config": null
|
16 |
-
},
|
17 |
-
"results": {
|
18 |
-
"custom|aimo_kaggle_medium:v0|0": {
|
19 |
-
"qem": 0.225,
|
20 |
-
"qem_stderr": 0.06686668711812967
|
21 |
-
},
|
22 |
-
"all": {
|
23 |
-
"qem": 0.225,
|
24 |
-
"qem_stderr": 0.06686668711812967
|
25 |
-
}
|
26 |
-
},
|
27 |
-
"versions": {
|
28 |
-
"custom|aimo_kaggle_medium:v0|0": 0
|
29 |
-
},
|
30 |
-
"config_tasks": {
|
31 |
-
"custom|aimo_kaggle_medium:v0": {
|
32 |
-
"name": "aimo_kaggle_medium:v0",
|
33 |
-
"prompt_function": "kaggle_medium_prompt_fn",
|
34 |
-
"hf_repo": "AI-MO/kaggle-validation-set-medium",
|
35 |
-
"hf_subset": "v0",
|
36 |
-
"metric": [
|
37 |
-
"quasi_exact_match_math"
|
38 |
-
],
|
39 |
-
"hf_avail_splits": [
|
40 |
-
"train"
|
41 |
-
],
|
42 |
-
"evaluation_splits": [
|
43 |
-
"train"
|
44 |
-
],
|
45 |
-
"few_shots_split": null,
|
46 |
-
"few_shots_select": null,
|
47 |
-
"generation_size": 2048,
|
48 |
-
"stop_sequence": null,
|
49 |
-
"output_regex": null,
|
50 |
-
"num_samples": null,
|
51 |
-
"frozen": false,
|
52 |
-
"suite": [
|
53 |
-
"custom"
|
54 |
-
],
|
55 |
-
"original_num_docs": 40,
|
56 |
-
"effective_num_docs": 40,
|
57 |
-
"trust_dataset": null,
|
58 |
-
"must_remove_duplicate_docs": null,
|
59 |
-
"version": 0
|
60 |
-
}
|
61 |
-
},
|
62 |
-
"summary_tasks": {
|
63 |
-
"custom|aimo_kaggle_medium:v0|0": {
|
64 |
-
"hashes": {
|
65 |
-
"hash_examples": "3401efda8b0cbcb5",
|
66 |
-
"hash_full_prompts": "14cb6646c78f810c",
|
67 |
-
"hash_input_tokens": "d1d016040eb15856",
|
68 |
-
"hash_cont_tokens": "3648cd106873d142"
|
69 |
-
},
|
70 |
-
"truncated": 40,
|
71 |
-
"non_truncated": 0,
|
72 |
-
"padded": 32,
|
73 |
-
"non_padded": 8,
|
74 |
-
"effective_few_shots": 0.0,
|
75 |
-
"num_truncated_few_shots": 0
|
76 |
-
}
|
77 |
-
},
|
78 |
-
"summary_general": {
|
79 |
-
"hashes": {
|
80 |
-
"hash_examples": "4c81d27cfdb9d737",
|
81 |
-
"hash_full_prompts": "0d4787c840fc98b7",
|
82 |
-
"hash_input_tokens": "7e52a8d7f3e15dcc",
|
83 |
-
"hash_cont_tokens": "8b6f7543774b3e8b"
|
84 |
-
},
|
85 |
-
"truncated": 40,
|
86 |
-
"non_truncated": 0,
|
87 |
-
"padded": 32,
|
88 |
-
"non_padded": 8,
|
89 |
-
"num_truncated_few_shots": 0
|
90 |
-
}
|
91 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.24/aimo_kaggle/results_2024-06-01T21-19-06.909857.json
DELETED
@@ -1,91 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"override_batch_size": 4,
|
6 |
-
"max_samples": null,
|
7 |
-
"job_id": "",
|
8 |
-
"start_time": 1600323.90425634,
|
9 |
-
"end_time": 1600623.947537335,
|
10 |
-
"total_evaluation_time_secondes": "300.04328099498525",
|
11 |
-
"model_name": "AI-MO/CodeLlama-13b-Python-hf-sft",
|
12 |
-
"model_sha": "894b60bbafd1de6b91cd582ad8015544ed8bf4ad",
|
13 |
-
"model_dtype": "torch.bfloat16",
|
14 |
-
"model_size": "24.56 GB",
|
15 |
-
"config": null
|
16 |
-
},
|
17 |
-
"results": {
|
18 |
-
"custom|aimo_kaggle:v0|0": {
|
19 |
-
"qem": 0.16,
|
20 |
-
"qem_stderr": 0.052372293656638154
|
21 |
-
},
|
22 |
-
"all": {
|
23 |
-
"qem": 0.16,
|
24 |
-
"qem_stderr": 0.052372293656638154
|
25 |
-
}
|
26 |
-
},
|
27 |
-
"versions": {
|
28 |
-
"custom|aimo_kaggle:v0|0": 0
|
29 |
-
},
|
30 |
-
"config_tasks": {
|
31 |
-
"custom|aimo_kaggle:v0": {
|
32 |
-
"name": "aimo_kaggle:v0",
|
33 |
-
"prompt_function": "kaggle_prompt_fn",
|
34 |
-
"hf_repo": "AI-MO/kaggle-validation-set",
|
35 |
-
"hf_subset": "v0",
|
36 |
-
"metric": [
|
37 |
-
"quasi_exact_match_math"
|
38 |
-
],
|
39 |
-
"hf_avail_splits": [
|
40 |
-
"train"
|
41 |
-
],
|
42 |
-
"evaluation_splits": [
|
43 |
-
"train"
|
44 |
-
],
|
45 |
-
"few_shots_split": null,
|
46 |
-
"few_shots_select": null,
|
47 |
-
"generation_size": 2048,
|
48 |
-
"stop_sequence": null,
|
49 |
-
"output_regex": null,
|
50 |
-
"num_samples": null,
|
51 |
-
"frozen": false,
|
52 |
-
"suite": [
|
53 |
-
"custom"
|
54 |
-
],
|
55 |
-
"original_num_docs": 50,
|
56 |
-
"effective_num_docs": 50,
|
57 |
-
"trust_dataset": null,
|
58 |
-
"must_remove_duplicate_docs": null,
|
59 |
-
"version": 0
|
60 |
-
}
|
61 |
-
},
|
62 |
-
"summary_tasks": {
|
63 |
-
"custom|aimo_kaggle:v0|0": {
|
64 |
-
"hashes": {
|
65 |
-
"hash_examples": "2a919e6b839e921a",
|
66 |
-
"hash_full_prompts": "e0424ade9e31a0fe",
|
67 |
-
"hash_input_tokens": "437c7d81b73a6e7a",
|
68 |
-
"hash_cont_tokens": "fe6d63cab6bd11fa"
|
69 |
-
},
|
70 |
-
"truncated": 47,
|
71 |
-
"non_truncated": 3,
|
72 |
-
"padded": 34,
|
73 |
-
"non_padded": 16,
|
74 |
-
"effective_few_shots": 0.0,
|
75 |
-
"num_truncated_few_shots": 0
|
76 |
-
}
|
77 |
-
},
|
78 |
-
"summary_general": {
|
79 |
-
"hashes": {
|
80 |
-
"hash_examples": "f59ec6c4e24c1226",
|
81 |
-
"hash_full_prompts": "35e1e25418ccb6f7",
|
82 |
-
"hash_input_tokens": "c227167cd34ee1ce",
|
83 |
-
"hash_cont_tokens": "1e410755dc9fd305"
|
84 |
-
},
|
85 |
-
"truncated": 47,
|
86 |
-
"non_truncated": 3,
|
87 |
-
"padded": 34,
|
88 |
-
"non_padded": 16,
|
89 |
-
"num_truncated_few_shots": 0
|
90 |
-
}
|
91 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.24/aimo_kaggle_hard/results_2024-06-01T21-23-15.431062.json
DELETED
@@ -1,91 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"override_batch_size": 4,
|
6 |
-
"max_samples": null,
|
7 |
-
"job_id": "",
|
8 |
-
"start_time": 1415019.132282249,
|
9 |
-
"end_time": 1415409.621590862,
|
10 |
-
"total_evaluation_time_secondes": "390.48930861311965",
|
11 |
-
"model_name": "AI-MO/CodeLlama-13b-Python-hf-sft",
|
12 |
-
"model_sha": "894b60bbafd1de6b91cd582ad8015544ed8bf4ad",
|
13 |
-
"model_dtype": "torch.bfloat16",
|
14 |
-
"model_size": "24.56 GB",
|
15 |
-
"config": null
|
16 |
-
},
|
17 |
-
"results": {
|
18 |
-
"custom|aimo_kaggle_hard:v0|0": {
|
19 |
-
"qem": 0.02,
|
20 |
-
"qem_stderr": 0.01999999999999998
|
21 |
-
},
|
22 |
-
"all": {
|
23 |
-
"qem": 0.02,
|
24 |
-
"qem_stderr": 0.01999999999999998
|
25 |
-
}
|
26 |
-
},
|
27 |
-
"versions": {
|
28 |
-
"custom|aimo_kaggle_hard:v0|0": 0
|
29 |
-
},
|
30 |
-
"config_tasks": {
|
31 |
-
"custom|aimo_kaggle_hard:v0": {
|
32 |
-
"name": "aimo_kaggle_hard:v0",
|
33 |
-
"prompt_function": "kaggle_hard_prompt_fn",
|
34 |
-
"hf_repo": "AI-MO/kaggle-validation-set-hard",
|
35 |
-
"hf_subset": "v0",
|
36 |
-
"metric": [
|
37 |
-
"quasi_exact_match_math"
|
38 |
-
],
|
39 |
-
"hf_avail_splits": [
|
40 |
-
"train"
|
41 |
-
],
|
42 |
-
"evaluation_splits": [
|
43 |
-
"train"
|
44 |
-
],
|
45 |
-
"few_shots_split": null,
|
46 |
-
"few_shots_select": null,
|
47 |
-
"generation_size": 2048,
|
48 |
-
"stop_sequence": null,
|
49 |
-
"output_regex": null,
|
50 |
-
"num_samples": null,
|
51 |
-
"frozen": false,
|
52 |
-
"suite": [
|
53 |
-
"custom"
|
54 |
-
],
|
55 |
-
"original_num_docs": 50,
|
56 |
-
"effective_num_docs": 50,
|
57 |
-
"trust_dataset": null,
|
58 |
-
"must_remove_duplicate_docs": null,
|
59 |
-
"version": 0
|
60 |
-
}
|
61 |
-
},
|
62 |
-
"summary_tasks": {
|
63 |
-
"custom|aimo_kaggle_hard:v0|0": {
|
64 |
-
"hashes": {
|
65 |
-
"hash_examples": "b40b6a493a95bf77",
|
66 |
-
"hash_full_prompts": "2c0f18269c13af34",
|
67 |
-
"hash_input_tokens": "b477f3b96ee9e7ba",
|
68 |
-
"hash_cont_tokens": "a45f6fe2abfcdc12"
|
69 |
-
},
|
70 |
-
"truncated": 50,
|
71 |
-
"non_truncated": 0,
|
72 |
-
"padded": 37,
|
73 |
-
"non_padded": 13,
|
74 |
-
"effective_few_shots": 0.0,
|
75 |
-
"num_truncated_few_shots": 0
|
76 |
-
}
|
77 |
-
},
|
78 |
-
"summary_general": {
|
79 |
-
"hashes": {
|
80 |
-
"hash_examples": "79dbebcff6acad9e",
|
81 |
-
"hash_full_prompts": "739e6bdec3671fc6",
|
82 |
-
"hash_input_tokens": "16e2e8b5ca238d88",
|
83 |
-
"hash_cont_tokens": "b47a38415b441e77"
|
84 |
-
},
|
85 |
-
"truncated": 50,
|
86 |
-
"non_truncated": 0,
|
87 |
-
"padded": 37,
|
88 |
-
"non_padded": 13,
|
89 |
-
"num_truncated_few_shots": 0
|
90 |
-
}
|
91 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.24/aimo_kaggle_medium/results_2024-06-01T21-21-15.404900.json
DELETED
@@ -1,91 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"override_batch_size": 4,
|
6 |
-
"max_samples": null,
|
7 |
-
"job_id": "",
|
8 |
-
"start_time": 892991.386037297,
|
9 |
-
"end_time": 893355.6378942,
|
10 |
-
"total_evaluation_time_secondes": "364.2518569030799",
|
11 |
-
"model_name": "AI-MO/CodeLlama-13b-Python-hf-sft",
|
12 |
-
"model_sha": "894b60bbafd1de6b91cd582ad8015544ed8bf4ad",
|
13 |
-
"model_dtype": "torch.bfloat16",
|
14 |
-
"model_size": "24.56 GB",
|
15 |
-
"config": null
|
16 |
-
},
|
17 |
-
"results": {
|
18 |
-
"custom|aimo_kaggle_medium:v0|0": {
|
19 |
-
"qem": 0.2,
|
20 |
-
"qem_stderr": 0.06405126152203487
|
21 |
-
},
|
22 |
-
"all": {
|
23 |
-
"qem": 0.2,
|
24 |
-
"qem_stderr": 0.06405126152203487
|
25 |
-
}
|
26 |
-
},
|
27 |
-
"versions": {
|
28 |
-
"custom|aimo_kaggle_medium:v0|0": 0
|
29 |
-
},
|
30 |
-
"config_tasks": {
|
31 |
-
"custom|aimo_kaggle_medium:v0": {
|
32 |
-
"name": "aimo_kaggle_medium:v0",
|
33 |
-
"prompt_function": "kaggle_medium_prompt_fn",
|
34 |
-
"hf_repo": "AI-MO/kaggle-validation-set-medium",
|
35 |
-
"hf_subset": "v0",
|
36 |
-
"metric": [
|
37 |
-
"quasi_exact_match_math"
|
38 |
-
],
|
39 |
-
"hf_avail_splits": [
|
40 |
-
"train"
|
41 |
-
],
|
42 |
-
"evaluation_splits": [
|
43 |
-
"train"
|
44 |
-
],
|
45 |
-
"few_shots_split": null,
|
46 |
-
"few_shots_select": null,
|
47 |
-
"generation_size": 2048,
|
48 |
-
"stop_sequence": null,
|
49 |
-
"output_regex": null,
|
50 |
-
"num_samples": null,
|
51 |
-
"frozen": false,
|
52 |
-
"suite": [
|
53 |
-
"custom"
|
54 |
-
],
|
55 |
-
"original_num_docs": 40,
|
56 |
-
"effective_num_docs": 40,
|
57 |
-
"trust_dataset": null,
|
58 |
-
"must_remove_duplicate_docs": null,
|
59 |
-
"version": 0
|
60 |
-
}
|
61 |
-
},
|
62 |
-
"summary_tasks": {
|
63 |
-
"custom|aimo_kaggle_medium:v0|0": {
|
64 |
-
"hashes": {
|
65 |
-
"hash_examples": "3401efda8b0cbcb5",
|
66 |
-
"hash_full_prompts": "14cb6646c78f810c",
|
67 |
-
"hash_input_tokens": "d1d016040eb15856",
|
68 |
-
"hash_cont_tokens": "3f7b84c283243aa2"
|
69 |
-
},
|
70 |
-
"truncated": 40,
|
71 |
-
"non_truncated": 0,
|
72 |
-
"padded": 32,
|
73 |
-
"non_padded": 8,
|
74 |
-
"effective_few_shots": 0.0,
|
75 |
-
"num_truncated_few_shots": 0
|
76 |
-
}
|
77 |
-
},
|
78 |
-
"summary_general": {
|
79 |
-
"hashes": {
|
80 |
-
"hash_examples": "4c81d27cfdb9d737",
|
81 |
-
"hash_full_prompts": "0d4787c840fc98b7",
|
82 |
-
"hash_input_tokens": "7e52a8d7f3e15dcc",
|
83 |
-
"hash_cont_tokens": "1ff5a5fbaaccab2f"
|
84 |
-
},
|
85 |
-
"truncated": 40,
|
86 |
-
"non_truncated": 0,
|
87 |
-
"padded": 32,
|
88 |
-
"non_padded": 8,
|
89 |
-
"num_truncated_few_shots": 0
|
90 |
-
}
|
91 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.25/aimo_kaggle/results_2024-06-02T18-50-16.244304.json
DELETED
@@ -1,91 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"override_batch_size": 4,
|
6 |
-
"max_samples": null,
|
7 |
-
"job_id": "",
|
8 |
-
"start_time": 276485.929851092,
|
9 |
-
"end_time": 276724.204976521,
|
10 |
-
"total_evaluation_time_secondes": "238.27512542903423",
|
11 |
-
"model_name": "AI-MO/CodeLlama-13b-Python-hf-sft",
|
12 |
-
"model_sha": "5803ef46ffcbaf9680dbe32c261bf2ce1f32f1c3",
|
13 |
-
"model_dtype": "torch.bfloat16",
|
14 |
-
"model_size": "24.56 GB",
|
15 |
-
"config": null
|
16 |
-
},
|
17 |
-
"results": {
|
18 |
-
"custom|aimo_kaggle:v0|0": {
|
19 |
-
"qem": 0.08,
|
20 |
-
"qem_stderr": 0.03875617133214439
|
21 |
-
},
|
22 |
-
"all": {
|
23 |
-
"qem": 0.08,
|
24 |
-
"qem_stderr": 0.03875617133214439
|
25 |
-
}
|
26 |
-
},
|
27 |
-
"versions": {
|
28 |
-
"custom|aimo_kaggle:v0|0": 0
|
29 |
-
},
|
30 |
-
"config_tasks": {
|
31 |
-
"custom|aimo_kaggle:v0": {
|
32 |
-
"name": "aimo_kaggle:v0",
|
33 |
-
"prompt_function": "kaggle_prompt_fn",
|
34 |
-
"hf_repo": "AI-MO/kaggle-validation-set",
|
35 |
-
"hf_subset": "v0",
|
36 |
-
"metric": [
|
37 |
-
"quasi_exact_match_math"
|
38 |
-
],
|
39 |
-
"hf_avail_splits": [
|
40 |
-
"train"
|
41 |
-
],
|
42 |
-
"evaluation_splits": [
|
43 |
-
"train"
|
44 |
-
],
|
45 |
-
"few_shots_split": null,
|
46 |
-
"few_shots_select": null,
|
47 |
-
"generation_size": 2048,
|
48 |
-
"stop_sequence": null,
|
49 |
-
"output_regex": null,
|
50 |
-
"num_samples": null,
|
51 |
-
"frozen": false,
|
52 |
-
"suite": [
|
53 |
-
"custom"
|
54 |
-
],
|
55 |
-
"original_num_docs": 50,
|
56 |
-
"effective_num_docs": 50,
|
57 |
-
"trust_dataset": null,
|
58 |
-
"must_remove_duplicate_docs": null,
|
59 |
-
"version": 0
|
60 |
-
}
|
61 |
-
},
|
62 |
-
"summary_tasks": {
|
63 |
-
"custom|aimo_kaggle:v0|0": {
|
64 |
-
"hashes": {
|
65 |
-
"hash_examples": "2a919e6b839e921a",
|
66 |
-
"hash_full_prompts": "e0424ade9e31a0fe",
|
67 |
-
"hash_input_tokens": "437c7d81b73a6e7a",
|
68 |
-
"hash_cont_tokens": "3d1548c398e66753"
|
69 |
-
},
|
70 |
-
"truncated": 47,
|
71 |
-
"non_truncated": 3,
|
72 |
-
"padded": 34,
|
73 |
-
"non_padded": 16,
|
74 |
-
"effective_few_shots": 0.0,
|
75 |
-
"num_truncated_few_shots": 0
|
76 |
-
}
|
77 |
-
},
|
78 |
-
"summary_general": {
|
79 |
-
"hashes": {
|
80 |
-
"hash_examples": "f59ec6c4e24c1226",
|
81 |
-
"hash_full_prompts": "35e1e25418ccb6f7",
|
82 |
-
"hash_input_tokens": "c227167cd34ee1ce",
|
83 |
-
"hash_cont_tokens": "2d552d90e907933d"
|
84 |
-
},
|
85 |
-
"truncated": 47,
|
86 |
-
"non_truncated": 3,
|
87 |
-
"padded": 34,
|
88 |
-
"non_padded": 16,
|
89 |
-
"num_truncated_few_shots": 0
|
90 |
-
}
|
91 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.25/aimo_kaggle_hard/results_2024-06-02T18-55-06.099176.json
DELETED
@@ -1,91 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"override_batch_size": 4,
|
6 |
-
"max_samples": null,
|
7 |
-
"job_id": "",
|
8 |
-
"start_time": 517062.738187587,
|
9 |
-
"end_time": 517381.957801859,
|
10 |
-
"total_evaluation_time_secondes": "319.21961427200586",
|
11 |
-
"model_name": "AI-MO/CodeLlama-13b-Python-hf-sft",
|
12 |
-
"model_sha": "5803ef46ffcbaf9680dbe32c261bf2ce1f32f1c3",
|
13 |
-
"model_dtype": "torch.bfloat16",
|
14 |
-
"model_size": "24.56 GB",
|
15 |
-
"config": null
|
16 |
-
},
|
17 |
-
"results": {
|
18 |
-
"custom|aimo_kaggle_hard:v0|0": {
|
19 |
-
"qem": 0.02,
|
20 |
-
"qem_stderr": 0.01999999999999998
|
21 |
-
},
|
22 |
-
"all": {
|
23 |
-
"qem": 0.02,
|
24 |
-
"qem_stderr": 0.01999999999999998
|
25 |
-
}
|
26 |
-
},
|
27 |
-
"versions": {
|
28 |
-
"custom|aimo_kaggle_hard:v0|0": 0
|
29 |
-
},
|
30 |
-
"config_tasks": {
|
31 |
-
"custom|aimo_kaggle_hard:v0": {
|
32 |
-
"name": "aimo_kaggle_hard:v0",
|
33 |
-
"prompt_function": "kaggle_hard_prompt_fn",
|
34 |
-
"hf_repo": "AI-MO/kaggle-validation-set-hard",
|
35 |
-
"hf_subset": "v0",
|
36 |
-
"metric": [
|
37 |
-
"quasi_exact_match_math"
|
38 |
-
],
|
39 |
-
"hf_avail_splits": [
|
40 |
-
"train"
|
41 |
-
],
|
42 |
-
"evaluation_splits": [
|
43 |
-
"train"
|
44 |
-
],
|
45 |
-
"few_shots_split": null,
|
46 |
-
"few_shots_select": null,
|
47 |
-
"generation_size": 2048,
|
48 |
-
"stop_sequence": null,
|
49 |
-
"output_regex": null,
|
50 |
-
"num_samples": null,
|
51 |
-
"frozen": false,
|
52 |
-
"suite": [
|
53 |
-
"custom"
|
54 |
-
],
|
55 |
-
"original_num_docs": 50,
|
56 |
-
"effective_num_docs": 50,
|
57 |
-
"trust_dataset": null,
|
58 |
-
"must_remove_duplicate_docs": null,
|
59 |
-
"version": 0
|
60 |
-
}
|
61 |
-
},
|
62 |
-
"summary_tasks": {
|
63 |
-
"custom|aimo_kaggle_hard:v0|0": {
|
64 |
-
"hashes": {
|
65 |
-
"hash_examples": "b40b6a493a95bf77",
|
66 |
-
"hash_full_prompts": "2c0f18269c13af34",
|
67 |
-
"hash_input_tokens": "b477f3b96ee9e7ba",
|
68 |
-
"hash_cont_tokens": "4ddc90bc9d802107"
|
69 |
-
},
|
70 |
-
"truncated": 50,
|
71 |
-
"non_truncated": 0,
|
72 |
-
"padded": 37,
|
73 |
-
"non_padded": 13,
|
74 |
-
"effective_few_shots": 0.0,
|
75 |
-
"num_truncated_few_shots": 0
|
76 |
-
}
|
77 |
-
},
|
78 |
-
"summary_general": {
|
79 |
-
"hashes": {
|
80 |
-
"hash_examples": "79dbebcff6acad9e",
|
81 |
-
"hash_full_prompts": "739e6bdec3671fc6",
|
82 |
-
"hash_input_tokens": "16e2e8b5ca238d88",
|
83 |
-
"hash_cont_tokens": "93760eb0e8c49d67"
|
84 |
-
},
|
85 |
-
"truncated": 50,
|
86 |
-
"non_truncated": 0,
|
87 |
-
"padded": 37,
|
88 |
-
"non_padded": 13,
|
89 |
-
"num_truncated_few_shots": 0
|
90 |
-
}
|
91 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.25/aimo_kaggle_medium/results_2024-06-02T18-53-27.832845.json
DELETED
@@ -1,91 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"override_batch_size": 4,
|
6 |
-
"max_samples": null,
|
7 |
-
"job_id": "",
|
8 |
-
"start_time": 406197.396604711,
|
9 |
-
"end_time": 406527.808383541,
|
10 |
-
"total_evaluation_time_secondes": "330.4117788299918",
|
11 |
-
"model_name": "AI-MO/CodeLlama-13b-Python-hf-sft",
|
12 |
-
"model_sha": "5803ef46ffcbaf9680dbe32c261bf2ce1f32f1c3",
|
13 |
-
"model_dtype": "torch.bfloat16",
|
14 |
-
"model_size": "24.56 GB",
|
15 |
-
"config": null
|
16 |
-
},
|
17 |
-
"results": {
|
18 |
-
"custom|aimo_kaggle_medium:v0|0": {
|
19 |
-
"qem": 0.1,
|
20 |
-
"qem_stderr": 0.04803844614152612
|
21 |
-
},
|
22 |
-
"all": {
|
23 |
-
"qem": 0.1,
|
24 |
-
"qem_stderr": 0.04803844614152612
|
25 |
-
}
|
26 |
-
},
|
27 |
-
"versions": {
|
28 |
-
"custom|aimo_kaggle_medium:v0|0": 0
|
29 |
-
},
|
30 |
-
"config_tasks": {
|
31 |
-
"custom|aimo_kaggle_medium:v0": {
|
32 |
-
"name": "aimo_kaggle_medium:v0",
|
33 |
-
"prompt_function": "kaggle_medium_prompt_fn",
|
34 |
-
"hf_repo": "AI-MO/kaggle-validation-set-medium",
|
35 |
-
"hf_subset": "v0",
|
36 |
-
"metric": [
|
37 |
-
"quasi_exact_match_math"
|
38 |
-
],
|
39 |
-
"hf_avail_splits": [
|
40 |
-
"train"
|
41 |
-
],
|
42 |
-
"evaluation_splits": [
|
43 |
-
"train"
|
44 |
-
],
|
45 |
-
"few_shots_split": null,
|
46 |
-
"few_shots_select": null,
|
47 |
-
"generation_size": 2048,
|
48 |
-
"stop_sequence": null,
|
49 |
-
"output_regex": null,
|
50 |
-
"num_samples": null,
|
51 |
-
"frozen": false,
|
52 |
-
"suite": [
|
53 |
-
"custom"
|
54 |
-
],
|
55 |
-
"original_num_docs": 40,
|
56 |
-
"effective_num_docs": 40,
|
57 |
-
"trust_dataset": null,
|
58 |
-
"must_remove_duplicate_docs": null,
|
59 |
-
"version": 0
|
60 |
-
}
|
61 |
-
},
|
62 |
-
"summary_tasks": {
|
63 |
-
"custom|aimo_kaggle_medium:v0|0": {
|
64 |
-
"hashes": {
|
65 |
-
"hash_examples": "3401efda8b0cbcb5",
|
66 |
-
"hash_full_prompts": "14cb6646c78f810c",
|
67 |
-
"hash_input_tokens": "d1d016040eb15856",
|
68 |
-
"hash_cont_tokens": "47262795fd1ed671"
|
69 |
-
},
|
70 |
-
"truncated": 40,
|
71 |
-
"non_truncated": 0,
|
72 |
-
"padded": 32,
|
73 |
-
"non_padded": 8,
|
74 |
-
"effective_few_shots": 0.0,
|
75 |
-
"num_truncated_few_shots": 0
|
76 |
-
}
|
77 |
-
},
|
78 |
-
"summary_general": {
|
79 |
-
"hashes": {
|
80 |
-
"hash_examples": "4c81d27cfdb9d737",
|
81 |
-
"hash_full_prompts": "0d4787c840fc98b7",
|
82 |
-
"hash_input_tokens": "7e52a8d7f3e15dcc",
|
83 |
-
"hash_cont_tokens": "c12a9376b8d939d5"
|
84 |
-
},
|
85 |
-
"truncated": 40,
|
86 |
-
"non_truncated": 0,
|
87 |
-
"padded": 32,
|
88 |
-
"non_padded": 8,
|
89 |
-
"num_truncated_few_shots": 0
|
90 |
-
}
|
91 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.26/aimo_kaggle/results_2024-06-01T21-39-31.404005.json
DELETED
@@ -1,91 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"override_batch_size": 4,
|
6 |
-
"max_samples": null,
|
7 |
-
"job_id": "",
|
8 |
-
"start_time": 1601532.508414207,
|
9 |
-
"end_time": 1601848.441672974,
|
10 |
-
"total_evaluation_time_secondes": "315.9332587670069",
|
11 |
-
"model_name": "AI-MO/CodeLlama-13b-Python-hf-sft",
|
12 |
-
"model_sha": "d4222e8c590d6c31fda1b4b63ed5dbe6a28a1f0c",
|
13 |
-
"model_dtype": "torch.bfloat16",
|
14 |
-
"model_size": "24.56 GB",
|
15 |
-
"config": null
|
16 |
-
},
|
17 |
-
"results": {
|
18 |
-
"custom|aimo_kaggle:v0|0": {
|
19 |
-
"qem": 0.16,
|
20 |
-
"qem_stderr": 0.052372293656638154
|
21 |
-
},
|
22 |
-
"all": {
|
23 |
-
"qem": 0.16,
|
24 |
-
"qem_stderr": 0.052372293656638154
|
25 |
-
}
|
26 |
-
},
|
27 |
-
"versions": {
|
28 |
-
"custom|aimo_kaggle:v0|0": 0
|
29 |
-
},
|
30 |
-
"config_tasks": {
|
31 |
-
"custom|aimo_kaggle:v0": {
|
32 |
-
"name": "aimo_kaggle:v0",
|
33 |
-
"prompt_function": "kaggle_prompt_fn",
|
34 |
-
"hf_repo": "AI-MO/kaggle-validation-set",
|
35 |
-
"hf_subset": "v0",
|
36 |
-
"metric": [
|
37 |
-
"quasi_exact_match_math"
|
38 |
-
],
|
39 |
-
"hf_avail_splits": [
|
40 |
-
"train"
|
41 |
-
],
|
42 |
-
"evaluation_splits": [
|
43 |
-
"train"
|
44 |
-
],
|
45 |
-
"few_shots_split": null,
|
46 |
-
"few_shots_select": null,
|
47 |
-
"generation_size": 2048,
|
48 |
-
"stop_sequence": null,
|
49 |
-
"output_regex": null,
|
50 |
-
"num_samples": null,
|
51 |
-
"frozen": false,
|
52 |
-
"suite": [
|
53 |
-
"custom"
|
54 |
-
],
|
55 |
-
"original_num_docs": 50,
|
56 |
-
"effective_num_docs": 50,
|
57 |
-
"trust_dataset": null,
|
58 |
-
"must_remove_duplicate_docs": null,
|
59 |
-
"version": 0
|
60 |
-
}
|
61 |
-
},
|
62 |
-
"summary_tasks": {
|
63 |
-
"custom|aimo_kaggle:v0|0": {
|
64 |
-
"hashes": {
|
65 |
-
"hash_examples": "2a919e6b839e921a",
|
66 |
-
"hash_full_prompts": "e0424ade9e31a0fe",
|
67 |
-
"hash_input_tokens": "437c7d81b73a6e7a",
|
68 |
-
"hash_cont_tokens": "c0a5c2dbd8d43de0"
|
69 |
-
},
|
70 |
-
"truncated": 47,
|
71 |
-
"non_truncated": 3,
|
72 |
-
"padded": 34,
|
73 |
-
"non_padded": 16,
|
74 |
-
"effective_few_shots": 0.0,
|
75 |
-
"num_truncated_few_shots": 0
|
76 |
-
}
|
77 |
-
},
|
78 |
-
"summary_general": {
|
79 |
-
"hashes": {
|
80 |
-
"hash_examples": "f59ec6c4e24c1226",
|
81 |
-
"hash_full_prompts": "35e1e25418ccb6f7",
|
82 |
-
"hash_input_tokens": "c227167cd34ee1ce",
|
83 |
-
"hash_cont_tokens": "ec9051d94c2aad79"
|
84 |
-
},
|
85 |
-
"truncated": 47,
|
86 |
-
"non_truncated": 3,
|
87 |
-
"padded": 34,
|
88 |
-
"non_padded": 16,
|
89 |
-
"num_truncated_few_shots": 0
|
90 |
-
}
|
91 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.26/aimo_kaggle_hard/results_2024-06-01T21-43-22.900752.json
DELETED
@@ -1,91 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config_general": {
|
3 |
-
"lighteval_sha": "?",
|
4 |
-
"num_fewshot_seeds": 1,
|
5 |
-
"override_batch_size": 4,
|
6 |
-
"max_samples": null,
|
7 |
-
"job_id": "",
|
8 |
-
"start_time": 542430.398596583,
|
9 |
-
"end_time": 542785.055968121,
|
10 |
-
"total_evaluation_time_secondes": "354.65737153799273",
|
11 |
-
"model_name": "AI-MO/CodeLlama-13b-Python-hf-sft",
|
12 |
-
"model_sha": "d4222e8c590d6c31fda1b4b63ed5dbe6a28a1f0c",
|
13 |
-
"model_dtype": "torch.bfloat16",
|
14 |
-
"model_size": "24.56 GB",
|
15 |
-
"config": null
|
16 |
-
},
|
17 |
-
"results": {
|
18 |
-
"custom|aimo_kaggle_hard:v0|0": {
|
19 |
-
"qem": 0.02,
|
20 |
-
"qem_stderr": 0.019999999999999976
|
21 |
-
},
|
22 |
-
"all": {
|
23 |
-
"qem": 0.02,
|
24 |
-
"qem_stderr": 0.019999999999999976
|
25 |
-
}
|
26 |
-
},
|
27 |
-
"versions": {
|
28 |
-
"custom|aimo_kaggle_hard:v0|0": 0
|
29 |
-
},
|
30 |
-
"config_tasks": {
|
31 |
-
"custom|aimo_kaggle_hard:v0": {
|
32 |
-
"name": "aimo_kaggle_hard:v0",
|
33 |
-
"prompt_function": "kaggle_hard_prompt_fn",
|
34 |
-
"hf_repo": "AI-MO/kaggle-validation-set-hard",
|
35 |
-
"hf_subset": "v0",
|
36 |
-
"metric": [
|
37 |
-
"quasi_exact_match_math"
|
38 |
-
],
|
39 |
-
"hf_avail_splits": [
|
40 |
-
"train"
|
41 |
-
],
|
42 |
-
"evaluation_splits": [
|
43 |
-
"train"
|
44 |
-
],
|
45 |
-
"few_shots_split": null,
|
46 |
-
"few_shots_select": null,
|
47 |
-
"generation_size": 2048,
|
48 |
-
"stop_sequence": null,
|
49 |
-
"output_regex": null,
|
50 |
-
"num_samples": null,
|
51 |
-
"frozen": false,
|
52 |
-
"suite": [
|
53 |
-
"custom"
|
54 |
-
],
|
55 |
-
"original_num_docs": 50,
|
56 |
-
"effective_num_docs": 50,
|
57 |
-
"trust_dataset": null,
|
58 |
-
"must_remove_duplicate_docs": null,
|
59 |
-
"version": 0
|
60 |
-
}
|
61 |
-
},
|
62 |
-
"summary_tasks": {
|
63 |
-
"custom|aimo_kaggle_hard:v0|0": {
|
64 |
-
"hashes": {
|
65 |
-
"hash_examples": "b40b6a493a95bf77",
|
66 |
-
"hash_full_prompts": "2c0f18269c13af34",
|
67 |
-
"hash_input_tokens": "b477f3b96ee9e7ba",
|
68 |
-
"hash_cont_tokens": "685d74fcb7bc0954"
|
69 |
-
},
|
70 |
-
"truncated": 50,
|
71 |
-
"non_truncated": 0,
|
72 |
-
"padded": 37,
|
73 |
-
"non_padded": 13,
|
74 |
-
"effective_few_shots": 0.0,
|
75 |
-
"num_truncated_few_shots": 0
|
76 |
-
}
|
77 |
-
},
|
78 |
-
"summary_general": {
|
79 |
-
"hashes": {
|
80 |
-
"hash_examples": "79dbebcff6acad9e",
|
81 |
-
"hash_full_prompts": "739e6bdec3671fc6",
|
82 |
-
"hash_input_tokens": "16e2e8b5ca238d88",
|
83 |
-
"hash_cont_tokens": "9ddec76b260758c3"
|
84 |
-
},
|
85 |
-
"truncated": 50,
|
86 |
-
"non_truncated": 0,
|
87 |
-
"padded": 37,
|
88 |
-
"non_padded": 13,
|
89 |
-
"num_truncated_few_shots": 0
|
90 |
-
}
|
91 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|