edbeeching HF Staff commited on
Commit
82185cf
·
verified ·
1 Parent(s): 38a75ac

Delete eval_results

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.00/aimo_kaggle/results_2024-06-01T04-33-51.272361.json +0 -91
  2. eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.00/aimo_kaggle_hard/results_2024-06-01T04-41-15.206832.json +0 -91
  3. eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.00/aimo_kaggle_medium/results_2024-06-01T04-37-13.624182.json +0 -91
  4. eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.10/aimo_kaggle/results_2024-06-02T19-57-15.384420.json +0 -91
  5. eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.10/aimo_kaggle_hard/results_2024-06-02T20-00-55.391068.json +0 -91
  6. eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.10/aimo_kaggle_medium/results_2024-06-02T19-58-49.662787.json +0 -91
  7. eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.11/aimo_kaggle/results_2024-06-01T04-50-49.856519.json +0 -91
  8. eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.11/aimo_kaggle_hard/results_2024-06-01T04-52-36.232279.json +0 -91
  9. eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.11/aimo_kaggle_medium/results_2024-06-01T04-50-32.122163.json +0 -91
  10. eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.12/aimo_kaggle/results_2024-06-02T11-36-26.669791.json +0 -91
  11. eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.12/aimo_kaggle_hard/results_2024-06-02T11-51-16.456597.json +0 -91
  12. eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.12/aimo_kaggle_medium/results_2024-06-02T11-43-17.517072.json +0 -91
  13. eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.13/aimo_kaggle/results_2024-06-02T19-00-04.484770.json +0 -91
  14. eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.13/aimo_kaggle_hard/results_2024-06-02T19-02-41.193644.json +0 -91
  15. eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.13/aimo_kaggle_medium/results_2024-06-02T18-59-17.155994.json +0 -91
  16. eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.14/aimo_kaggle/results_2024-06-02T19-46-44.922456.json +0 -91
  17. eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.14/aimo_kaggle_hard/results_2024-06-02T19-53-35.519132.json +0 -91
  18. eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.14/aimo_kaggle_medium/results_2024-06-02T19-47-54.814443.json +0 -91
  19. eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.16/aimo_kaggle/results_2024-06-01T04-42-22.110150.json +0 -91
  20. eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.16/aimo_kaggle_hard/results_2024-06-01T04-50-33.000483.json +0 -91
  21. eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.16/aimo_kaggle_medium/results_2024-06-01T04-44-46.129536.json +0 -91
  22. eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.17/aimo_kaggle/results_2024-06-01T05-55-00.892964.json +0 -91
  23. eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.17/aimo_kaggle_hard/results_2024-06-01T05-54-49.725003.json +0 -91
  24. eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.17/aimo_kaggle_medium/results_2024-06-01T05-53-12.777129.json +0 -91
  25. eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.18/aimo_kaggle/results_2024-06-02T20-03-07.489443.json +0 -91
  26. eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.18/aimo_kaggle_hard/results_2024-06-02T20-04-32.429315.json +0 -91
  27. eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.18/aimo_kaggle_medium/results_2024-06-02T20-03-59.774755.json +0 -91
  28. eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.19/aimo_kaggle/results_2024-06-01T21-23-46.264185.json +0 -91
  29. eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.19/aimo_kaggle_hard/results_2024-06-01T21-26-21.659630.json +0 -91
  30. eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.19/aimo_kaggle_medium/results_2024-06-01T21-24-03.954253.json +0 -91
  31. eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.20/aimo_kaggle/results_2024-06-01T05-35-51.309986.json +0 -91
  32. eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.20/aimo_kaggle_hard/results_2024-06-01T05-37-20.266435.json +0 -91
  33. eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.20/aimo_kaggle_medium/results_2024-06-01T05-36-05.734157.json +0 -91
  34. eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.21/aimo_kaggle/results_2024-06-01T05-45-43.684415.json +0 -91
  35. eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.21/aimo_kaggle_hard/results_2024-06-01T05-46-32.901937.json +0 -91
  36. eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.21/aimo_kaggle_medium/results_2024-06-01T05-46-09.737288.json +0 -91
  37. eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.22/aimo_kaggle/results_2024-06-02T18-58-15.446870.json +0 -91
  38. eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.22/aimo_kaggle_hard/results_2024-06-02T18-58-01.233506.json +0 -91
  39. eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.22/aimo_kaggle_medium/results_2024-06-02T18-56-51.533786.json +0 -91
  40. eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.23/aimo_kaggle/results_2024-06-01T21-45-15.801293.json +0 -91
  41. eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.23/aimo_kaggle_hard/results_2024-06-01T21-45-20.802107.json +0 -91
  42. eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.23/aimo_kaggle_medium/results_2024-06-01T21-44-37.236377.json +0 -91
  43. eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.24/aimo_kaggle/results_2024-06-01T21-19-06.909857.json +0 -91
  44. eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.24/aimo_kaggle_hard/results_2024-06-01T21-23-15.431062.json +0 -91
  45. eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.24/aimo_kaggle_medium/results_2024-06-01T21-21-15.404900.json +0 -91
  46. eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.25/aimo_kaggle/results_2024-06-02T18-50-16.244304.json +0 -91
  47. eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.25/aimo_kaggle_hard/results_2024-06-02T18-55-06.099176.json +0 -91
  48. eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.25/aimo_kaggle_medium/results_2024-06-02T18-53-27.832845.json +0 -91
  49. eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.26/aimo_kaggle/results_2024-06-01T21-39-31.404005.json +0 -91
  50. eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.26/aimo_kaggle_hard/results_2024-06-01T21-43-22.900752.json +0 -91
eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.00/aimo_kaggle/results_2024-06-01T04-33-51.272361.json DELETED
@@ -1,91 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": 4,
6
- "max_samples": null,
7
- "job_id": "",
8
- "start_time": 7330971.82432595,
9
- "end_time": 7331385.7388414,
10
- "total_evaluation_time_secondes": "413.9145154496655",
11
- "model_name": "AI-MO/CodeLlama-13b-Python-hf-sft",
12
- "model_sha": "a04357d1fbd0c17d86ec3aff74a5f6196ce49b8a",
13
- "model_dtype": "torch.bfloat16",
14
- "model_size": "24.56 GB",
15
- "config": null
16
- },
17
- "results": {
18
- "custom|aimo_kaggle:v0|0": {
19
- "qem": 0.02,
20
- "qem_stderr": 0.01999999999999998
21
- },
22
- "all": {
23
- "qem": 0.02,
24
- "qem_stderr": 0.01999999999999998
25
- }
26
- },
27
- "versions": {
28
- "custom|aimo_kaggle:v0|0": 0
29
- },
30
- "config_tasks": {
31
- "custom|aimo_kaggle:v0": {
32
- "name": "aimo_kaggle:v0",
33
- "prompt_function": "kaggle_prompt_fn",
34
- "hf_repo": "AI-MO/kaggle-validation-set",
35
- "hf_subset": "v0",
36
- "metric": [
37
- "quasi_exact_match_math"
38
- ],
39
- "hf_avail_splits": [
40
- "train"
41
- ],
42
- "evaluation_splits": [
43
- "train"
44
- ],
45
- "few_shots_split": null,
46
- "few_shots_select": null,
47
- "generation_size": 2048,
48
- "stop_sequence": null,
49
- "output_regex": null,
50
- "num_samples": null,
51
- "frozen": false,
52
- "suite": [
53
- "custom"
54
- ],
55
- "original_num_docs": 50,
56
- "effective_num_docs": 50,
57
- "trust_dataset": null,
58
- "must_remove_duplicate_docs": null,
59
- "version": 0
60
- }
61
- },
62
- "summary_tasks": {
63
- "custom|aimo_kaggle:v0|0": {
64
- "hashes": {
65
- "hash_examples": "2a919e6b839e921a",
66
- "hash_full_prompts": "e0424ade9e31a0fe",
67
- "hash_input_tokens": "437c7d81b73a6e7a",
68
- "hash_cont_tokens": "a7d90f3464fc3428"
69
- },
70
- "truncated": 47,
71
- "non_truncated": 3,
72
- "padded": 34,
73
- "non_padded": 16,
74
- "effective_few_shots": 0.0,
75
- "num_truncated_few_shots": 0
76
- }
77
- },
78
- "summary_general": {
79
- "hashes": {
80
- "hash_examples": "f59ec6c4e24c1226",
81
- "hash_full_prompts": "35e1e25418ccb6f7",
82
- "hash_input_tokens": "c227167cd34ee1ce",
83
- "hash_cont_tokens": "5834780a779d0112"
84
- },
85
- "truncated": 47,
86
- "non_truncated": 3,
87
- "padded": 34,
88
- "non_padded": 16,
89
- "num_truncated_few_shots": 0
90
- }
91
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.00/aimo_kaggle_hard/results_2024-06-01T04-41-15.206832.json DELETED
@@ -1,91 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": 4,
6
- "max_samples": null,
7
- "job_id": "",
8
- "start_time": 2914608.793842847,
9
- "end_time": 2915055.215341104,
10
- "total_evaluation_time_secondes": "446.42149825720116",
11
- "model_name": "AI-MO/CodeLlama-13b-Python-hf-sft",
12
- "model_sha": "a04357d1fbd0c17d86ec3aff74a5f6196ce49b8a",
13
- "model_dtype": "torch.bfloat16",
14
- "model_size": "24.56 GB",
15
- "config": null
16
- },
17
- "results": {
18
- "custom|aimo_kaggle_hard:v0|0": {
19
- "qem": 0.0,
20
- "qem_stderr": 0.0
21
- },
22
- "all": {
23
- "qem": 0.0,
24
- "qem_stderr": 0.0
25
- }
26
- },
27
- "versions": {
28
- "custom|aimo_kaggle_hard:v0|0": 0
29
- },
30
- "config_tasks": {
31
- "custom|aimo_kaggle_hard:v0": {
32
- "name": "aimo_kaggle_hard:v0",
33
- "prompt_function": "kaggle_hard_prompt_fn",
34
- "hf_repo": "AI-MO/kaggle-validation-set-hard",
35
- "hf_subset": "v0",
36
- "metric": [
37
- "quasi_exact_match_math"
38
- ],
39
- "hf_avail_splits": [
40
- "train"
41
- ],
42
- "evaluation_splits": [
43
- "train"
44
- ],
45
- "few_shots_split": null,
46
- "few_shots_select": null,
47
- "generation_size": 2048,
48
- "stop_sequence": null,
49
- "output_regex": null,
50
- "num_samples": null,
51
- "frozen": false,
52
- "suite": [
53
- "custom"
54
- ],
55
- "original_num_docs": 50,
56
- "effective_num_docs": 50,
57
- "trust_dataset": null,
58
- "must_remove_duplicate_docs": null,
59
- "version": 0
60
- }
61
- },
62
- "summary_tasks": {
63
- "custom|aimo_kaggle_hard:v0|0": {
64
- "hashes": {
65
- "hash_examples": "b40b6a493a95bf77",
66
- "hash_full_prompts": "2c0f18269c13af34",
67
- "hash_input_tokens": "b477f3b96ee9e7ba",
68
- "hash_cont_tokens": "5a4ca24432d580a0"
69
- },
70
- "truncated": 50,
71
- "non_truncated": 0,
72
- "padded": 37,
73
- "non_padded": 13,
74
- "effective_few_shots": 0.0,
75
- "num_truncated_few_shots": 0
76
- }
77
- },
78
- "summary_general": {
79
- "hashes": {
80
- "hash_examples": "79dbebcff6acad9e",
81
- "hash_full_prompts": "739e6bdec3671fc6",
82
- "hash_input_tokens": "16e2e8b5ca238d88",
83
- "hash_cont_tokens": "cac9942ec74ecf73"
84
- },
85
- "truncated": 50,
86
- "non_truncated": 0,
87
- "padded": 37,
88
- "non_padded": 13,
89
- "num_truncated_few_shots": 0
90
- }
91
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.00/aimo_kaggle_medium/results_2024-06-01T04-37-13.624182.json DELETED
@@ -1,91 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": 4,
6
- "max_samples": null,
7
- "job_id": "",
8
- "start_time": 2020.568049556,
9
- "end_time": 2348.699219677,
10
- "total_evaluation_time_secondes": "328.1311701210002",
11
- "model_name": "AI-MO/CodeLlama-13b-Python-hf-sft",
12
- "model_sha": "a04357d1fbd0c17d86ec3aff74a5f6196ce49b8a",
13
- "model_dtype": "torch.bfloat16",
14
- "model_size": "24.56 GB",
15
- "config": null
16
- },
17
- "results": {
18
- "custom|aimo_kaggle_medium:v0|0": {
19
- "qem": 0.05,
20
- "qem_stderr": 0.03489912202260562
21
- },
22
- "all": {
23
- "qem": 0.05,
24
- "qem_stderr": 0.03489912202260562
25
- }
26
- },
27
- "versions": {
28
- "custom|aimo_kaggle_medium:v0|0": 0
29
- },
30
- "config_tasks": {
31
- "custom|aimo_kaggle_medium:v0": {
32
- "name": "aimo_kaggle_medium:v0",
33
- "prompt_function": "kaggle_medium_prompt_fn",
34
- "hf_repo": "AI-MO/kaggle-validation-set-medium",
35
- "hf_subset": "v0",
36
- "metric": [
37
- "quasi_exact_match_math"
38
- ],
39
- "hf_avail_splits": [
40
- "train"
41
- ],
42
- "evaluation_splits": [
43
- "train"
44
- ],
45
- "few_shots_split": null,
46
- "few_shots_select": null,
47
- "generation_size": 2048,
48
- "stop_sequence": null,
49
- "output_regex": null,
50
- "num_samples": null,
51
- "frozen": false,
52
- "suite": [
53
- "custom"
54
- ],
55
- "original_num_docs": 40,
56
- "effective_num_docs": 40,
57
- "trust_dataset": null,
58
- "must_remove_duplicate_docs": null,
59
- "version": 0
60
- }
61
- },
62
- "summary_tasks": {
63
- "custom|aimo_kaggle_medium:v0|0": {
64
- "hashes": {
65
- "hash_examples": "3401efda8b0cbcb5",
66
- "hash_full_prompts": "14cb6646c78f810c",
67
- "hash_input_tokens": "d1d016040eb15856",
68
- "hash_cont_tokens": "6fadc9f927a1b164"
69
- },
70
- "truncated": 40,
71
- "non_truncated": 0,
72
- "padded": 32,
73
- "non_padded": 8,
74
- "effective_few_shots": 0.0,
75
- "num_truncated_few_shots": 0
76
- }
77
- },
78
- "summary_general": {
79
- "hashes": {
80
- "hash_examples": "4c81d27cfdb9d737",
81
- "hash_full_prompts": "0d4787c840fc98b7",
82
- "hash_input_tokens": "7e52a8d7f3e15dcc",
83
- "hash_cont_tokens": "132f6ddffd6c517b"
84
- },
85
- "truncated": 40,
86
- "non_truncated": 0,
87
- "padded": 32,
88
- "non_padded": 8,
89
- "num_truncated_few_shots": 0
90
- }
91
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.10/aimo_kaggle/results_2024-06-02T19-57-15.384420.json DELETED
@@ -1,91 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": 4,
6
- "max_samples": null,
7
- "job_id": "",
8
- "start_time": 280489.090456043,
9
- "end_time": 280743.345045138,
10
- "total_evaluation_time_secondes": "254.254589094955",
11
- "model_name": "AI-MO/CodeLlama-13b-Python-hf-sft",
12
- "model_sha": "476392a1119138a234ac9ebb5cb9de3ad3f2c637",
13
- "model_dtype": "torch.bfloat16",
14
- "model_size": "24.56 GB",
15
- "config": null
16
- },
17
- "results": {
18
- "custom|aimo_kaggle:v0|0": {
19
- "qem": 0.14,
20
- "qem_stderr": 0.04956957592256421
21
- },
22
- "all": {
23
- "qem": 0.14,
24
- "qem_stderr": 0.04956957592256421
25
- }
26
- },
27
- "versions": {
28
- "custom|aimo_kaggle:v0|0": 0
29
- },
30
- "config_tasks": {
31
- "custom|aimo_kaggle:v0": {
32
- "name": "aimo_kaggle:v0",
33
- "prompt_function": "kaggle_prompt_fn",
34
- "hf_repo": "AI-MO/kaggle-validation-set",
35
- "hf_subset": "v0",
36
- "metric": [
37
- "quasi_exact_match_math"
38
- ],
39
- "hf_avail_splits": [
40
- "train"
41
- ],
42
- "evaluation_splits": [
43
- "train"
44
- ],
45
- "few_shots_split": null,
46
- "few_shots_select": null,
47
- "generation_size": 2048,
48
- "stop_sequence": null,
49
- "output_regex": null,
50
- "num_samples": null,
51
- "frozen": false,
52
- "suite": [
53
- "custom"
54
- ],
55
- "original_num_docs": 50,
56
- "effective_num_docs": 50,
57
- "trust_dataset": null,
58
- "must_remove_duplicate_docs": null,
59
- "version": 0
60
- }
61
- },
62
- "summary_tasks": {
63
- "custom|aimo_kaggle:v0|0": {
64
- "hashes": {
65
- "hash_examples": "2a919e6b839e921a",
66
- "hash_full_prompts": "e0424ade9e31a0fe",
67
- "hash_input_tokens": "437c7d81b73a6e7a",
68
- "hash_cont_tokens": "bcbff7ea8a042b8e"
69
- },
70
- "truncated": 47,
71
- "non_truncated": 3,
72
- "padded": 34,
73
- "non_padded": 16,
74
- "effective_few_shots": 0.0,
75
- "num_truncated_few_shots": 0
76
- }
77
- },
78
- "summary_general": {
79
- "hashes": {
80
- "hash_examples": "f59ec6c4e24c1226",
81
- "hash_full_prompts": "35e1e25418ccb6f7",
82
- "hash_input_tokens": "c227167cd34ee1ce",
83
- "hash_cont_tokens": "f4e1e6cc618093dc"
84
- },
85
- "truncated": 47,
86
- "non_truncated": 3,
87
- "padded": 34,
88
- "non_padded": 16,
89
- "num_truncated_few_shots": 0
90
- }
91
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.10/aimo_kaggle_hard/results_2024-06-02T20-00-55.391068.json DELETED
@@ -1,91 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": 4,
6
- "max_samples": null,
7
- "job_id": "",
8
- "start_time": 7728.073363246,
9
- "end_time": 8024.43010322,
10
- "total_evaluation_time_secondes": "296.35673997400045",
11
- "model_name": "AI-MO/CodeLlama-13b-Python-hf-sft",
12
- "model_sha": "476392a1119138a234ac9ebb5cb9de3ad3f2c637",
13
- "model_dtype": "torch.bfloat16",
14
- "model_size": "24.56 GB",
15
- "config": null
16
- },
17
- "results": {
18
- "custom|aimo_kaggle_hard:v0|0": {
19
- "qem": 0.02,
20
- "qem_stderr": 0.01999999999999998
21
- },
22
- "all": {
23
- "qem": 0.02,
24
- "qem_stderr": 0.01999999999999998
25
- }
26
- },
27
- "versions": {
28
- "custom|aimo_kaggle_hard:v0|0": 0
29
- },
30
- "config_tasks": {
31
- "custom|aimo_kaggle_hard:v0": {
32
- "name": "aimo_kaggle_hard:v0",
33
- "prompt_function": "kaggle_hard_prompt_fn",
34
- "hf_repo": "AI-MO/kaggle-validation-set-hard",
35
- "hf_subset": "v0",
36
- "metric": [
37
- "quasi_exact_match_math"
38
- ],
39
- "hf_avail_splits": [
40
- "train"
41
- ],
42
- "evaluation_splits": [
43
- "train"
44
- ],
45
- "few_shots_split": null,
46
- "few_shots_select": null,
47
- "generation_size": 2048,
48
- "stop_sequence": null,
49
- "output_regex": null,
50
- "num_samples": null,
51
- "frozen": false,
52
- "suite": [
53
- "custom"
54
- ],
55
- "original_num_docs": 50,
56
- "effective_num_docs": 50,
57
- "trust_dataset": null,
58
- "must_remove_duplicate_docs": null,
59
- "version": 0
60
- }
61
- },
62
- "summary_tasks": {
63
- "custom|aimo_kaggle_hard:v0|0": {
64
- "hashes": {
65
- "hash_examples": "b40b6a493a95bf77",
66
- "hash_full_prompts": "2c0f18269c13af34",
67
- "hash_input_tokens": "b477f3b96ee9e7ba",
68
- "hash_cont_tokens": "c04ec37bbda4d8b2"
69
- },
70
- "truncated": 50,
71
- "non_truncated": 0,
72
- "padded": 37,
73
- "non_padded": 13,
74
- "effective_few_shots": 0.0,
75
- "num_truncated_few_shots": 0
76
- }
77
- },
78
- "summary_general": {
79
- "hashes": {
80
- "hash_examples": "79dbebcff6acad9e",
81
- "hash_full_prompts": "739e6bdec3671fc6",
82
- "hash_input_tokens": "16e2e8b5ca238d88",
83
- "hash_cont_tokens": "fad45544c423ae77"
84
- },
85
- "truncated": 50,
86
- "non_truncated": 0,
87
- "padded": 37,
88
- "non_padded": 13,
89
- "num_truncated_few_shots": 0
90
- }
91
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.10/aimo_kaggle_medium/results_2024-06-02T19-58-49.662787.json DELETED
@@ -1,91 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": 4,
6
- "max_samples": null,
7
- "job_id": "",
8
- "start_time": 812559.720312176,
9
- "end_time": 812808.702969616,
10
- "total_evaluation_time_secondes": "248.98265744000673",
11
- "model_name": "AI-MO/CodeLlama-13b-Python-hf-sft",
12
- "model_sha": "476392a1119138a234ac9ebb5cb9de3ad3f2c637",
13
- "model_dtype": "torch.bfloat16",
14
- "model_size": "24.56 GB",
15
- "config": null
16
- },
17
- "results": {
18
- "custom|aimo_kaggle_medium:v0|0": {
19
- "qem": 0.15,
20
- "qem_stderr": 0.05717718748968655
21
- },
22
- "all": {
23
- "qem": 0.15,
24
- "qem_stderr": 0.05717718748968655
25
- }
26
- },
27
- "versions": {
28
- "custom|aimo_kaggle_medium:v0|0": 0
29
- },
30
- "config_tasks": {
31
- "custom|aimo_kaggle_medium:v0": {
32
- "name": "aimo_kaggle_medium:v0",
33
- "prompt_function": "kaggle_medium_prompt_fn",
34
- "hf_repo": "AI-MO/kaggle-validation-set-medium",
35
- "hf_subset": "v0",
36
- "metric": [
37
- "quasi_exact_match_math"
38
- ],
39
- "hf_avail_splits": [
40
- "train"
41
- ],
42
- "evaluation_splits": [
43
- "train"
44
- ],
45
- "few_shots_split": null,
46
- "few_shots_select": null,
47
- "generation_size": 2048,
48
- "stop_sequence": null,
49
- "output_regex": null,
50
- "num_samples": null,
51
- "frozen": false,
52
- "suite": [
53
- "custom"
54
- ],
55
- "original_num_docs": 40,
56
- "effective_num_docs": 40,
57
- "trust_dataset": null,
58
- "must_remove_duplicate_docs": null,
59
- "version": 0
60
- }
61
- },
62
- "summary_tasks": {
63
- "custom|aimo_kaggle_medium:v0|0": {
64
- "hashes": {
65
- "hash_examples": "3401efda8b0cbcb5",
66
- "hash_full_prompts": "14cb6646c78f810c",
67
- "hash_input_tokens": "d1d016040eb15856",
68
- "hash_cont_tokens": "170d64e143cf6996"
69
- },
70
- "truncated": 40,
71
- "non_truncated": 0,
72
- "padded": 32,
73
- "non_padded": 8,
74
- "effective_few_shots": 0.0,
75
- "num_truncated_few_shots": 0
76
- }
77
- },
78
- "summary_general": {
79
- "hashes": {
80
- "hash_examples": "4c81d27cfdb9d737",
81
- "hash_full_prompts": "0d4787c840fc98b7",
82
- "hash_input_tokens": "7e52a8d7f3e15dcc",
83
- "hash_cont_tokens": "da7df27fcd9b9bb1"
84
- },
85
- "truncated": 40,
86
- "non_truncated": 0,
87
- "padded": 32,
88
- "non_padded": 8,
89
- "num_truncated_few_shots": 0
90
- }
91
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.11/aimo_kaggle/results_2024-06-01T04-50-49.856519.json DELETED
@@ -1,91 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": 4,
6
- "max_samples": null,
7
- "job_id": "",
8
- "start_time": 7331945.252107517,
9
- "end_time": 7332404.323357863,
10
- "total_evaluation_time_secondes": "459.07125034648925",
11
- "model_name": "AI-MO/CodeLlama-13b-Python-hf-sft",
12
- "model_sha": "fcb7fe46d55df8d593ea5cb04e46c01949542360",
13
- "model_dtype": "torch.bfloat16",
14
- "model_size": "24.56 GB",
15
- "config": null
16
- },
17
- "results": {
18
- "custom|aimo_kaggle:v0|0": {
19
- "qem": 0.08,
20
- "qem_stderr": 0.038756171332144415
21
- },
22
- "all": {
23
- "qem": 0.08,
24
- "qem_stderr": 0.038756171332144415
25
- }
26
- },
27
- "versions": {
28
- "custom|aimo_kaggle:v0|0": 0
29
- },
30
- "config_tasks": {
31
- "custom|aimo_kaggle:v0": {
32
- "name": "aimo_kaggle:v0",
33
- "prompt_function": "kaggle_prompt_fn",
34
- "hf_repo": "AI-MO/kaggle-validation-set",
35
- "hf_subset": "v0",
36
- "metric": [
37
- "quasi_exact_match_math"
38
- ],
39
- "hf_avail_splits": [
40
- "train"
41
- ],
42
- "evaluation_splits": [
43
- "train"
44
- ],
45
- "few_shots_split": null,
46
- "few_shots_select": null,
47
- "generation_size": 2048,
48
- "stop_sequence": null,
49
- "output_regex": null,
50
- "num_samples": null,
51
- "frozen": false,
52
- "suite": [
53
- "custom"
54
- ],
55
- "original_num_docs": 50,
56
- "effective_num_docs": 50,
57
- "trust_dataset": null,
58
- "must_remove_duplicate_docs": null,
59
- "version": 0
60
- }
61
- },
62
- "summary_tasks": {
63
- "custom|aimo_kaggle:v0|0": {
64
- "hashes": {
65
- "hash_examples": "2a919e6b839e921a",
66
- "hash_full_prompts": "e0424ade9e31a0fe",
67
- "hash_input_tokens": "437c7d81b73a6e7a",
68
- "hash_cont_tokens": "75baf1c260a5957f"
69
- },
70
- "truncated": 47,
71
- "non_truncated": 3,
72
- "padded": 34,
73
- "non_padded": 16,
74
- "effective_few_shots": 0.0,
75
- "num_truncated_few_shots": 0
76
- }
77
- },
78
- "summary_general": {
79
- "hashes": {
80
- "hash_examples": "f59ec6c4e24c1226",
81
- "hash_full_prompts": "35e1e25418ccb6f7",
82
- "hash_input_tokens": "c227167cd34ee1ce",
83
- "hash_cont_tokens": "e0a0c309948e7ef6"
84
- },
85
- "truncated": 47,
86
- "non_truncated": 3,
87
- "padded": 34,
88
- "non_padded": 16,
89
- "num_truncated_few_shots": 0
90
- }
91
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.11/aimo_kaggle_hard/results_2024-06-01T04-52-36.232279.json DELETED
@@ -1,91 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": 4,
6
- "max_samples": null,
7
- "job_id": "",
8
- "start_time": 2843.619818643,
9
- "end_time": 3271.30727537,
10
- "total_evaluation_time_secondes": "427.6874567269997",
11
- "model_name": "AI-MO/CodeLlama-13b-Python-hf-sft",
12
- "model_sha": "fcb7fe46d55df8d593ea5cb04e46c01949542360",
13
- "model_dtype": "torch.bfloat16",
14
- "model_size": "24.56 GB",
15
- "config": null
16
- },
17
- "results": {
18
- "custom|aimo_kaggle_hard:v0|0": {
19
- "qem": 0.02,
20
- "qem_stderr": 0.01999999999999998
21
- },
22
- "all": {
23
- "qem": 0.02,
24
- "qem_stderr": 0.01999999999999998
25
- }
26
- },
27
- "versions": {
28
- "custom|aimo_kaggle_hard:v0|0": 0
29
- },
30
- "config_tasks": {
31
- "custom|aimo_kaggle_hard:v0": {
32
- "name": "aimo_kaggle_hard:v0",
33
- "prompt_function": "kaggle_hard_prompt_fn",
34
- "hf_repo": "AI-MO/kaggle-validation-set-hard",
35
- "hf_subset": "v0",
36
- "metric": [
37
- "quasi_exact_match_math"
38
- ],
39
- "hf_avail_splits": [
40
- "train"
41
- ],
42
- "evaluation_splits": [
43
- "train"
44
- ],
45
- "few_shots_split": null,
46
- "few_shots_select": null,
47
- "generation_size": 2048,
48
- "stop_sequence": null,
49
- "output_regex": null,
50
- "num_samples": null,
51
- "frozen": false,
52
- "suite": [
53
- "custom"
54
- ],
55
- "original_num_docs": 50,
56
- "effective_num_docs": 50,
57
- "trust_dataset": null,
58
- "must_remove_duplicate_docs": null,
59
- "version": 0
60
- }
61
- },
62
- "summary_tasks": {
63
- "custom|aimo_kaggle_hard:v0|0": {
64
- "hashes": {
65
- "hash_examples": "b40b6a493a95bf77",
66
- "hash_full_prompts": "2c0f18269c13af34",
67
- "hash_input_tokens": "b477f3b96ee9e7ba",
68
- "hash_cont_tokens": "98a32e6b1fbbd588"
69
- },
70
- "truncated": 50,
71
- "non_truncated": 0,
72
- "padded": 37,
73
- "non_padded": 13,
74
- "effective_few_shots": 0.0,
75
- "num_truncated_few_shots": 0
76
- }
77
- },
78
- "summary_general": {
79
- "hashes": {
80
- "hash_examples": "79dbebcff6acad9e",
81
- "hash_full_prompts": "739e6bdec3671fc6",
82
- "hash_input_tokens": "16e2e8b5ca238d88",
83
- "hash_cont_tokens": "87ead9b0afc5ce14"
84
- },
85
- "truncated": 50,
86
- "non_truncated": 0,
87
- "padded": 37,
88
- "non_padded": 13,
89
- "num_truncated_few_shots": 0
90
- }
91
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.11/aimo_kaggle_medium/results_2024-06-01T04-50-32.122163.json DELETED
@@ -1,91 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": 4,
6
- "max_samples": null,
7
- "job_id": "",
8
- "start_time": 4455054.814072875,
9
- "end_time": 4455474.057915317,
10
- "total_evaluation_time_secondes": "419.24384244158864",
11
- "model_name": "AI-MO/CodeLlama-13b-Python-hf-sft",
12
- "model_sha": "fcb7fe46d55df8d593ea5cb04e46c01949542360",
13
- "model_dtype": "torch.bfloat16",
14
- "model_size": "24.56 GB",
15
- "config": null
16
- },
17
- "results": {
18
- "custom|aimo_kaggle_medium:v0|0": {
19
- "qem": 0.075,
20
- "qem_stderr": 0.04217636961434869
21
- },
22
- "all": {
23
- "qem": 0.075,
24
- "qem_stderr": 0.04217636961434869
25
- }
26
- },
27
- "versions": {
28
- "custom|aimo_kaggle_medium:v0|0": 0
29
- },
30
- "config_tasks": {
31
- "custom|aimo_kaggle_medium:v0": {
32
- "name": "aimo_kaggle_medium:v0",
33
- "prompt_function": "kaggle_medium_prompt_fn",
34
- "hf_repo": "AI-MO/kaggle-validation-set-medium",
35
- "hf_subset": "v0",
36
- "metric": [
37
- "quasi_exact_match_math"
38
- ],
39
- "hf_avail_splits": [
40
- "train"
41
- ],
42
- "evaluation_splits": [
43
- "train"
44
- ],
45
- "few_shots_split": null,
46
- "few_shots_select": null,
47
- "generation_size": 2048,
48
- "stop_sequence": null,
49
- "output_regex": null,
50
- "num_samples": null,
51
- "frozen": false,
52
- "suite": [
53
- "custom"
54
- ],
55
- "original_num_docs": 40,
56
- "effective_num_docs": 40,
57
- "trust_dataset": null,
58
- "must_remove_duplicate_docs": null,
59
- "version": 0
60
- }
61
- },
62
- "summary_tasks": {
63
- "custom|aimo_kaggle_medium:v0|0": {
64
- "hashes": {
65
- "hash_examples": "3401efda8b0cbcb5",
66
- "hash_full_prompts": "14cb6646c78f810c",
67
- "hash_input_tokens": "d1d016040eb15856",
68
- "hash_cont_tokens": "dc0b3975a4eba9fa"
69
- },
70
- "truncated": 40,
71
- "non_truncated": 0,
72
- "padded": 32,
73
- "non_padded": 8,
74
- "effective_few_shots": 0.0,
75
- "num_truncated_few_shots": 0
76
- }
77
- },
78
- "summary_general": {
79
- "hashes": {
80
- "hash_examples": "4c81d27cfdb9d737",
81
- "hash_full_prompts": "0d4787c840fc98b7",
82
- "hash_input_tokens": "7e52a8d7f3e15dcc",
83
- "hash_cont_tokens": "731eb081fd0d7aad"
84
- },
85
- "truncated": 40,
86
- "non_truncated": 0,
87
- "padded": 32,
88
- "non_padded": 8,
89
- "num_truncated_few_shots": 0
90
- }
91
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.12/aimo_kaggle/results_2024-06-02T11-36-26.669791.json DELETED
@@ -1,91 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": 4,
6
- "max_samples": null,
7
- "job_id": "",
8
- "start_time": 1863225.455479811,
9
- "end_time": 1863567.627801796,
10
- "total_evaluation_time_secondes": "342.17232198501006",
11
- "model_name": "AI-MO/CodeLlama-13b-Python-hf-sft",
12
- "model_sha": "0a2526a35e7f08f87c44b3c72e6967a772ac0df5",
13
- "model_dtype": "torch.bfloat16",
14
- "model_size": "24.56 GB",
15
- "config": null
16
- },
17
- "results": {
18
- "custom|aimo_kaggle:v0|0": {
19
- "qem": 0.16,
20
- "qem_stderr": 0.052372293656638154
21
- },
22
- "all": {
23
- "qem": 0.16,
24
- "qem_stderr": 0.052372293656638154
25
- }
26
- },
27
- "versions": {
28
- "custom|aimo_kaggle:v0|0": 0
29
- },
30
- "config_tasks": {
31
- "custom|aimo_kaggle:v0": {
32
- "name": "aimo_kaggle:v0",
33
- "prompt_function": "kaggle_prompt_fn",
34
- "hf_repo": "AI-MO/kaggle-validation-set",
35
- "hf_subset": "v0",
36
- "metric": [
37
- "quasi_exact_match_math"
38
- ],
39
- "hf_avail_splits": [
40
- "train"
41
- ],
42
- "evaluation_splits": [
43
- "train"
44
- ],
45
- "few_shots_split": null,
46
- "few_shots_select": null,
47
- "generation_size": 2048,
48
- "stop_sequence": null,
49
- "output_regex": null,
50
- "num_samples": null,
51
- "frozen": false,
52
- "suite": [
53
- "custom"
54
- ],
55
- "original_num_docs": 50,
56
- "effective_num_docs": 50,
57
- "trust_dataset": null,
58
- "must_remove_duplicate_docs": null,
59
- "version": 0
60
- }
61
- },
62
- "summary_tasks": {
63
- "custom|aimo_kaggle:v0|0": {
64
- "hashes": {
65
- "hash_examples": "2a919e6b839e921a",
66
- "hash_full_prompts": "e0424ade9e31a0fe",
67
- "hash_input_tokens": "437c7d81b73a6e7a",
68
- "hash_cont_tokens": "6278007048507ffd"
69
- },
70
- "truncated": 47,
71
- "non_truncated": 3,
72
- "padded": 34,
73
- "non_padded": 16,
74
- "effective_few_shots": 0.0,
75
- "num_truncated_few_shots": 0
76
- }
77
- },
78
- "summary_general": {
79
- "hashes": {
80
- "hash_examples": "f59ec6c4e24c1226",
81
- "hash_full_prompts": "35e1e25418ccb6f7",
82
- "hash_input_tokens": "c227167cd34ee1ce",
83
- "hash_cont_tokens": "92f875343d2ff506"
84
- },
85
- "truncated": 47,
86
- "non_truncated": 3,
87
- "padded": 34,
88
- "non_padded": 16,
89
- "num_truncated_few_shots": 0
90
- }
91
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.12/aimo_kaggle_hard/results_2024-06-02T11-51-16.456597.json DELETED
@@ -1,91 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": 4,
6
- "max_samples": null,
7
- "job_id": "",
8
- "start_time": 1864038.522646221,
9
- "end_time": 1864457.414652514,
10
- "total_evaluation_time_secondes": "418.8920062929392",
11
- "model_name": "AI-MO/CodeLlama-13b-Python-hf-sft",
12
- "model_sha": "0a2526a35e7f08f87c44b3c72e6967a772ac0df5",
13
- "model_dtype": "torch.bfloat16",
14
- "model_size": "24.56 GB",
15
- "config": null
16
- },
17
- "results": {
18
- "custom|aimo_kaggle_hard:v0|0": {
19
- "qem": 0.02,
20
- "qem_stderr": 0.01999999999999998
21
- },
22
- "all": {
23
- "qem": 0.02,
24
- "qem_stderr": 0.01999999999999998
25
- }
26
- },
27
- "versions": {
28
- "custom|aimo_kaggle_hard:v0|0": 0
29
- },
30
- "config_tasks": {
31
- "custom|aimo_kaggle_hard:v0": {
32
- "name": "aimo_kaggle_hard:v0",
33
- "prompt_function": "kaggle_hard_prompt_fn",
34
- "hf_repo": "AI-MO/kaggle-validation-set-hard",
35
- "hf_subset": "v0",
36
- "metric": [
37
- "quasi_exact_match_math"
38
- ],
39
- "hf_avail_splits": [
40
- "train"
41
- ],
42
- "evaluation_splits": [
43
- "train"
44
- ],
45
- "few_shots_split": null,
46
- "few_shots_select": null,
47
- "generation_size": 2048,
48
- "stop_sequence": null,
49
- "output_regex": null,
50
- "num_samples": null,
51
- "frozen": false,
52
- "suite": [
53
- "custom"
54
- ],
55
- "original_num_docs": 50,
56
- "effective_num_docs": 50,
57
- "trust_dataset": null,
58
- "must_remove_duplicate_docs": null,
59
- "version": 0
60
- }
61
- },
62
- "summary_tasks": {
63
- "custom|aimo_kaggle_hard:v0|0": {
64
- "hashes": {
65
- "hash_examples": "b40b6a493a95bf77",
66
- "hash_full_prompts": "2c0f18269c13af34",
67
- "hash_input_tokens": "b477f3b96ee9e7ba",
68
- "hash_cont_tokens": "dbfa4de9698673b9"
69
- },
70
- "truncated": 50,
71
- "non_truncated": 0,
72
- "padded": 37,
73
- "non_padded": 13,
74
- "effective_few_shots": 0.0,
75
- "num_truncated_few_shots": 0
76
- }
77
- },
78
- "summary_general": {
79
- "hashes": {
80
- "hash_examples": "79dbebcff6acad9e",
81
- "hash_full_prompts": "739e6bdec3671fc6",
82
- "hash_input_tokens": "16e2e8b5ca238d88",
83
- "hash_cont_tokens": "3a76011d7952645f"
84
- },
85
- "truncated": 50,
86
- "non_truncated": 0,
87
- "padded": 37,
88
- "non_padded": 13,
89
- "num_truncated_few_shots": 0
90
- }
91
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.12/aimo_kaggle_medium/results_2024-06-02T11-43-17.517072.json DELETED
@@ -1,91 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": 4,
6
- "max_samples": null,
7
- "job_id": "",
8
- "start_time": 1863632.549851687,
9
- "end_time": 1863978.475093559,
10
- "total_evaluation_time_secondes": "345.9252418719698",
11
- "model_name": "AI-MO/CodeLlama-13b-Python-hf-sft",
12
- "model_sha": "0a2526a35e7f08f87c44b3c72e6967a772ac0df5",
13
- "model_dtype": "torch.bfloat16",
14
- "model_size": "24.56 GB",
15
- "config": null
16
- },
17
- "results": {
18
- "custom|aimo_kaggle_medium:v0|0": {
19
- "qem": 0.125,
20
- "qem_stderr": 0.05295740910852021
21
- },
22
- "all": {
23
- "qem": 0.125,
24
- "qem_stderr": 0.05295740910852021
25
- }
26
- },
27
- "versions": {
28
- "custom|aimo_kaggle_medium:v0|0": 0
29
- },
30
- "config_tasks": {
31
- "custom|aimo_kaggle_medium:v0": {
32
- "name": "aimo_kaggle_medium:v0",
33
- "prompt_function": "kaggle_medium_prompt_fn",
34
- "hf_repo": "AI-MO/kaggle-validation-set-medium",
35
- "hf_subset": "v0",
36
- "metric": [
37
- "quasi_exact_match_math"
38
- ],
39
- "hf_avail_splits": [
40
- "train"
41
- ],
42
- "evaluation_splits": [
43
- "train"
44
- ],
45
- "few_shots_split": null,
46
- "few_shots_select": null,
47
- "generation_size": 2048,
48
- "stop_sequence": null,
49
- "output_regex": null,
50
- "num_samples": null,
51
- "frozen": false,
52
- "suite": [
53
- "custom"
54
- ],
55
- "original_num_docs": 40,
56
- "effective_num_docs": 40,
57
- "trust_dataset": null,
58
- "must_remove_duplicate_docs": null,
59
- "version": 0
60
- }
61
- },
62
- "summary_tasks": {
63
- "custom|aimo_kaggle_medium:v0|0": {
64
- "hashes": {
65
- "hash_examples": "3401efda8b0cbcb5",
66
- "hash_full_prompts": "14cb6646c78f810c",
67
- "hash_input_tokens": "d1d016040eb15856",
68
- "hash_cont_tokens": "b150afe6ebecd8a4"
69
- },
70
- "truncated": 40,
71
- "non_truncated": 0,
72
- "padded": 32,
73
- "non_padded": 8,
74
- "effective_few_shots": 0.0,
75
- "num_truncated_few_shots": 0
76
- }
77
- },
78
- "summary_general": {
79
- "hashes": {
80
- "hash_examples": "4c81d27cfdb9d737",
81
- "hash_full_prompts": "0d4787c840fc98b7",
82
- "hash_input_tokens": "7e52a8d7f3e15dcc",
83
- "hash_cont_tokens": "559308231098c15a"
84
- },
85
- "truncated": 40,
86
- "non_truncated": 0,
87
- "padded": 32,
88
- "non_padded": 8,
89
- "num_truncated_few_shots": 0
90
- }
91
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.13/aimo_kaggle/results_2024-06-02T19-00-04.484770.json DELETED
@@ -1,91 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": 4,
6
- "max_samples": null,
7
- "job_id": "",
8
- "start_time": 808881.785721693,
9
- "end_time": 809283.524944443,
10
- "total_evaluation_time_secondes": "401.7392227500677",
11
- "model_name": "AI-MO/CodeLlama-13b-Python-hf-sft",
12
- "model_sha": "543fbff304cd3d8870b73625c4df56d0bfc62625",
13
- "model_dtype": "torch.bfloat16",
14
- "model_size": "24.56 GB",
15
- "config": null
16
- },
17
- "results": {
18
- "custom|aimo_kaggle:v0|0": {
19
- "qem": 0.14,
20
- "qem_stderr": 0.0495695759225642
21
- },
22
- "all": {
23
- "qem": 0.14,
24
- "qem_stderr": 0.0495695759225642
25
- }
26
- },
27
- "versions": {
28
- "custom|aimo_kaggle:v0|0": 0
29
- },
30
- "config_tasks": {
31
- "custom|aimo_kaggle:v0": {
32
- "name": "aimo_kaggle:v0",
33
- "prompt_function": "kaggle_prompt_fn",
34
- "hf_repo": "AI-MO/kaggle-validation-set",
35
- "hf_subset": "v0",
36
- "metric": [
37
- "quasi_exact_match_math"
38
- ],
39
- "hf_avail_splits": [
40
- "train"
41
- ],
42
- "evaluation_splits": [
43
- "train"
44
- ],
45
- "few_shots_split": null,
46
- "few_shots_select": null,
47
- "generation_size": 2048,
48
- "stop_sequence": null,
49
- "output_regex": null,
50
- "num_samples": null,
51
- "frozen": false,
52
- "suite": [
53
- "custom"
54
- ],
55
- "original_num_docs": 50,
56
- "effective_num_docs": 50,
57
- "trust_dataset": null,
58
- "must_remove_duplicate_docs": null,
59
- "version": 0
60
- }
61
- },
62
- "summary_tasks": {
63
- "custom|aimo_kaggle:v0|0": {
64
- "hashes": {
65
- "hash_examples": "2a919e6b839e921a",
66
- "hash_full_prompts": "e0424ade9e31a0fe",
67
- "hash_input_tokens": "437c7d81b73a6e7a",
68
- "hash_cont_tokens": "dbdda2820e4c8275"
69
- },
70
- "truncated": 47,
71
- "non_truncated": 3,
72
- "padded": 34,
73
- "non_padded": 16,
74
- "effective_few_shots": 0.0,
75
- "num_truncated_few_shots": 0
76
- }
77
- },
78
- "summary_general": {
79
- "hashes": {
80
- "hash_examples": "f59ec6c4e24c1226",
81
- "hash_full_prompts": "35e1e25418ccb6f7",
82
- "hash_input_tokens": "c227167cd34ee1ce",
83
- "hash_cont_tokens": "f9eb2e1b31a88440"
84
- },
85
- "truncated": 47,
86
- "non_truncated": 3,
87
- "padded": 34,
88
- "non_padded": 16,
89
- "num_truncated_few_shots": 0
90
- }
91
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.13/aimo_kaggle_hard/results_2024-06-02T19-02-41.193644.json DELETED
@@ -1,91 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": 4,
6
- "max_samples": null,
7
- "job_id": "",
8
- "start_time": 681239.591017876,
9
- "end_time": 681639.172371238,
10
- "total_evaluation_time_secondes": "399.5813533619512",
11
- "model_name": "AI-MO/CodeLlama-13b-Python-hf-sft",
12
- "model_sha": "543fbff304cd3d8870b73625c4df56d0bfc62625",
13
- "model_dtype": "torch.bfloat16",
14
- "model_size": "24.56 GB",
15
- "config": null
16
- },
17
- "results": {
18
- "custom|aimo_kaggle_hard:v0|0": {
19
- "qem": 0.0,
20
- "qem_stderr": 0.0
21
- },
22
- "all": {
23
- "qem": 0.0,
24
- "qem_stderr": 0.0
25
- }
26
- },
27
- "versions": {
28
- "custom|aimo_kaggle_hard:v0|0": 0
29
- },
30
- "config_tasks": {
31
- "custom|aimo_kaggle_hard:v0": {
32
- "name": "aimo_kaggle_hard:v0",
33
- "prompt_function": "kaggle_hard_prompt_fn",
34
- "hf_repo": "AI-MO/kaggle-validation-set-hard",
35
- "hf_subset": "v0",
36
- "metric": [
37
- "quasi_exact_match_math"
38
- ],
39
- "hf_avail_splits": [
40
- "train"
41
- ],
42
- "evaluation_splits": [
43
- "train"
44
- ],
45
- "few_shots_split": null,
46
- "few_shots_select": null,
47
- "generation_size": 2048,
48
- "stop_sequence": null,
49
- "output_regex": null,
50
- "num_samples": null,
51
- "frozen": false,
52
- "suite": [
53
- "custom"
54
- ],
55
- "original_num_docs": 50,
56
- "effective_num_docs": 50,
57
- "trust_dataset": null,
58
- "must_remove_duplicate_docs": null,
59
- "version": 0
60
- }
61
- },
62
- "summary_tasks": {
63
- "custom|aimo_kaggle_hard:v0|0": {
64
- "hashes": {
65
- "hash_examples": "b40b6a493a95bf77",
66
- "hash_full_prompts": "2c0f18269c13af34",
67
- "hash_input_tokens": "b477f3b96ee9e7ba",
68
- "hash_cont_tokens": "6070b3faf401b19a"
69
- },
70
- "truncated": 50,
71
- "non_truncated": 0,
72
- "padded": 37,
73
- "non_padded": 13,
74
- "effective_few_shots": 0.0,
75
- "num_truncated_few_shots": 0
76
- }
77
- },
78
- "summary_general": {
79
- "hashes": {
80
- "hash_examples": "79dbebcff6acad9e",
81
- "hash_full_prompts": "739e6bdec3671fc6",
82
- "hash_input_tokens": "16e2e8b5ca238d88",
83
- "hash_cont_tokens": "85f58fce9bbc4ee6"
84
- },
85
- "truncated": 50,
86
- "non_truncated": 0,
87
- "padded": 37,
88
- "non_padded": 13,
89
- "num_truncated_few_shots": 0
90
- }
91
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.13/aimo_kaggle_medium/results_2024-06-02T18-59-17.155994.json DELETED
@@ -1,91 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": 4,
6
- "max_samples": null,
7
- "job_id": "",
8
- "start_time": 406614.728272359,
9
- "end_time": 406877.13152056,
10
- "total_evaluation_time_secondes": "262.40324820100795",
11
- "model_name": "AI-MO/CodeLlama-13b-Python-hf-sft",
12
- "model_sha": "543fbff304cd3d8870b73625c4df56d0bfc62625",
13
- "model_dtype": "torch.bfloat16",
14
- "model_size": "24.56 GB",
15
- "config": null
16
- },
17
- "results": {
18
- "custom|aimo_kaggle_medium:v0|0": {
19
- "qem": 0.175,
20
- "qem_stderr": 0.060843430844447564
21
- },
22
- "all": {
23
- "qem": 0.175,
24
- "qem_stderr": 0.060843430844447564
25
- }
26
- },
27
- "versions": {
28
- "custom|aimo_kaggle_medium:v0|0": 0
29
- },
30
- "config_tasks": {
31
- "custom|aimo_kaggle_medium:v0": {
32
- "name": "aimo_kaggle_medium:v0",
33
- "prompt_function": "kaggle_medium_prompt_fn",
34
- "hf_repo": "AI-MO/kaggle-validation-set-medium",
35
- "hf_subset": "v0",
36
- "metric": [
37
- "quasi_exact_match_math"
38
- ],
39
- "hf_avail_splits": [
40
- "train"
41
- ],
42
- "evaluation_splits": [
43
- "train"
44
- ],
45
- "few_shots_split": null,
46
- "few_shots_select": null,
47
- "generation_size": 2048,
48
- "stop_sequence": null,
49
- "output_regex": null,
50
- "num_samples": null,
51
- "frozen": false,
52
- "suite": [
53
- "custom"
54
- ],
55
- "original_num_docs": 40,
56
- "effective_num_docs": 40,
57
- "trust_dataset": null,
58
- "must_remove_duplicate_docs": null,
59
- "version": 0
60
- }
61
- },
62
- "summary_tasks": {
63
- "custom|aimo_kaggle_medium:v0|0": {
64
- "hashes": {
65
- "hash_examples": "3401efda8b0cbcb5",
66
- "hash_full_prompts": "14cb6646c78f810c",
67
- "hash_input_tokens": "d1d016040eb15856",
68
- "hash_cont_tokens": "26524b9433a76f5c"
69
- },
70
- "truncated": 40,
71
- "non_truncated": 0,
72
- "padded": 32,
73
- "non_padded": 8,
74
- "effective_few_shots": 0.0,
75
- "num_truncated_few_shots": 0
76
- }
77
- },
78
- "summary_general": {
79
- "hashes": {
80
- "hash_examples": "4c81d27cfdb9d737",
81
- "hash_full_prompts": "0d4787c840fc98b7",
82
- "hash_input_tokens": "7e52a8d7f3e15dcc",
83
- "hash_cont_tokens": "dfb9879d03922d24"
84
- },
85
- "truncated": 40,
86
- "non_truncated": 0,
87
- "padded": 32,
88
- "non_padded": 8,
89
- "num_truncated_few_shots": 0
90
- }
91
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.14/aimo_kaggle/results_2024-06-02T19-46-44.922456.json DELETED
@@ -1,91 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": 4,
6
- "max_samples": null,
7
- "job_id": "",
8
- "start_time": 811758.65911061,
9
- "end_time": 812083.962635848,
10
- "total_evaluation_time_secondes": "325.30352523794863",
11
- "model_name": "AI-MO/CodeLlama-13b-Python-hf-sft",
12
- "model_sha": "a29e4a89008bd02e08db5ccc1ee78b2d6876156c",
13
- "model_dtype": "torch.bfloat16",
14
- "model_size": "24.56 GB",
15
- "config": null
16
- },
17
- "results": {
18
- "custom|aimo_kaggle:v0|0": {
19
- "qem": 0.16,
20
- "qem_stderr": 0.05237229365663814
21
- },
22
- "all": {
23
- "qem": 0.16,
24
- "qem_stderr": 0.05237229365663814
25
- }
26
- },
27
- "versions": {
28
- "custom|aimo_kaggle:v0|0": 0
29
- },
30
- "config_tasks": {
31
- "custom|aimo_kaggle:v0": {
32
- "name": "aimo_kaggle:v0",
33
- "prompt_function": "kaggle_prompt_fn",
34
- "hf_repo": "AI-MO/kaggle-validation-set",
35
- "hf_subset": "v0",
36
- "metric": [
37
- "quasi_exact_match_math"
38
- ],
39
- "hf_avail_splits": [
40
- "train"
41
- ],
42
- "evaluation_splits": [
43
- "train"
44
- ],
45
- "few_shots_split": null,
46
- "few_shots_select": null,
47
- "generation_size": 2048,
48
- "stop_sequence": null,
49
- "output_regex": null,
50
- "num_samples": null,
51
- "frozen": false,
52
- "suite": [
53
- "custom"
54
- ],
55
- "original_num_docs": 50,
56
- "effective_num_docs": 50,
57
- "trust_dataset": null,
58
- "must_remove_duplicate_docs": null,
59
- "version": 0
60
- }
61
- },
62
- "summary_tasks": {
63
- "custom|aimo_kaggle:v0|0": {
64
- "hashes": {
65
- "hash_examples": "2a919e6b839e921a",
66
- "hash_full_prompts": "e0424ade9e31a0fe",
67
- "hash_input_tokens": "437c7d81b73a6e7a",
68
- "hash_cont_tokens": "e7249b9ccabf4cbb"
69
- },
70
- "truncated": 47,
71
- "non_truncated": 3,
72
- "padded": 34,
73
- "non_padded": 16,
74
- "effective_few_shots": 0.0,
75
- "num_truncated_few_shots": 0
76
- }
77
- },
78
- "summary_general": {
79
- "hashes": {
80
- "hash_examples": "f59ec6c4e24c1226",
81
- "hash_full_prompts": "35e1e25418ccb6f7",
82
- "hash_input_tokens": "c227167cd34ee1ce",
83
- "hash_cont_tokens": "a806236f114af8f4"
84
- },
85
- "truncated": 47,
86
- "non_truncated": 3,
87
- "padded": 34,
88
- "non_padded": 16,
89
- "num_truncated_few_shots": 0
90
- }
91
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.14/aimo_kaggle_hard/results_2024-06-02T19-53-35.519132.json DELETED
@@ -1,91 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": 4,
6
- "max_samples": null,
7
- "job_id": "",
8
- "start_time": 812134.32876156,
9
- "end_time": 812494.559315884,
10
- "total_evaluation_time_secondes": "360.2305543239927",
11
- "model_name": "AI-MO/CodeLlama-13b-Python-hf-sft",
12
- "model_sha": "a29e4a89008bd02e08db5ccc1ee78b2d6876156c",
13
- "model_dtype": "torch.bfloat16",
14
- "model_size": "24.56 GB",
15
- "config": null
16
- },
17
- "results": {
18
- "custom|aimo_kaggle_hard:v0|0": {
19
- "qem": 0.02,
20
- "qem_stderr": 0.01999999999999999
21
- },
22
- "all": {
23
- "qem": 0.02,
24
- "qem_stderr": 0.01999999999999999
25
- }
26
- },
27
- "versions": {
28
- "custom|aimo_kaggle_hard:v0|0": 0
29
- },
30
- "config_tasks": {
31
- "custom|aimo_kaggle_hard:v0": {
32
- "name": "aimo_kaggle_hard:v0",
33
- "prompt_function": "kaggle_hard_prompt_fn",
34
- "hf_repo": "AI-MO/kaggle-validation-set-hard",
35
- "hf_subset": "v0",
36
- "metric": [
37
- "quasi_exact_match_math"
38
- ],
39
- "hf_avail_splits": [
40
- "train"
41
- ],
42
- "evaluation_splits": [
43
- "train"
44
- ],
45
- "few_shots_split": null,
46
- "few_shots_select": null,
47
- "generation_size": 2048,
48
- "stop_sequence": null,
49
- "output_regex": null,
50
- "num_samples": null,
51
- "frozen": false,
52
- "suite": [
53
- "custom"
54
- ],
55
- "original_num_docs": 50,
56
- "effective_num_docs": 50,
57
- "trust_dataset": null,
58
- "must_remove_duplicate_docs": null,
59
- "version": 0
60
- }
61
- },
62
- "summary_tasks": {
63
- "custom|aimo_kaggle_hard:v0|0": {
64
- "hashes": {
65
- "hash_examples": "b40b6a493a95bf77",
66
- "hash_full_prompts": "2c0f18269c13af34",
67
- "hash_input_tokens": "b477f3b96ee9e7ba",
68
- "hash_cont_tokens": "5612d03fc385210d"
69
- },
70
- "truncated": 50,
71
- "non_truncated": 0,
72
- "padded": 37,
73
- "non_padded": 13,
74
- "effective_few_shots": 0.0,
75
- "num_truncated_few_shots": 0
76
- }
77
- },
78
- "summary_general": {
79
- "hashes": {
80
- "hash_examples": "79dbebcff6acad9e",
81
- "hash_full_prompts": "739e6bdec3671fc6",
82
- "hash_input_tokens": "16e2e8b5ca238d88",
83
- "hash_cont_tokens": "fd9f7a9cdfcdbe06"
84
- },
85
- "truncated": 50,
86
- "non_truncated": 0,
87
- "padded": 37,
88
- "non_padded": 13,
89
- "num_truncated_few_shots": 0
90
- }
91
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.14/aimo_kaggle_medium/results_2024-06-02T19-47-54.814443.json DELETED
@@ -1,91 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": 4,
6
- "max_samples": null,
7
- "job_id": "",
8
- "start_time": 409522.916265881,
9
- "end_time": 409794.789955631,
10
- "total_evaluation_time_secondes": "271.8736897500348",
11
- "model_name": "AI-MO/CodeLlama-13b-Python-hf-sft",
12
- "model_sha": "a29e4a89008bd02e08db5ccc1ee78b2d6876156c",
13
- "model_dtype": "torch.bfloat16",
14
- "model_size": "24.56 GB",
15
- "config": null
16
- },
17
- "results": {
18
- "custom|aimo_kaggle_medium:v0|0": {
19
- "qem": 0.1,
20
- "qem_stderr": 0.04803844614152611
21
- },
22
- "all": {
23
- "qem": 0.1,
24
- "qem_stderr": 0.04803844614152611
25
- }
26
- },
27
- "versions": {
28
- "custom|aimo_kaggle_medium:v0|0": 0
29
- },
30
- "config_tasks": {
31
- "custom|aimo_kaggle_medium:v0": {
32
- "name": "aimo_kaggle_medium:v0",
33
- "prompt_function": "kaggle_medium_prompt_fn",
34
- "hf_repo": "AI-MO/kaggle-validation-set-medium",
35
- "hf_subset": "v0",
36
- "metric": [
37
- "quasi_exact_match_math"
38
- ],
39
- "hf_avail_splits": [
40
- "train"
41
- ],
42
- "evaluation_splits": [
43
- "train"
44
- ],
45
- "few_shots_split": null,
46
- "few_shots_select": null,
47
- "generation_size": 2048,
48
- "stop_sequence": null,
49
- "output_regex": null,
50
- "num_samples": null,
51
- "frozen": false,
52
- "suite": [
53
- "custom"
54
- ],
55
- "original_num_docs": 40,
56
- "effective_num_docs": 40,
57
- "trust_dataset": null,
58
- "must_remove_duplicate_docs": null,
59
- "version": 0
60
- }
61
- },
62
- "summary_tasks": {
63
- "custom|aimo_kaggle_medium:v0|0": {
64
- "hashes": {
65
- "hash_examples": "3401efda8b0cbcb5",
66
- "hash_full_prompts": "14cb6646c78f810c",
67
- "hash_input_tokens": "d1d016040eb15856",
68
- "hash_cont_tokens": "cc1bb393cf773014"
69
- },
70
- "truncated": 40,
71
- "non_truncated": 0,
72
- "padded": 32,
73
- "non_padded": 8,
74
- "effective_few_shots": 0.0,
75
- "num_truncated_few_shots": 0
76
- }
77
- },
78
- "summary_general": {
79
- "hashes": {
80
- "hash_examples": "4c81d27cfdb9d737",
81
- "hash_full_prompts": "0d4787c840fc98b7",
82
- "hash_input_tokens": "7e52a8d7f3e15dcc",
83
- "hash_cont_tokens": "9562fbfdfdcab3c5"
84
- },
85
- "truncated": 40,
86
- "non_truncated": 0,
87
- "padded": 32,
88
- "non_padded": 8,
89
- "num_truncated_few_shots": 0
90
- }
91
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.16/aimo_kaggle/results_2024-06-01T04-42-22.110150.json DELETED
@@ -1,91 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": 4,
6
- "max_samples": null,
7
- "job_id": "",
8
- "start_time": 7331438.041597982,
9
- "end_time": 7331896.576992026,
10
- "total_evaluation_time_secondes": "458.53539404366165",
11
- "model_name": "AI-MO/CodeLlama-13b-Python-hf-sft",
12
- "model_sha": "2c45107722078c8a46415bef66119a706c9faa79",
13
- "model_dtype": "torch.bfloat16",
14
- "model_size": "24.56 GB",
15
- "config": null
16
- },
17
- "results": {
18
- "custom|aimo_kaggle:v0|0": {
19
- "qem": 0.18,
20
- "qem_stderr": 0.054883922035138706
21
- },
22
- "all": {
23
- "qem": 0.18,
24
- "qem_stderr": 0.054883922035138706
25
- }
26
- },
27
- "versions": {
28
- "custom|aimo_kaggle:v0|0": 0
29
- },
30
- "config_tasks": {
31
- "custom|aimo_kaggle:v0": {
32
- "name": "aimo_kaggle:v0",
33
- "prompt_function": "kaggle_prompt_fn",
34
- "hf_repo": "AI-MO/kaggle-validation-set",
35
- "hf_subset": "v0",
36
- "metric": [
37
- "quasi_exact_match_math"
38
- ],
39
- "hf_avail_splits": [
40
- "train"
41
- ],
42
- "evaluation_splits": [
43
- "train"
44
- ],
45
- "few_shots_split": null,
46
- "few_shots_select": null,
47
- "generation_size": 2048,
48
- "stop_sequence": null,
49
- "output_regex": null,
50
- "num_samples": null,
51
- "frozen": false,
52
- "suite": [
53
- "custom"
54
- ],
55
- "original_num_docs": 50,
56
- "effective_num_docs": 50,
57
- "trust_dataset": null,
58
- "must_remove_duplicate_docs": null,
59
- "version": 0
60
- }
61
- },
62
- "summary_tasks": {
63
- "custom|aimo_kaggle:v0|0": {
64
- "hashes": {
65
- "hash_examples": "2a919e6b839e921a",
66
- "hash_full_prompts": "e0424ade9e31a0fe",
67
- "hash_input_tokens": "437c7d81b73a6e7a",
68
- "hash_cont_tokens": "bb851051e63db43e"
69
- },
70
- "truncated": 47,
71
- "non_truncated": 3,
72
- "padded": 34,
73
- "non_padded": 16,
74
- "effective_few_shots": 0.0,
75
- "num_truncated_few_shots": 0
76
- }
77
- },
78
- "summary_general": {
79
- "hashes": {
80
- "hash_examples": "f59ec6c4e24c1226",
81
- "hash_full_prompts": "35e1e25418ccb6f7",
82
- "hash_input_tokens": "c227167cd34ee1ce",
83
- "hash_cont_tokens": "f4957475e545e721"
84
- },
85
- "truncated": 47,
86
- "non_truncated": 3,
87
- "padded": 34,
88
- "non_padded": 16,
89
- "num_truncated_few_shots": 0
90
- }
91
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.16/aimo_kaggle_hard/results_2024-06-01T04-50-33.000483.json DELETED
@@ -1,91 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": 4,
6
- "max_samples": null,
7
- "job_id": "",
8
- "start_time": 2627042.726022085,
9
- "end_time": 2627480.073880186,
10
- "total_evaluation_time_secondes": "437.34785810066387",
11
- "model_name": "AI-MO/CodeLlama-13b-Python-hf-sft",
12
- "model_sha": "2c45107722078c8a46415bef66119a706c9faa79",
13
- "model_dtype": "torch.bfloat16",
14
- "model_size": "24.56 GB",
15
- "config": null
16
- },
17
- "results": {
18
- "custom|aimo_kaggle_hard:v0|0": {
19
- "qem": 0.0,
20
- "qem_stderr": 0.0
21
- },
22
- "all": {
23
- "qem": 0.0,
24
- "qem_stderr": 0.0
25
- }
26
- },
27
- "versions": {
28
- "custom|aimo_kaggle_hard:v0|0": 0
29
- },
30
- "config_tasks": {
31
- "custom|aimo_kaggle_hard:v0": {
32
- "name": "aimo_kaggle_hard:v0",
33
- "prompt_function": "kaggle_hard_prompt_fn",
34
- "hf_repo": "AI-MO/kaggle-validation-set-hard",
35
- "hf_subset": "v0",
36
- "metric": [
37
- "quasi_exact_match_math"
38
- ],
39
- "hf_avail_splits": [
40
- "train"
41
- ],
42
- "evaluation_splits": [
43
- "train"
44
- ],
45
- "few_shots_split": null,
46
- "few_shots_select": null,
47
- "generation_size": 2048,
48
- "stop_sequence": null,
49
- "output_regex": null,
50
- "num_samples": null,
51
- "frozen": false,
52
- "suite": [
53
- "custom"
54
- ],
55
- "original_num_docs": 50,
56
- "effective_num_docs": 50,
57
- "trust_dataset": null,
58
- "must_remove_duplicate_docs": null,
59
- "version": 0
60
- }
61
- },
62
- "summary_tasks": {
63
- "custom|aimo_kaggle_hard:v0|0": {
64
- "hashes": {
65
- "hash_examples": "b40b6a493a95bf77",
66
- "hash_full_prompts": "2c0f18269c13af34",
67
- "hash_input_tokens": "b477f3b96ee9e7ba",
68
- "hash_cont_tokens": "38c1f1b38edace2d"
69
- },
70
- "truncated": 50,
71
- "non_truncated": 0,
72
- "padded": 37,
73
- "non_padded": 13,
74
- "effective_few_shots": 0.0,
75
- "num_truncated_few_shots": 0
76
- }
77
- },
78
- "summary_general": {
79
- "hashes": {
80
- "hash_examples": "79dbebcff6acad9e",
81
- "hash_full_prompts": "739e6bdec3671fc6",
82
- "hash_input_tokens": "16e2e8b5ca238d88",
83
- "hash_cont_tokens": "7d20c45af46524f5"
84
- },
85
- "truncated": 50,
86
- "non_truncated": 0,
87
- "padded": 37,
88
- "non_padded": 13,
89
- "num_truncated_few_shots": 0
90
- }
91
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.16/aimo_kaggle_medium/results_2024-06-01T04-44-46.129536.json DELETED
@@ -1,91 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": 4,
6
- "max_samples": null,
7
- "job_id": "",
8
- "start_time": 2435.203182629,
9
- "end_time": 2801.204239883,
10
- "total_evaluation_time_secondes": "366.001057254",
11
- "model_name": "AI-MO/CodeLlama-13b-Python-hf-sft",
12
- "model_sha": "2c45107722078c8a46415bef66119a706c9faa79",
13
- "model_dtype": "torch.bfloat16",
14
- "model_size": "24.56 GB",
15
- "config": null
16
- },
17
- "results": {
18
- "custom|aimo_kaggle_medium:v0|0": {
19
- "qem": 0.1,
20
- "qem_stderr": 0.04803844614152612
21
- },
22
- "all": {
23
- "qem": 0.1,
24
- "qem_stderr": 0.04803844614152612
25
- }
26
- },
27
- "versions": {
28
- "custom|aimo_kaggle_medium:v0|0": 0
29
- },
30
- "config_tasks": {
31
- "custom|aimo_kaggle_medium:v0": {
32
- "name": "aimo_kaggle_medium:v0",
33
- "prompt_function": "kaggle_medium_prompt_fn",
34
- "hf_repo": "AI-MO/kaggle-validation-set-medium",
35
- "hf_subset": "v0",
36
- "metric": [
37
- "quasi_exact_match_math"
38
- ],
39
- "hf_avail_splits": [
40
- "train"
41
- ],
42
- "evaluation_splits": [
43
- "train"
44
- ],
45
- "few_shots_split": null,
46
- "few_shots_select": null,
47
- "generation_size": 2048,
48
- "stop_sequence": null,
49
- "output_regex": null,
50
- "num_samples": null,
51
- "frozen": false,
52
- "suite": [
53
- "custom"
54
- ],
55
- "original_num_docs": 40,
56
- "effective_num_docs": 40,
57
- "trust_dataset": null,
58
- "must_remove_duplicate_docs": null,
59
- "version": 0
60
- }
61
- },
62
- "summary_tasks": {
63
- "custom|aimo_kaggle_medium:v0|0": {
64
- "hashes": {
65
- "hash_examples": "3401efda8b0cbcb5",
66
- "hash_full_prompts": "14cb6646c78f810c",
67
- "hash_input_tokens": "d1d016040eb15856",
68
- "hash_cont_tokens": "001ea5cfe3a48cb3"
69
- },
70
- "truncated": 40,
71
- "non_truncated": 0,
72
- "padded": 32,
73
- "non_padded": 8,
74
- "effective_few_shots": 0.0,
75
- "num_truncated_few_shots": 0
76
- }
77
- },
78
- "summary_general": {
79
- "hashes": {
80
- "hash_examples": "4c81d27cfdb9d737",
81
- "hash_full_prompts": "0d4787c840fc98b7",
82
- "hash_input_tokens": "7e52a8d7f3e15dcc",
83
- "hash_cont_tokens": "0e69fba7a84b68a4"
84
- },
85
- "truncated": 40,
86
- "non_truncated": 0,
87
- "padded": 32,
88
- "non_padded": 8,
89
- "num_truncated_few_shots": 0
90
- }
91
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.17/aimo_kaggle/results_2024-06-01T05-55-00.892964.json DELETED
@@ -1,91 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": 4,
6
- "max_samples": null,
7
- "job_id": "",
8
- "start_time": 364786.344403821,
9
- "end_time": 365283.098496925,
10
- "total_evaluation_time_secondes": "496.7540931040421",
11
- "model_name": "AI-MO/CodeLlama-13b-Python-hf-sft",
12
- "model_sha": "f22517dd2168b7ff4f7cfe3e1d60f70c344649a6",
13
- "model_dtype": "torch.bfloat16",
14
- "model_size": "24.56 GB",
15
- "config": null
16
- },
17
- "results": {
18
- "custom|aimo_kaggle:v0|0": {
19
- "qem": 0.14,
20
- "qem_stderr": 0.0495695759225642
21
- },
22
- "all": {
23
- "qem": 0.14,
24
- "qem_stderr": 0.0495695759225642
25
- }
26
- },
27
- "versions": {
28
- "custom|aimo_kaggle:v0|0": 0
29
- },
30
- "config_tasks": {
31
- "custom|aimo_kaggle:v0": {
32
- "name": "aimo_kaggle:v0",
33
- "prompt_function": "kaggle_prompt_fn",
34
- "hf_repo": "AI-MO/kaggle-validation-set",
35
- "hf_subset": "v0",
36
- "metric": [
37
- "quasi_exact_match_math"
38
- ],
39
- "hf_avail_splits": [
40
- "train"
41
- ],
42
- "evaluation_splits": [
43
- "train"
44
- ],
45
- "few_shots_split": null,
46
- "few_shots_select": null,
47
- "generation_size": 2048,
48
- "stop_sequence": null,
49
- "output_regex": null,
50
- "num_samples": null,
51
- "frozen": false,
52
- "suite": [
53
- "custom"
54
- ],
55
- "original_num_docs": 50,
56
- "effective_num_docs": 50,
57
- "trust_dataset": null,
58
- "must_remove_duplicate_docs": null,
59
- "version": 0
60
- }
61
- },
62
- "summary_tasks": {
63
- "custom|aimo_kaggle:v0|0": {
64
- "hashes": {
65
- "hash_examples": "2a919e6b839e921a",
66
- "hash_full_prompts": "e0424ade9e31a0fe",
67
- "hash_input_tokens": "437c7d81b73a6e7a",
68
- "hash_cont_tokens": "51505d8a5b5d26ac"
69
- },
70
- "truncated": 47,
71
- "non_truncated": 3,
72
- "padded": 34,
73
- "non_padded": 16,
74
- "effective_few_shots": 0.0,
75
- "num_truncated_few_shots": 0
76
- }
77
- },
78
- "summary_general": {
79
- "hashes": {
80
- "hash_examples": "f59ec6c4e24c1226",
81
- "hash_full_prompts": "35e1e25418ccb6f7",
82
- "hash_input_tokens": "c227167cd34ee1ce",
83
- "hash_cont_tokens": "06c20ce03294da85"
84
- },
85
- "truncated": 47,
86
- "non_truncated": 3,
87
- "padded": 34,
88
- "non_padded": 16,
89
- "num_truncated_few_shots": 0
90
- }
91
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.17/aimo_kaggle_hard/results_2024-06-01T05-54-49.725003.json DELETED
@@ -1,91 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": 4,
6
- "max_samples": null,
7
- "job_id": "",
8
- "start_time": 513317.423479873,
9
- "end_time": 513769.887346333,
10
- "total_evaluation_time_secondes": "452.46386646002065",
11
- "model_name": "AI-MO/CodeLlama-13b-Python-hf-sft",
12
- "model_sha": "f22517dd2168b7ff4f7cfe3e1d60f70c344649a6",
13
- "model_dtype": "torch.bfloat16",
14
- "model_size": "24.56 GB",
15
- "config": null
16
- },
17
- "results": {
18
- "custom|aimo_kaggle_hard:v0|0": {
19
- "qem": 0.0,
20
- "qem_stderr": 0.0
21
- },
22
- "all": {
23
- "qem": 0.0,
24
- "qem_stderr": 0.0
25
- }
26
- },
27
- "versions": {
28
- "custom|aimo_kaggle_hard:v0|0": 0
29
- },
30
- "config_tasks": {
31
- "custom|aimo_kaggle_hard:v0": {
32
- "name": "aimo_kaggle_hard:v0",
33
- "prompt_function": "kaggle_hard_prompt_fn",
34
- "hf_repo": "AI-MO/kaggle-validation-set-hard",
35
- "hf_subset": "v0",
36
- "metric": [
37
- "quasi_exact_match_math"
38
- ],
39
- "hf_avail_splits": [
40
- "train"
41
- ],
42
- "evaluation_splits": [
43
- "train"
44
- ],
45
- "few_shots_split": null,
46
- "few_shots_select": null,
47
- "generation_size": 2048,
48
- "stop_sequence": null,
49
- "output_regex": null,
50
- "num_samples": null,
51
- "frozen": false,
52
- "suite": [
53
- "custom"
54
- ],
55
- "original_num_docs": 50,
56
- "effective_num_docs": 50,
57
- "trust_dataset": null,
58
- "must_remove_duplicate_docs": null,
59
- "version": 0
60
- }
61
- },
62
- "summary_tasks": {
63
- "custom|aimo_kaggle_hard:v0|0": {
64
- "hashes": {
65
- "hash_examples": "b40b6a493a95bf77",
66
- "hash_full_prompts": "2c0f18269c13af34",
67
- "hash_input_tokens": "b477f3b96ee9e7ba",
68
- "hash_cont_tokens": "4fefad91d499245e"
69
- },
70
- "truncated": 50,
71
- "non_truncated": 0,
72
- "padded": 37,
73
- "non_padded": 13,
74
- "effective_few_shots": 0.0,
75
- "num_truncated_few_shots": 0
76
- }
77
- },
78
- "summary_general": {
79
- "hashes": {
80
- "hash_examples": "79dbebcff6acad9e",
81
- "hash_full_prompts": "739e6bdec3671fc6",
82
- "hash_input_tokens": "16e2e8b5ca238d88",
83
- "hash_cont_tokens": "01b8e231916449d9"
84
- },
85
- "truncated": 50,
86
- "non_truncated": 0,
87
- "padded": 37,
88
- "non_padded": 13,
89
- "num_truncated_few_shots": 0
90
- }
91
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.17/aimo_kaggle_medium/results_2024-06-01T05-53-12.777129.json DELETED
@@ -1,91 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": 4,
6
- "max_samples": null,
7
- "job_id": "",
8
- "start_time": 383692.500491469,
9
- "end_time": 384068.635694825,
10
- "total_evaluation_time_secondes": "376.1352033559815",
11
- "model_name": "AI-MO/CodeLlama-13b-Python-hf-sft",
12
- "model_sha": "f22517dd2168b7ff4f7cfe3e1d60f70c344649a6",
13
- "model_dtype": "torch.bfloat16",
14
- "model_size": "24.56 GB",
15
- "config": null
16
- },
17
- "results": {
18
- "custom|aimo_kaggle_medium:v0|0": {
19
- "qem": 0.175,
20
- "qem_stderr": 0.060843430844447564
21
- },
22
- "all": {
23
- "qem": 0.175,
24
- "qem_stderr": 0.060843430844447564
25
- }
26
- },
27
- "versions": {
28
- "custom|aimo_kaggle_medium:v0|0": 0
29
- },
30
- "config_tasks": {
31
- "custom|aimo_kaggle_medium:v0": {
32
- "name": "aimo_kaggle_medium:v0",
33
- "prompt_function": "kaggle_medium_prompt_fn",
34
- "hf_repo": "AI-MO/kaggle-validation-set-medium",
35
- "hf_subset": "v0",
36
- "metric": [
37
- "quasi_exact_match_math"
38
- ],
39
- "hf_avail_splits": [
40
- "train"
41
- ],
42
- "evaluation_splits": [
43
- "train"
44
- ],
45
- "few_shots_split": null,
46
- "few_shots_select": null,
47
- "generation_size": 2048,
48
- "stop_sequence": null,
49
- "output_regex": null,
50
- "num_samples": null,
51
- "frozen": false,
52
- "suite": [
53
- "custom"
54
- ],
55
- "original_num_docs": 40,
56
- "effective_num_docs": 40,
57
- "trust_dataset": null,
58
- "must_remove_duplicate_docs": null,
59
- "version": 0
60
- }
61
- },
62
- "summary_tasks": {
63
- "custom|aimo_kaggle_medium:v0|0": {
64
- "hashes": {
65
- "hash_examples": "3401efda8b0cbcb5",
66
- "hash_full_prompts": "14cb6646c78f810c",
67
- "hash_input_tokens": "d1d016040eb15856",
68
- "hash_cont_tokens": "c4694693bd1eb39c"
69
- },
70
- "truncated": 40,
71
- "non_truncated": 0,
72
- "padded": 32,
73
- "non_padded": 8,
74
- "effective_few_shots": 0.0,
75
- "num_truncated_few_shots": 0
76
- }
77
- },
78
- "summary_general": {
79
- "hashes": {
80
- "hash_examples": "4c81d27cfdb9d737",
81
- "hash_full_prompts": "0d4787c840fc98b7",
82
- "hash_input_tokens": "7e52a8d7f3e15dcc",
83
- "hash_cont_tokens": "8968ca60b87b47be"
84
- },
85
- "truncated": 40,
86
- "non_truncated": 0,
87
- "padded": 32,
88
- "non_padded": 8,
89
- "num_truncated_few_shots": 0
90
- }
91
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.18/aimo_kaggle/results_2024-06-02T20-03-07.489443.json DELETED
@@ -1,91 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": 4,
6
- "max_samples": null,
7
- "job_id": "",
8
- "start_time": 280800.730639582,
9
- "end_time": 281095.450109517,
10
- "total_evaluation_time_secondes": "294.71946993505117",
11
- "model_name": "AI-MO/CodeLlama-13b-Python-hf-sft",
12
- "model_sha": "0e90404f514275e6bb51f04091fed122fd736403",
13
- "model_dtype": "torch.bfloat16",
14
- "model_size": "24.56 GB",
15
- "config": null
16
- },
17
- "results": {
18
- "custom|aimo_kaggle:v0|0": {
19
- "qem": 0.2,
20
- "qem_stderr": 0.057142857142857155
21
- },
22
- "all": {
23
- "qem": 0.2,
24
- "qem_stderr": 0.057142857142857155
25
- }
26
- },
27
- "versions": {
28
- "custom|aimo_kaggle:v0|0": 0
29
- },
30
- "config_tasks": {
31
- "custom|aimo_kaggle:v0": {
32
- "name": "aimo_kaggle:v0",
33
- "prompt_function": "kaggle_prompt_fn",
34
- "hf_repo": "AI-MO/kaggle-validation-set",
35
- "hf_subset": "v0",
36
- "metric": [
37
- "quasi_exact_match_math"
38
- ],
39
- "hf_avail_splits": [
40
- "train"
41
- ],
42
- "evaluation_splits": [
43
- "train"
44
- ],
45
- "few_shots_split": null,
46
- "few_shots_select": null,
47
- "generation_size": 2048,
48
- "stop_sequence": null,
49
- "output_regex": null,
50
- "num_samples": null,
51
- "frozen": false,
52
- "suite": [
53
- "custom"
54
- ],
55
- "original_num_docs": 50,
56
- "effective_num_docs": 50,
57
- "trust_dataset": null,
58
- "must_remove_duplicate_docs": null,
59
- "version": 0
60
- }
61
- },
62
- "summary_tasks": {
63
- "custom|aimo_kaggle:v0|0": {
64
- "hashes": {
65
- "hash_examples": "2a919e6b839e921a",
66
- "hash_full_prompts": "e0424ade9e31a0fe",
67
- "hash_input_tokens": "437c7d81b73a6e7a",
68
- "hash_cont_tokens": "85692fa117057e43"
69
- },
70
- "truncated": 47,
71
- "non_truncated": 3,
72
- "padded": 34,
73
- "non_padded": 16,
74
- "effective_few_shots": 0.0,
75
- "num_truncated_few_shots": 0
76
- }
77
- },
78
- "summary_general": {
79
- "hashes": {
80
- "hash_examples": "f59ec6c4e24c1226",
81
- "hash_full_prompts": "35e1e25418ccb6f7",
82
- "hash_input_tokens": "c227167cd34ee1ce",
83
- "hash_cont_tokens": "05e8e5f051938395"
84
- },
85
- "truncated": 47,
86
- "non_truncated": 3,
87
- "padded": 34,
88
- "non_padded": 16,
89
- "num_truncated_few_shots": 0
90
- }
91
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.18/aimo_kaggle_hard/results_2024-06-02T20-04-32.429315.json DELETED
@@ -1,91 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": 4,
6
- "max_samples": null,
7
- "job_id": "",
8
- "start_time": 812862.374495077,
9
- "end_time": 813151.469469919,
10
- "total_evaluation_time_secondes": "289.09497484203894",
11
- "model_name": "AI-MO/CodeLlama-13b-Python-hf-sft",
12
- "model_sha": "0e90404f514275e6bb51f04091fed122fd736403",
13
- "model_dtype": "torch.bfloat16",
14
- "model_size": "24.56 GB",
15
- "config": null
16
- },
17
- "results": {
18
- "custom|aimo_kaggle_hard:v0|0": {
19
- "qem": 0.0,
20
- "qem_stderr": 0.0
21
- },
22
- "all": {
23
- "qem": 0.0,
24
- "qem_stderr": 0.0
25
- }
26
- },
27
- "versions": {
28
- "custom|aimo_kaggle_hard:v0|0": 0
29
- },
30
- "config_tasks": {
31
- "custom|aimo_kaggle_hard:v0": {
32
- "name": "aimo_kaggle_hard:v0",
33
- "prompt_function": "kaggle_hard_prompt_fn",
34
- "hf_repo": "AI-MO/kaggle-validation-set-hard",
35
- "hf_subset": "v0",
36
- "metric": [
37
- "quasi_exact_match_math"
38
- ],
39
- "hf_avail_splits": [
40
- "train"
41
- ],
42
- "evaluation_splits": [
43
- "train"
44
- ],
45
- "few_shots_split": null,
46
- "few_shots_select": null,
47
- "generation_size": 2048,
48
- "stop_sequence": null,
49
- "output_regex": null,
50
- "num_samples": null,
51
- "frozen": false,
52
- "suite": [
53
- "custom"
54
- ],
55
- "original_num_docs": 50,
56
- "effective_num_docs": 50,
57
- "trust_dataset": null,
58
- "must_remove_duplicate_docs": null,
59
- "version": 0
60
- }
61
- },
62
- "summary_tasks": {
63
- "custom|aimo_kaggle_hard:v0|0": {
64
- "hashes": {
65
- "hash_examples": "b40b6a493a95bf77",
66
- "hash_full_prompts": "2c0f18269c13af34",
67
- "hash_input_tokens": "b477f3b96ee9e7ba",
68
- "hash_cont_tokens": "7d75f5cff084d1f5"
69
- },
70
- "truncated": 50,
71
- "non_truncated": 0,
72
- "padded": 37,
73
- "non_padded": 13,
74
- "effective_few_shots": 0.0,
75
- "num_truncated_few_shots": 0
76
- }
77
- },
78
- "summary_general": {
79
- "hashes": {
80
- "hash_examples": "79dbebcff6acad9e",
81
- "hash_full_prompts": "739e6bdec3671fc6",
82
- "hash_input_tokens": "16e2e8b5ca238d88",
83
- "hash_cont_tokens": "32adb50ac6b6a7ff"
84
- },
85
- "truncated": 50,
86
- "non_truncated": 0,
87
- "padded": 37,
88
- "non_padded": 13,
89
- "num_truncated_few_shots": 0
90
- }
91
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.18/aimo_kaggle_medium/results_2024-06-02T20-03-59.774755.json DELETED
@@ -1,91 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": 4,
6
- "max_samples": null,
7
- "job_id": "",
8
- "start_time": 12628.21789308,
9
- "end_time": 12893.039860276,
10
- "total_evaluation_time_secondes": "264.8219671959996",
11
- "model_name": "AI-MO/CodeLlama-13b-Python-hf-sft",
12
- "model_sha": "0e90404f514275e6bb51f04091fed122fd736403",
13
- "model_dtype": "torch.bfloat16",
14
- "model_size": "24.56 GB",
15
- "config": null
16
- },
17
- "results": {
18
- "custom|aimo_kaggle_medium:v0|0": {
19
- "qem": 0.1,
20
- "qem_stderr": 0.04803844614152612
21
- },
22
- "all": {
23
- "qem": 0.1,
24
- "qem_stderr": 0.04803844614152612
25
- }
26
- },
27
- "versions": {
28
- "custom|aimo_kaggle_medium:v0|0": 0
29
- },
30
- "config_tasks": {
31
- "custom|aimo_kaggle_medium:v0": {
32
- "name": "aimo_kaggle_medium:v0",
33
- "prompt_function": "kaggle_medium_prompt_fn",
34
- "hf_repo": "AI-MO/kaggle-validation-set-medium",
35
- "hf_subset": "v0",
36
- "metric": [
37
- "quasi_exact_match_math"
38
- ],
39
- "hf_avail_splits": [
40
- "train"
41
- ],
42
- "evaluation_splits": [
43
- "train"
44
- ],
45
- "few_shots_split": null,
46
- "few_shots_select": null,
47
- "generation_size": 2048,
48
- "stop_sequence": null,
49
- "output_regex": null,
50
- "num_samples": null,
51
- "frozen": false,
52
- "suite": [
53
- "custom"
54
- ],
55
- "original_num_docs": 40,
56
- "effective_num_docs": 40,
57
- "trust_dataset": null,
58
- "must_remove_duplicate_docs": null,
59
- "version": 0
60
- }
61
- },
62
- "summary_tasks": {
63
- "custom|aimo_kaggle_medium:v0|0": {
64
- "hashes": {
65
- "hash_examples": "3401efda8b0cbcb5",
66
- "hash_full_prompts": "14cb6646c78f810c",
67
- "hash_input_tokens": "d1d016040eb15856",
68
- "hash_cont_tokens": "f9f0938dc6f85f61"
69
- },
70
- "truncated": 40,
71
- "non_truncated": 0,
72
- "padded": 32,
73
- "non_padded": 8,
74
- "effective_few_shots": 0.0,
75
- "num_truncated_few_shots": 0
76
- }
77
- },
78
- "summary_general": {
79
- "hashes": {
80
- "hash_examples": "4c81d27cfdb9d737",
81
- "hash_full_prompts": "0d4787c840fc98b7",
82
- "hash_input_tokens": "7e52a8d7f3e15dcc",
83
- "hash_cont_tokens": "c28efa0385adfa41"
84
- },
85
- "truncated": 40,
86
- "non_truncated": 0,
87
- "padded": 32,
88
- "non_padded": 8,
89
- "num_truncated_few_shots": 0
90
- }
91
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.19/aimo_kaggle/results_2024-06-01T21-23-46.264185.json DELETED
@@ -1,91 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": 4,
6
- "max_samples": null,
7
- "job_id": "",
8
- "start_time": 455783.289288206,
9
- "end_time": 456110.027675772,
10
- "total_evaluation_time_secondes": "326.7383875660016",
11
- "model_name": "AI-MO/CodeLlama-13b-Python-hf-sft",
12
- "model_sha": "8ec05aac45adfb5a9df7fabe87e4de5c1c4fa16d",
13
- "model_dtype": "torch.bfloat16",
14
- "model_size": "24.56 GB",
15
- "config": null
16
- },
17
- "results": {
18
- "custom|aimo_kaggle:v0|0": {
19
- "qem": 0.16,
20
- "qem_stderr": 0.052372293656638154
21
- },
22
- "all": {
23
- "qem": 0.16,
24
- "qem_stderr": 0.052372293656638154
25
- }
26
- },
27
- "versions": {
28
- "custom|aimo_kaggle:v0|0": 0
29
- },
30
- "config_tasks": {
31
- "custom|aimo_kaggle:v0": {
32
- "name": "aimo_kaggle:v0",
33
- "prompt_function": "kaggle_prompt_fn",
34
- "hf_repo": "AI-MO/kaggle-validation-set",
35
- "hf_subset": "v0",
36
- "metric": [
37
- "quasi_exact_match_math"
38
- ],
39
- "hf_avail_splits": [
40
- "train"
41
- ],
42
- "evaluation_splits": [
43
- "train"
44
- ],
45
- "few_shots_split": null,
46
- "few_shots_select": null,
47
- "generation_size": 2048,
48
- "stop_sequence": null,
49
- "output_regex": null,
50
- "num_samples": null,
51
- "frozen": false,
52
- "suite": [
53
- "custom"
54
- ],
55
- "original_num_docs": 50,
56
- "effective_num_docs": 50,
57
- "trust_dataset": null,
58
- "must_remove_duplicate_docs": null,
59
- "version": 0
60
- }
61
- },
62
- "summary_tasks": {
63
- "custom|aimo_kaggle:v0|0": {
64
- "hashes": {
65
- "hash_examples": "2a919e6b839e921a",
66
- "hash_full_prompts": "e0424ade9e31a0fe",
67
- "hash_input_tokens": "437c7d81b73a6e7a",
68
- "hash_cont_tokens": "05638f479f269c23"
69
- },
70
- "truncated": 47,
71
- "non_truncated": 3,
72
- "padded": 34,
73
- "non_padded": 16,
74
- "effective_few_shots": 0.0,
75
- "num_truncated_few_shots": 0
76
- }
77
- },
78
- "summary_general": {
79
- "hashes": {
80
- "hash_examples": "f59ec6c4e24c1226",
81
- "hash_full_prompts": "35e1e25418ccb6f7",
82
- "hash_input_tokens": "c227167cd34ee1ce",
83
- "hash_cont_tokens": "5d00f919d8ec0d12"
84
- },
85
- "truncated": 47,
86
- "non_truncated": 3,
87
- "padded": 34,
88
- "non_padded": 16,
89
- "num_truncated_few_shots": 0
90
- }
91
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.19/aimo_kaggle_hard/results_2024-06-01T21-26-21.659630.json DELETED
@@ -1,91 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": 4,
6
- "max_samples": null,
7
- "job_id": "",
8
- "start_time": 1600669.946796879,
9
- "end_time": 1601058.697372188,
10
- "total_evaluation_time_secondes": "388.75057530915365",
11
- "model_name": "AI-MO/CodeLlama-13b-Python-hf-sft",
12
- "model_sha": "8ec05aac45adfb5a9df7fabe87e4de5c1c4fa16d",
13
- "model_dtype": "torch.bfloat16",
14
- "model_size": "24.56 GB",
15
- "config": null
16
- },
17
- "results": {
18
- "custom|aimo_kaggle_hard:v0|0": {
19
- "qem": 0.0,
20
- "qem_stderr": 0.0
21
- },
22
- "all": {
23
- "qem": 0.0,
24
- "qem_stderr": 0.0
25
- }
26
- },
27
- "versions": {
28
- "custom|aimo_kaggle_hard:v0|0": 0
29
- },
30
- "config_tasks": {
31
- "custom|aimo_kaggle_hard:v0": {
32
- "name": "aimo_kaggle_hard:v0",
33
- "prompt_function": "kaggle_hard_prompt_fn",
34
- "hf_repo": "AI-MO/kaggle-validation-set-hard",
35
- "hf_subset": "v0",
36
- "metric": [
37
- "quasi_exact_match_math"
38
- ],
39
- "hf_avail_splits": [
40
- "train"
41
- ],
42
- "evaluation_splits": [
43
- "train"
44
- ],
45
- "few_shots_split": null,
46
- "few_shots_select": null,
47
- "generation_size": 2048,
48
- "stop_sequence": null,
49
- "output_regex": null,
50
- "num_samples": null,
51
- "frozen": false,
52
- "suite": [
53
- "custom"
54
- ],
55
- "original_num_docs": 50,
56
- "effective_num_docs": 50,
57
- "trust_dataset": null,
58
- "must_remove_duplicate_docs": null,
59
- "version": 0
60
- }
61
- },
62
- "summary_tasks": {
63
- "custom|aimo_kaggle_hard:v0|0": {
64
- "hashes": {
65
- "hash_examples": "b40b6a493a95bf77",
66
- "hash_full_prompts": "2c0f18269c13af34",
67
- "hash_input_tokens": "b477f3b96ee9e7ba",
68
- "hash_cont_tokens": "81a96368049ca333"
69
- },
70
- "truncated": 50,
71
- "non_truncated": 0,
72
- "padded": 37,
73
- "non_padded": 13,
74
- "effective_few_shots": 0.0,
75
- "num_truncated_few_shots": 0
76
- }
77
- },
78
- "summary_general": {
79
- "hashes": {
80
- "hash_examples": "79dbebcff6acad9e",
81
- "hash_full_prompts": "739e6bdec3671fc6",
82
- "hash_input_tokens": "16e2e8b5ca238d88",
83
- "hash_cont_tokens": "48b4acdb2c0f3cfb"
84
- },
85
- "truncated": 50,
86
- "non_truncated": 0,
87
- "padded": 37,
88
- "non_padded": 13,
89
- "num_truncated_few_shots": 0
90
- }
91
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.19/aimo_kaggle_medium/results_2024-06-01T21-24-03.954253.json DELETED
@@ -1,91 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": 4,
6
- "max_samples": null,
7
- "job_id": "",
8
- "start_time": 968414.597008877,
9
- "end_time": 968705.871528091,
10
- "total_evaluation_time_secondes": "291.27451921405736",
11
- "model_name": "AI-MO/CodeLlama-13b-Python-hf-sft",
12
- "model_sha": "8ec05aac45adfb5a9df7fabe87e4de5c1c4fa16d",
13
- "model_dtype": "torch.bfloat16",
14
- "model_size": "24.56 GB",
15
- "config": null
16
- },
17
- "results": {
18
- "custom|aimo_kaggle_medium:v0|0": {
19
- "qem": 0.1,
20
- "qem_stderr": 0.04803844614152612
21
- },
22
- "all": {
23
- "qem": 0.1,
24
- "qem_stderr": 0.04803844614152612
25
- }
26
- },
27
- "versions": {
28
- "custom|aimo_kaggle_medium:v0|0": 0
29
- },
30
- "config_tasks": {
31
- "custom|aimo_kaggle_medium:v0": {
32
- "name": "aimo_kaggle_medium:v0",
33
- "prompt_function": "kaggle_medium_prompt_fn",
34
- "hf_repo": "AI-MO/kaggle-validation-set-medium",
35
- "hf_subset": "v0",
36
- "metric": [
37
- "quasi_exact_match_math"
38
- ],
39
- "hf_avail_splits": [
40
- "train"
41
- ],
42
- "evaluation_splits": [
43
- "train"
44
- ],
45
- "few_shots_split": null,
46
- "few_shots_select": null,
47
- "generation_size": 2048,
48
- "stop_sequence": null,
49
- "output_regex": null,
50
- "num_samples": null,
51
- "frozen": false,
52
- "suite": [
53
- "custom"
54
- ],
55
- "original_num_docs": 40,
56
- "effective_num_docs": 40,
57
- "trust_dataset": null,
58
- "must_remove_duplicate_docs": null,
59
- "version": 0
60
- }
61
- },
62
- "summary_tasks": {
63
- "custom|aimo_kaggle_medium:v0|0": {
64
- "hashes": {
65
- "hash_examples": "3401efda8b0cbcb5",
66
- "hash_full_prompts": "14cb6646c78f810c",
67
- "hash_input_tokens": "d1d016040eb15856",
68
- "hash_cont_tokens": "6655a6d5b1dc0ae9"
69
- },
70
- "truncated": 40,
71
- "non_truncated": 0,
72
- "padded": 32,
73
- "non_padded": 8,
74
- "effective_few_shots": 0.0,
75
- "num_truncated_few_shots": 0
76
- }
77
- },
78
- "summary_general": {
79
- "hashes": {
80
- "hash_examples": "4c81d27cfdb9d737",
81
- "hash_full_prompts": "0d4787c840fc98b7",
82
- "hash_input_tokens": "7e52a8d7f3e15dcc",
83
- "hash_cont_tokens": "853ddf635f0c19a1"
84
- },
85
- "truncated": 40,
86
- "non_truncated": 0,
87
- "padded": 32,
88
- "non_padded": 8,
89
- "num_truncated_few_shots": 0
90
- }
91
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.20/aimo_kaggle/results_2024-06-01T05-35-51.309986.json DELETED
@@ -1,91 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": 4,
6
- "max_samples": null,
7
- "job_id": "",
8
- "start_time": 512226.496990626,
9
- "end_time": 512631.472327661,
10
- "total_evaluation_time_secondes": "404.9753370350227",
11
- "model_name": "AI-MO/CodeLlama-13b-Python-hf-sft",
12
- "model_sha": "dd7593bfb7012234d702164f013ec23279f82a09",
13
- "model_dtype": "torch.bfloat16",
14
- "model_size": "24.56 GB",
15
- "config": null
16
- },
17
- "results": {
18
- "custom|aimo_kaggle:v0|0": {
19
- "qem": 0.1,
20
- "qem_stderr": 0.042857142857142844
21
- },
22
- "all": {
23
- "qem": 0.1,
24
- "qem_stderr": 0.042857142857142844
25
- }
26
- },
27
- "versions": {
28
- "custom|aimo_kaggle:v0|0": 0
29
- },
30
- "config_tasks": {
31
- "custom|aimo_kaggle:v0": {
32
- "name": "aimo_kaggle:v0",
33
- "prompt_function": "kaggle_prompt_fn",
34
- "hf_repo": "AI-MO/kaggle-validation-set",
35
- "hf_subset": "v0",
36
- "metric": [
37
- "quasi_exact_match_math"
38
- ],
39
- "hf_avail_splits": [
40
- "train"
41
- ],
42
- "evaluation_splits": [
43
- "train"
44
- ],
45
- "few_shots_split": null,
46
- "few_shots_select": null,
47
- "generation_size": 2048,
48
- "stop_sequence": null,
49
- "output_regex": null,
50
- "num_samples": null,
51
- "frozen": false,
52
- "suite": [
53
- "custom"
54
- ],
55
- "original_num_docs": 50,
56
- "effective_num_docs": 50,
57
- "trust_dataset": null,
58
- "must_remove_duplicate_docs": null,
59
- "version": 0
60
- }
61
- },
62
- "summary_tasks": {
63
- "custom|aimo_kaggle:v0|0": {
64
- "hashes": {
65
- "hash_examples": "2a919e6b839e921a",
66
- "hash_full_prompts": "e0424ade9e31a0fe",
67
- "hash_input_tokens": "437c7d81b73a6e7a",
68
- "hash_cont_tokens": "7e7a71073ccafef1"
69
- },
70
- "truncated": 47,
71
- "non_truncated": 3,
72
- "padded": 34,
73
- "non_padded": 16,
74
- "effective_few_shots": 0.0,
75
- "num_truncated_few_shots": 0
76
- }
77
- },
78
- "summary_general": {
79
- "hashes": {
80
- "hash_examples": "f59ec6c4e24c1226",
81
- "hash_full_prompts": "35e1e25418ccb6f7",
82
- "hash_input_tokens": "c227167cd34ee1ce",
83
- "hash_cont_tokens": "7d094d3c18407805"
84
- },
85
- "truncated": 47,
86
- "non_truncated": 3,
87
- "padded": 34,
88
- "non_padded": 16,
89
- "num_truncated_few_shots": 0
90
- }
91
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.20/aimo_kaggle_hard/results_2024-06-01T05-37-20.266435.json DELETED
@@ -1,91 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": 4,
6
- "max_samples": null,
7
- "job_id": "",
8
- "start_time": 363736.36712355,
9
- "end_time": 364222.472140237,
10
- "total_evaluation_time_secondes": "486.1050166870118",
11
- "model_name": "AI-MO/CodeLlama-13b-Python-hf-sft",
12
- "model_sha": "dd7593bfb7012234d702164f013ec23279f82a09",
13
- "model_dtype": "torch.bfloat16",
14
- "model_size": "24.56 GB",
15
- "config": null
16
- },
17
- "results": {
18
- "custom|aimo_kaggle_hard:v0|0": {
19
- "qem": 0.0,
20
- "qem_stderr": 0.0
21
- },
22
- "all": {
23
- "qem": 0.0,
24
- "qem_stderr": 0.0
25
- }
26
- },
27
- "versions": {
28
- "custom|aimo_kaggle_hard:v0|0": 0
29
- },
30
- "config_tasks": {
31
- "custom|aimo_kaggle_hard:v0": {
32
- "name": "aimo_kaggle_hard:v0",
33
- "prompt_function": "kaggle_hard_prompt_fn",
34
- "hf_repo": "AI-MO/kaggle-validation-set-hard",
35
- "hf_subset": "v0",
36
- "metric": [
37
- "quasi_exact_match_math"
38
- ],
39
- "hf_avail_splits": [
40
- "train"
41
- ],
42
- "evaluation_splits": [
43
- "train"
44
- ],
45
- "few_shots_split": null,
46
- "few_shots_select": null,
47
- "generation_size": 2048,
48
- "stop_sequence": null,
49
- "output_regex": null,
50
- "num_samples": null,
51
- "frozen": false,
52
- "suite": [
53
- "custom"
54
- ],
55
- "original_num_docs": 50,
56
- "effective_num_docs": 50,
57
- "trust_dataset": null,
58
- "must_remove_duplicate_docs": null,
59
- "version": 0
60
- }
61
- },
62
- "summary_tasks": {
63
- "custom|aimo_kaggle_hard:v0|0": {
64
- "hashes": {
65
- "hash_examples": "b40b6a493a95bf77",
66
- "hash_full_prompts": "2c0f18269c13af34",
67
- "hash_input_tokens": "b477f3b96ee9e7ba",
68
- "hash_cont_tokens": "bbc5187940f6d8b5"
69
- },
70
- "truncated": 50,
71
- "non_truncated": 0,
72
- "padded": 37,
73
- "non_padded": 13,
74
- "effective_few_shots": 0.0,
75
- "num_truncated_few_shots": 0
76
- }
77
- },
78
- "summary_general": {
79
- "hashes": {
80
- "hash_examples": "79dbebcff6acad9e",
81
- "hash_full_prompts": "739e6bdec3671fc6",
82
- "hash_input_tokens": "16e2e8b5ca238d88",
83
- "hash_cont_tokens": "4c4cb8f8a03ab820"
84
- },
85
- "truncated": 50,
86
- "non_truncated": 0,
87
- "padded": 37,
88
- "non_padded": 13,
89
- "num_truncated_few_shots": 0
90
- }
91
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.20/aimo_kaggle_medium/results_2024-06-01T05-36-05.734157.json DELETED
@@ -1,91 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": 4,
6
- "max_samples": null,
7
- "job_id": "",
8
- "start_time": 1358146.927236688,
9
- "end_time": 1358579.924716579,
10
- "total_evaluation_time_secondes": "432.99747989093885",
11
- "model_name": "AI-MO/CodeLlama-13b-Python-hf-sft",
12
- "model_sha": "dd7593bfb7012234d702164f013ec23279f82a09",
13
- "model_dtype": "torch.bfloat16",
14
- "model_size": "24.56 GB",
15
- "config": null
16
- },
17
- "results": {
18
- "custom|aimo_kaggle_medium:v0|0": {
19
- "qem": 0.1,
20
- "qem_stderr": 0.04803844614152612
21
- },
22
- "all": {
23
- "qem": 0.1,
24
- "qem_stderr": 0.04803844614152612
25
- }
26
- },
27
- "versions": {
28
- "custom|aimo_kaggle_medium:v0|0": 0
29
- },
30
- "config_tasks": {
31
- "custom|aimo_kaggle_medium:v0": {
32
- "name": "aimo_kaggle_medium:v0",
33
- "prompt_function": "kaggle_medium_prompt_fn",
34
- "hf_repo": "AI-MO/kaggle-validation-set-medium",
35
- "hf_subset": "v0",
36
- "metric": [
37
- "quasi_exact_match_math"
38
- ],
39
- "hf_avail_splits": [
40
- "train"
41
- ],
42
- "evaluation_splits": [
43
- "train"
44
- ],
45
- "few_shots_split": null,
46
- "few_shots_select": null,
47
- "generation_size": 2048,
48
- "stop_sequence": null,
49
- "output_regex": null,
50
- "num_samples": null,
51
- "frozen": false,
52
- "suite": [
53
- "custom"
54
- ],
55
- "original_num_docs": 40,
56
- "effective_num_docs": 40,
57
- "trust_dataset": null,
58
- "must_remove_duplicate_docs": null,
59
- "version": 0
60
- }
61
- },
62
- "summary_tasks": {
63
- "custom|aimo_kaggle_medium:v0|0": {
64
- "hashes": {
65
- "hash_examples": "3401efda8b0cbcb5",
66
- "hash_full_prompts": "14cb6646c78f810c",
67
- "hash_input_tokens": "d1d016040eb15856",
68
- "hash_cont_tokens": "3b39c49ddc8a8f83"
69
- },
70
- "truncated": 40,
71
- "non_truncated": 0,
72
- "padded": 32,
73
- "non_padded": 8,
74
- "effective_few_shots": 0.0,
75
- "num_truncated_few_shots": 0
76
- }
77
- },
78
- "summary_general": {
79
- "hashes": {
80
- "hash_examples": "4c81d27cfdb9d737",
81
- "hash_full_prompts": "0d4787c840fc98b7",
82
- "hash_input_tokens": "7e52a8d7f3e15dcc",
83
- "hash_cont_tokens": "837e53a7921a9d8a"
84
- },
85
- "truncated": 40,
86
- "non_truncated": 0,
87
- "padded": 32,
88
- "non_padded": 8,
89
- "num_truncated_few_shots": 0
90
- }
91
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.21/aimo_kaggle/results_2024-06-01T05-45-43.684415.json DELETED
@@ -1,91 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": 4,
6
- "max_samples": null,
7
- "job_id": "",
8
- "start_time": 4458377.923526119,
9
- "end_time": 4458785.620523886,
10
- "total_evaluation_time_secondes": "407.696997766383",
11
- "model_name": "AI-MO/CodeLlama-13b-Python-hf-sft",
12
- "model_sha": "4e6a802bd21a15ad20ebf5bfb708cf43238d3ae9",
13
- "model_dtype": "torch.bfloat16",
14
- "model_size": "24.56 GB",
15
- "config": null
16
- },
17
- "results": {
18
- "custom|aimo_kaggle:v0|0": {
19
- "qem": 0.24,
20
- "qem_stderr": 0.0610118757258932
21
- },
22
- "all": {
23
- "qem": 0.24,
24
- "qem_stderr": 0.0610118757258932
25
- }
26
- },
27
- "versions": {
28
- "custom|aimo_kaggle:v0|0": 0
29
- },
30
- "config_tasks": {
31
- "custom|aimo_kaggle:v0": {
32
- "name": "aimo_kaggle:v0",
33
- "prompt_function": "kaggle_prompt_fn",
34
- "hf_repo": "AI-MO/kaggle-validation-set",
35
- "hf_subset": "v0",
36
- "metric": [
37
- "quasi_exact_match_math"
38
- ],
39
- "hf_avail_splits": [
40
- "train"
41
- ],
42
- "evaluation_splits": [
43
- "train"
44
- ],
45
- "few_shots_split": null,
46
- "few_shots_select": null,
47
- "generation_size": 2048,
48
- "stop_sequence": null,
49
- "output_regex": null,
50
- "num_samples": null,
51
- "frozen": false,
52
- "suite": [
53
- "custom"
54
- ],
55
- "original_num_docs": 50,
56
- "effective_num_docs": 50,
57
- "trust_dataset": null,
58
- "must_remove_duplicate_docs": null,
59
- "version": 0
60
- }
61
- },
62
- "summary_tasks": {
63
- "custom|aimo_kaggle:v0|0": {
64
- "hashes": {
65
- "hash_examples": "2a919e6b839e921a",
66
- "hash_full_prompts": "e0424ade9e31a0fe",
67
- "hash_input_tokens": "437c7d81b73a6e7a",
68
- "hash_cont_tokens": "98bba437c30d84b1"
69
- },
70
- "truncated": 47,
71
- "non_truncated": 3,
72
- "padded": 34,
73
- "non_padded": 16,
74
- "effective_few_shots": 0.0,
75
- "num_truncated_few_shots": 0
76
- }
77
- },
78
- "summary_general": {
79
- "hashes": {
80
- "hash_examples": "f59ec6c4e24c1226",
81
- "hash_full_prompts": "35e1e25418ccb6f7",
82
- "hash_input_tokens": "c227167cd34ee1ce",
83
- "hash_cont_tokens": "d476b6dd1d0400d9"
84
- },
85
- "truncated": 47,
86
- "non_truncated": 3,
87
- "padded": 34,
88
- "non_padded": 16,
89
- "num_truncated_few_shots": 0
90
- }
91
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.21/aimo_kaggle_hard/results_2024-06-01T05-46-32.901937.json DELETED
@@ -1,91 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": 4,
6
- "max_samples": null,
7
- "job_id": "",
8
- "start_time": 512933.009355462,
9
- "end_time": 513273.064260089,
10
- "total_evaluation_time_secondes": "340.05490462703165",
11
- "model_name": "AI-MO/CodeLlama-13b-Python-hf-sft",
12
- "model_sha": "4e6a802bd21a15ad20ebf5bfb708cf43238d3ae9",
13
- "model_dtype": "torch.bfloat16",
14
- "model_size": "24.56 GB",
15
- "config": null
16
- },
17
- "results": {
18
- "custom|aimo_kaggle_hard:v0|0": {
19
- "qem": 0.0,
20
- "qem_stderr": 0.0
21
- },
22
- "all": {
23
- "qem": 0.0,
24
- "qem_stderr": 0.0
25
- }
26
- },
27
- "versions": {
28
- "custom|aimo_kaggle_hard:v0|0": 0
29
- },
30
- "config_tasks": {
31
- "custom|aimo_kaggle_hard:v0": {
32
- "name": "aimo_kaggle_hard:v0",
33
- "prompt_function": "kaggle_hard_prompt_fn",
34
- "hf_repo": "AI-MO/kaggle-validation-set-hard",
35
- "hf_subset": "v0",
36
- "metric": [
37
- "quasi_exact_match_math"
38
- ],
39
- "hf_avail_splits": [
40
- "train"
41
- ],
42
- "evaluation_splits": [
43
- "train"
44
- ],
45
- "few_shots_split": null,
46
- "few_shots_select": null,
47
- "generation_size": 2048,
48
- "stop_sequence": null,
49
- "output_regex": null,
50
- "num_samples": null,
51
- "frozen": false,
52
- "suite": [
53
- "custom"
54
- ],
55
- "original_num_docs": 50,
56
- "effective_num_docs": 50,
57
- "trust_dataset": null,
58
- "must_remove_duplicate_docs": null,
59
- "version": 0
60
- }
61
- },
62
- "summary_tasks": {
63
- "custom|aimo_kaggle_hard:v0|0": {
64
- "hashes": {
65
- "hash_examples": "b40b6a493a95bf77",
66
- "hash_full_prompts": "2c0f18269c13af34",
67
- "hash_input_tokens": "b477f3b96ee9e7ba",
68
- "hash_cont_tokens": "9d3d9bdbf7d51ea0"
69
- },
70
- "truncated": 50,
71
- "non_truncated": 0,
72
- "padded": 37,
73
- "non_padded": 13,
74
- "effective_few_shots": 0.0,
75
- "num_truncated_few_shots": 0
76
- }
77
- },
78
- "summary_general": {
79
- "hashes": {
80
- "hash_examples": "79dbebcff6acad9e",
81
- "hash_full_prompts": "739e6bdec3671fc6",
82
- "hash_input_tokens": "16e2e8b5ca238d88",
83
- "hash_cont_tokens": "8bc399ff0429c9c8"
84
- },
85
- "truncated": 50,
86
- "non_truncated": 0,
87
- "padded": 37,
88
- "non_padded": 13,
89
- "num_truncated_few_shots": 0
90
- }
91
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.21/aimo_kaggle_medium/results_2024-06-01T05-46-09.737288.json DELETED
@@ -1,91 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": 4,
6
- "max_samples": null,
7
- "job_id": "",
8
- "start_time": 383326.713567976,
9
- "end_time": 383645.595892471,
10
- "total_evaluation_time_secondes": "318.88232449500356",
11
- "model_name": "AI-MO/CodeLlama-13b-Python-hf-sft",
12
- "model_sha": "4e6a802bd21a15ad20ebf5bfb708cf43238d3ae9",
13
- "model_dtype": "torch.bfloat16",
14
- "model_size": "24.56 GB",
15
- "config": null
16
- },
17
- "results": {
18
- "custom|aimo_kaggle_medium:v0|0": {
19
- "qem": 0.175,
20
- "qem_stderr": 0.06084343084444758
21
- },
22
- "all": {
23
- "qem": 0.175,
24
- "qem_stderr": 0.06084343084444758
25
- }
26
- },
27
- "versions": {
28
- "custom|aimo_kaggle_medium:v0|0": 0
29
- },
30
- "config_tasks": {
31
- "custom|aimo_kaggle_medium:v0": {
32
- "name": "aimo_kaggle_medium:v0",
33
- "prompt_function": "kaggle_medium_prompt_fn",
34
- "hf_repo": "AI-MO/kaggle-validation-set-medium",
35
- "hf_subset": "v0",
36
- "metric": [
37
- "quasi_exact_match_math"
38
- ],
39
- "hf_avail_splits": [
40
- "train"
41
- ],
42
- "evaluation_splits": [
43
- "train"
44
- ],
45
- "few_shots_split": null,
46
- "few_shots_select": null,
47
- "generation_size": 2048,
48
- "stop_sequence": null,
49
- "output_regex": null,
50
- "num_samples": null,
51
- "frozen": false,
52
- "suite": [
53
- "custom"
54
- ],
55
- "original_num_docs": 40,
56
- "effective_num_docs": 40,
57
- "trust_dataset": null,
58
- "must_remove_duplicate_docs": null,
59
- "version": 0
60
- }
61
- },
62
- "summary_tasks": {
63
- "custom|aimo_kaggle_medium:v0|0": {
64
- "hashes": {
65
- "hash_examples": "3401efda8b0cbcb5",
66
- "hash_full_prompts": "14cb6646c78f810c",
67
- "hash_input_tokens": "d1d016040eb15856",
68
- "hash_cont_tokens": "3770ddda90e0f72b"
69
- },
70
- "truncated": 40,
71
- "non_truncated": 0,
72
- "padded": 32,
73
- "non_padded": 8,
74
- "effective_few_shots": 0.0,
75
- "num_truncated_few_shots": 0
76
- }
77
- },
78
- "summary_general": {
79
- "hashes": {
80
- "hash_examples": "4c81d27cfdb9d737",
81
- "hash_full_prompts": "0d4787c840fc98b7",
82
- "hash_input_tokens": "7e52a8d7f3e15dcc",
83
- "hash_cont_tokens": "f9ab46bead8a1de0"
84
- },
85
- "truncated": 40,
86
- "non_truncated": 0,
87
- "padded": 32,
88
- "non_padded": 8,
89
- "num_truncated_few_shots": 0
90
- }
91
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.22/aimo_kaggle/results_2024-06-02T18-58-15.446870.json DELETED
@@ -1,91 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": 4,
6
- "max_samples": null,
7
- "job_id": "",
8
- "start_time": 3825.7584337,
9
- "end_time": 4264.485878867,
10
- "total_evaluation_time_secondes": "438.72744516700004",
11
- "model_name": "AI-MO/CodeLlama-13b-Python-hf-sft",
12
- "model_sha": "811ca415e400b134946a0dbd631c76052d9c3209",
13
- "model_dtype": "torch.bfloat16",
14
- "model_size": "24.56 GB",
15
- "config": null
16
- },
17
- "results": {
18
- "custom|aimo_kaggle:v0|0": {
19
- "qem": 0.22,
20
- "qem_stderr": 0.05917804336345138
21
- },
22
- "all": {
23
- "qem": 0.22,
24
- "qem_stderr": 0.05917804336345138
25
- }
26
- },
27
- "versions": {
28
- "custom|aimo_kaggle:v0|0": 0
29
- },
30
- "config_tasks": {
31
- "custom|aimo_kaggle:v0": {
32
- "name": "aimo_kaggle:v0",
33
- "prompt_function": "kaggle_prompt_fn",
34
- "hf_repo": "AI-MO/kaggle-validation-set",
35
- "hf_subset": "v0",
36
- "metric": [
37
- "quasi_exact_match_math"
38
- ],
39
- "hf_avail_splits": [
40
- "train"
41
- ],
42
- "evaluation_splits": [
43
- "train"
44
- ],
45
- "few_shots_split": null,
46
- "few_shots_select": null,
47
- "generation_size": 2048,
48
- "stop_sequence": null,
49
- "output_regex": null,
50
- "num_samples": null,
51
- "frozen": false,
52
- "suite": [
53
- "custom"
54
- ],
55
- "original_num_docs": 50,
56
- "effective_num_docs": 50,
57
- "trust_dataset": null,
58
- "must_remove_duplicate_docs": null,
59
- "version": 0
60
- }
61
- },
62
- "summary_tasks": {
63
- "custom|aimo_kaggle:v0|0": {
64
- "hashes": {
65
- "hash_examples": "2a919e6b839e921a",
66
- "hash_full_prompts": "e0424ade9e31a0fe",
67
- "hash_input_tokens": "437c7d81b73a6e7a",
68
- "hash_cont_tokens": "df51d50f82a946ab"
69
- },
70
- "truncated": 47,
71
- "non_truncated": 3,
72
- "padded": 34,
73
- "non_padded": 16,
74
- "effective_few_shots": 0.0,
75
- "num_truncated_few_shots": 0
76
- }
77
- },
78
- "summary_general": {
79
- "hashes": {
80
- "hash_examples": "f59ec6c4e24c1226",
81
- "hash_full_prompts": "35e1e25418ccb6f7",
82
- "hash_input_tokens": "c227167cd34ee1ce",
83
- "hash_cont_tokens": "8d755a623d43cb16"
84
- },
85
- "truncated": 47,
86
- "non_truncated": 3,
87
- "padded": 34,
88
- "non_padded": 16,
89
- "num_truncated_few_shots": 0
90
- }
91
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.22/aimo_kaggle_hard/results_2024-06-02T18-58-01.233506.json DELETED
@@ -1,91 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": 4,
6
- "max_samples": null,
7
- "job_id": "",
8
- "start_time": 435064.413235468,
9
- "end_time": 435372.093186668,
10
- "total_evaluation_time_secondes": "307.6799512000289",
11
- "model_name": "AI-MO/CodeLlama-13b-Python-hf-sft",
12
- "model_sha": "811ca415e400b134946a0dbd631c76052d9c3209",
13
- "model_dtype": "torch.bfloat16",
14
- "model_size": "24.56 GB",
15
- "config": null
16
- },
17
- "results": {
18
- "custom|aimo_kaggle_hard:v0|0": {
19
- "qem": 0.02,
20
- "qem_stderr": 0.01999999999999998
21
- },
22
- "all": {
23
- "qem": 0.02,
24
- "qem_stderr": 0.01999999999999998
25
- }
26
- },
27
- "versions": {
28
- "custom|aimo_kaggle_hard:v0|0": 0
29
- },
30
- "config_tasks": {
31
- "custom|aimo_kaggle_hard:v0": {
32
- "name": "aimo_kaggle_hard:v0",
33
- "prompt_function": "kaggle_hard_prompt_fn",
34
- "hf_repo": "AI-MO/kaggle-validation-set-hard",
35
- "hf_subset": "v0",
36
- "metric": [
37
- "quasi_exact_match_math"
38
- ],
39
- "hf_avail_splits": [
40
- "train"
41
- ],
42
- "evaluation_splits": [
43
- "train"
44
- ],
45
- "few_shots_split": null,
46
- "few_shots_select": null,
47
- "generation_size": 2048,
48
- "stop_sequence": null,
49
- "output_regex": null,
50
- "num_samples": null,
51
- "frozen": false,
52
- "suite": [
53
- "custom"
54
- ],
55
- "original_num_docs": 50,
56
- "effective_num_docs": 50,
57
- "trust_dataset": null,
58
- "must_remove_duplicate_docs": null,
59
- "version": 0
60
- }
61
- },
62
- "summary_tasks": {
63
- "custom|aimo_kaggle_hard:v0|0": {
64
- "hashes": {
65
- "hash_examples": "b40b6a493a95bf77",
66
- "hash_full_prompts": "2c0f18269c13af34",
67
- "hash_input_tokens": "b477f3b96ee9e7ba",
68
- "hash_cont_tokens": "d2799d15d04aa177"
69
- },
70
- "truncated": 50,
71
- "non_truncated": 0,
72
- "padded": 37,
73
- "non_padded": 13,
74
- "effective_few_shots": 0.0,
75
- "num_truncated_few_shots": 0
76
- }
77
- },
78
- "summary_general": {
79
- "hashes": {
80
- "hash_examples": "79dbebcff6acad9e",
81
- "hash_full_prompts": "739e6bdec3671fc6",
82
- "hash_input_tokens": "16e2e8b5ca238d88",
83
- "hash_cont_tokens": "7173dd2dbcb47987"
84
- },
85
- "truncated": 50,
86
- "non_truncated": 0,
87
- "padded": 37,
88
- "non_padded": 13,
89
- "num_truncated_few_shots": 0
90
- }
91
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.22/aimo_kaggle_medium/results_2024-06-02T18-56-51.533786.json DELETED
@@ -1,91 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": 4,
6
- "max_samples": null,
7
- "job_id": "",
8
- "start_time": 276787.286176622,
9
- "end_time": 277119.494513706,
10
- "total_evaluation_time_secondes": "332.208337084041",
11
- "model_name": "AI-MO/CodeLlama-13b-Python-hf-sft",
12
- "model_sha": "811ca415e400b134946a0dbd631c76052d9c3209",
13
- "model_dtype": "torch.bfloat16",
14
- "model_size": "24.56 GB",
15
- "config": null
16
- },
17
- "results": {
18
- "custom|aimo_kaggle_medium:v0|0": {
19
- "qem": 0.1,
20
- "qem_stderr": 0.04803844614152612
21
- },
22
- "all": {
23
- "qem": 0.1,
24
- "qem_stderr": 0.04803844614152612
25
- }
26
- },
27
- "versions": {
28
- "custom|aimo_kaggle_medium:v0|0": 0
29
- },
30
- "config_tasks": {
31
- "custom|aimo_kaggle_medium:v0": {
32
- "name": "aimo_kaggle_medium:v0",
33
- "prompt_function": "kaggle_medium_prompt_fn",
34
- "hf_repo": "AI-MO/kaggle-validation-set-medium",
35
- "hf_subset": "v0",
36
- "metric": [
37
- "quasi_exact_match_math"
38
- ],
39
- "hf_avail_splits": [
40
- "train"
41
- ],
42
- "evaluation_splits": [
43
- "train"
44
- ],
45
- "few_shots_split": null,
46
- "few_shots_select": null,
47
- "generation_size": 2048,
48
- "stop_sequence": null,
49
- "output_regex": null,
50
- "num_samples": null,
51
- "frozen": false,
52
- "suite": [
53
- "custom"
54
- ],
55
- "original_num_docs": 40,
56
- "effective_num_docs": 40,
57
- "trust_dataset": null,
58
- "must_remove_duplicate_docs": null,
59
- "version": 0
60
- }
61
- },
62
- "summary_tasks": {
63
- "custom|aimo_kaggle_medium:v0|0": {
64
- "hashes": {
65
- "hash_examples": "3401efda8b0cbcb5",
66
- "hash_full_prompts": "14cb6646c78f810c",
67
- "hash_input_tokens": "d1d016040eb15856",
68
- "hash_cont_tokens": "12549c8b6c88e722"
69
- },
70
- "truncated": 40,
71
- "non_truncated": 0,
72
- "padded": 32,
73
- "non_padded": 8,
74
- "effective_few_shots": 0.0,
75
- "num_truncated_few_shots": 0
76
- }
77
- },
78
- "summary_general": {
79
- "hashes": {
80
- "hash_examples": "4c81d27cfdb9d737",
81
- "hash_full_prompts": "0d4787c840fc98b7",
82
- "hash_input_tokens": "7e52a8d7f3e15dcc",
83
- "hash_cont_tokens": "4afda0e2c7f0affc"
84
- },
85
- "truncated": 40,
86
- "non_truncated": 0,
87
- "padded": 32,
88
- "non_padded": 8,
89
- "num_truncated_few_shots": 0
90
- }
91
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.23/aimo_kaggle/results_2024-06-01T21-45-15.801293.json DELETED
@@ -1,91 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": 4,
6
- "max_samples": null,
7
- "job_id": "",
8
- "start_time": 2687950.292268706,
9
- "end_time": 2688362.874668186,
10
- "total_evaluation_time_secondes": "412.58239948004484",
11
- "model_name": "AI-MO/CodeLlama-13b-Python-hf-sft",
12
- "model_sha": "da6be403dc96aa7d82ac72f7807a2317d25c956d",
13
- "model_dtype": "torch.bfloat16",
14
- "model_size": "24.56 GB",
15
- "config": null
16
- },
17
- "results": {
18
- "custom|aimo_kaggle:v0|0": {
19
- "qem": 0.16,
20
- "qem_stderr": 0.052372293656638154
21
- },
22
- "all": {
23
- "qem": 0.16,
24
- "qem_stderr": 0.052372293656638154
25
- }
26
- },
27
- "versions": {
28
- "custom|aimo_kaggle:v0|0": 0
29
- },
30
- "config_tasks": {
31
- "custom|aimo_kaggle:v0": {
32
- "name": "aimo_kaggle:v0",
33
- "prompt_function": "kaggle_prompt_fn",
34
- "hf_repo": "AI-MO/kaggle-validation-set",
35
- "hf_subset": "v0",
36
- "metric": [
37
- "quasi_exact_match_math"
38
- ],
39
- "hf_avail_splits": [
40
- "train"
41
- ],
42
- "evaluation_splits": [
43
- "train"
44
- ],
45
- "few_shots_split": null,
46
- "few_shots_select": null,
47
- "generation_size": 2048,
48
- "stop_sequence": null,
49
- "output_regex": null,
50
- "num_samples": null,
51
- "frozen": false,
52
- "suite": [
53
- "custom"
54
- ],
55
- "original_num_docs": 50,
56
- "effective_num_docs": 50,
57
- "trust_dataset": null,
58
- "must_remove_duplicate_docs": null,
59
- "version": 0
60
- }
61
- },
62
- "summary_tasks": {
63
- "custom|aimo_kaggle:v0|0": {
64
- "hashes": {
65
- "hash_examples": "2a919e6b839e921a",
66
- "hash_full_prompts": "e0424ade9e31a0fe",
67
- "hash_input_tokens": "437c7d81b73a6e7a",
68
- "hash_cont_tokens": "8ea917b45f93244d"
69
- },
70
- "truncated": 47,
71
- "non_truncated": 3,
72
- "padded": 34,
73
- "non_padded": 16,
74
- "effective_few_shots": 0.0,
75
- "num_truncated_few_shots": 0
76
- }
77
- },
78
- "summary_general": {
79
- "hashes": {
80
- "hash_examples": "f59ec6c4e24c1226",
81
- "hash_full_prompts": "35e1e25418ccb6f7",
82
- "hash_input_tokens": "c227167cd34ee1ce",
83
- "hash_cont_tokens": "0922e6daa54d396e"
84
- },
85
- "truncated": 47,
86
- "non_truncated": 3,
87
- "padded": 34,
88
- "non_padded": 16,
89
- "num_truncated_few_shots": 0
90
- }
91
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.23/aimo_kaggle_hard/results_2024-06-01T21-45-20.802107.json DELETED
@@ -1,91 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": 4,
6
- "max_samples": null,
7
- "job_id": "",
8
- "start_time": 440779.073690891,
9
- "end_time": 441196.660713896,
10
- "total_evaluation_time_secondes": "417.5870230050059",
11
- "model_name": "AI-MO/CodeLlama-13b-Python-hf-sft",
12
- "model_sha": "da6be403dc96aa7d82ac72f7807a2317d25c956d",
13
- "model_dtype": "torch.bfloat16",
14
- "model_size": "24.56 GB",
15
- "config": null
16
- },
17
- "results": {
18
- "custom|aimo_kaggle_hard:v0|0": {
19
- "qem": 0.02,
20
- "qem_stderr": 0.019999999999999987
21
- },
22
- "all": {
23
- "qem": 0.02,
24
- "qem_stderr": 0.019999999999999987
25
- }
26
- },
27
- "versions": {
28
- "custom|aimo_kaggle_hard:v0|0": 0
29
- },
30
- "config_tasks": {
31
- "custom|aimo_kaggle_hard:v0": {
32
- "name": "aimo_kaggle_hard:v0",
33
- "prompt_function": "kaggle_hard_prompt_fn",
34
- "hf_repo": "AI-MO/kaggle-validation-set-hard",
35
- "hf_subset": "v0",
36
- "metric": [
37
- "quasi_exact_match_math"
38
- ],
39
- "hf_avail_splits": [
40
- "train"
41
- ],
42
- "evaluation_splits": [
43
- "train"
44
- ],
45
- "few_shots_split": null,
46
- "few_shots_select": null,
47
- "generation_size": 2048,
48
- "stop_sequence": null,
49
- "output_regex": null,
50
- "num_samples": null,
51
- "frozen": false,
52
- "suite": [
53
- "custom"
54
- ],
55
- "original_num_docs": 50,
56
- "effective_num_docs": 50,
57
- "trust_dataset": null,
58
- "must_remove_duplicate_docs": null,
59
- "version": 0
60
- }
61
- },
62
- "summary_tasks": {
63
- "custom|aimo_kaggle_hard:v0|0": {
64
- "hashes": {
65
- "hash_examples": "b40b6a493a95bf77",
66
- "hash_full_prompts": "2c0f18269c13af34",
67
- "hash_input_tokens": "b477f3b96ee9e7ba",
68
- "hash_cont_tokens": "33444874bc125b34"
69
- },
70
- "truncated": 50,
71
- "non_truncated": 0,
72
- "padded": 37,
73
- "non_padded": 13,
74
- "effective_few_shots": 0.0,
75
- "num_truncated_few_shots": 0
76
- }
77
- },
78
- "summary_general": {
79
- "hashes": {
80
- "hash_examples": "79dbebcff6acad9e",
81
- "hash_full_prompts": "739e6bdec3671fc6",
82
- "hash_input_tokens": "16e2e8b5ca238d88",
83
- "hash_cont_tokens": "b912019317de1727"
84
- },
85
- "truncated": 50,
86
- "non_truncated": 0,
87
- "padded": 37,
88
- "non_padded": 13,
89
- "num_truncated_few_shots": 0
90
- }
91
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.23/aimo_kaggle_medium/results_2024-06-01T21-44-37.236377.json DELETED
@@ -1,91 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": 4,
6
- "max_samples": null,
7
- "job_id": "",
8
- "start_time": 894361.517640607,
9
- "end_time": 894757.469297242,
10
- "total_evaluation_time_secondes": "395.95165663503576",
11
- "model_name": "AI-MO/CodeLlama-13b-Python-hf-sft",
12
- "model_sha": "da6be403dc96aa7d82ac72f7807a2317d25c956d",
13
- "model_dtype": "torch.bfloat16",
14
- "model_size": "24.56 GB",
15
- "config": null
16
- },
17
- "results": {
18
- "custom|aimo_kaggle_medium:v0|0": {
19
- "qem": 0.225,
20
- "qem_stderr": 0.06686668711812967
21
- },
22
- "all": {
23
- "qem": 0.225,
24
- "qem_stderr": 0.06686668711812967
25
- }
26
- },
27
- "versions": {
28
- "custom|aimo_kaggle_medium:v0|0": 0
29
- },
30
- "config_tasks": {
31
- "custom|aimo_kaggle_medium:v0": {
32
- "name": "aimo_kaggle_medium:v0",
33
- "prompt_function": "kaggle_medium_prompt_fn",
34
- "hf_repo": "AI-MO/kaggle-validation-set-medium",
35
- "hf_subset": "v0",
36
- "metric": [
37
- "quasi_exact_match_math"
38
- ],
39
- "hf_avail_splits": [
40
- "train"
41
- ],
42
- "evaluation_splits": [
43
- "train"
44
- ],
45
- "few_shots_split": null,
46
- "few_shots_select": null,
47
- "generation_size": 2048,
48
- "stop_sequence": null,
49
- "output_regex": null,
50
- "num_samples": null,
51
- "frozen": false,
52
- "suite": [
53
- "custom"
54
- ],
55
- "original_num_docs": 40,
56
- "effective_num_docs": 40,
57
- "trust_dataset": null,
58
- "must_remove_duplicate_docs": null,
59
- "version": 0
60
- }
61
- },
62
- "summary_tasks": {
63
- "custom|aimo_kaggle_medium:v0|0": {
64
- "hashes": {
65
- "hash_examples": "3401efda8b0cbcb5",
66
- "hash_full_prompts": "14cb6646c78f810c",
67
- "hash_input_tokens": "d1d016040eb15856",
68
- "hash_cont_tokens": "3648cd106873d142"
69
- },
70
- "truncated": 40,
71
- "non_truncated": 0,
72
- "padded": 32,
73
- "non_padded": 8,
74
- "effective_few_shots": 0.0,
75
- "num_truncated_few_shots": 0
76
- }
77
- },
78
- "summary_general": {
79
- "hashes": {
80
- "hash_examples": "4c81d27cfdb9d737",
81
- "hash_full_prompts": "0d4787c840fc98b7",
82
- "hash_input_tokens": "7e52a8d7f3e15dcc",
83
- "hash_cont_tokens": "8b6f7543774b3e8b"
84
- },
85
- "truncated": 40,
86
- "non_truncated": 0,
87
- "padded": 32,
88
- "non_padded": 8,
89
- "num_truncated_few_shots": 0
90
- }
91
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.24/aimo_kaggle/results_2024-06-01T21-19-06.909857.json DELETED
@@ -1,91 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": 4,
6
- "max_samples": null,
7
- "job_id": "",
8
- "start_time": 1600323.90425634,
9
- "end_time": 1600623.947537335,
10
- "total_evaluation_time_secondes": "300.04328099498525",
11
- "model_name": "AI-MO/CodeLlama-13b-Python-hf-sft",
12
- "model_sha": "894b60bbafd1de6b91cd582ad8015544ed8bf4ad",
13
- "model_dtype": "torch.bfloat16",
14
- "model_size": "24.56 GB",
15
- "config": null
16
- },
17
- "results": {
18
- "custom|aimo_kaggle:v0|0": {
19
- "qem": 0.16,
20
- "qem_stderr": 0.052372293656638154
21
- },
22
- "all": {
23
- "qem": 0.16,
24
- "qem_stderr": 0.052372293656638154
25
- }
26
- },
27
- "versions": {
28
- "custom|aimo_kaggle:v0|0": 0
29
- },
30
- "config_tasks": {
31
- "custom|aimo_kaggle:v0": {
32
- "name": "aimo_kaggle:v0",
33
- "prompt_function": "kaggle_prompt_fn",
34
- "hf_repo": "AI-MO/kaggle-validation-set",
35
- "hf_subset": "v0",
36
- "metric": [
37
- "quasi_exact_match_math"
38
- ],
39
- "hf_avail_splits": [
40
- "train"
41
- ],
42
- "evaluation_splits": [
43
- "train"
44
- ],
45
- "few_shots_split": null,
46
- "few_shots_select": null,
47
- "generation_size": 2048,
48
- "stop_sequence": null,
49
- "output_regex": null,
50
- "num_samples": null,
51
- "frozen": false,
52
- "suite": [
53
- "custom"
54
- ],
55
- "original_num_docs": 50,
56
- "effective_num_docs": 50,
57
- "trust_dataset": null,
58
- "must_remove_duplicate_docs": null,
59
- "version": 0
60
- }
61
- },
62
- "summary_tasks": {
63
- "custom|aimo_kaggle:v0|0": {
64
- "hashes": {
65
- "hash_examples": "2a919e6b839e921a",
66
- "hash_full_prompts": "e0424ade9e31a0fe",
67
- "hash_input_tokens": "437c7d81b73a6e7a",
68
- "hash_cont_tokens": "fe6d63cab6bd11fa"
69
- },
70
- "truncated": 47,
71
- "non_truncated": 3,
72
- "padded": 34,
73
- "non_padded": 16,
74
- "effective_few_shots": 0.0,
75
- "num_truncated_few_shots": 0
76
- }
77
- },
78
- "summary_general": {
79
- "hashes": {
80
- "hash_examples": "f59ec6c4e24c1226",
81
- "hash_full_prompts": "35e1e25418ccb6f7",
82
- "hash_input_tokens": "c227167cd34ee1ce",
83
- "hash_cont_tokens": "1e410755dc9fd305"
84
- },
85
- "truncated": 47,
86
- "non_truncated": 3,
87
- "padded": 34,
88
- "non_padded": 16,
89
- "num_truncated_few_shots": 0
90
- }
91
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.24/aimo_kaggle_hard/results_2024-06-01T21-23-15.431062.json DELETED
@@ -1,91 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": 4,
6
- "max_samples": null,
7
- "job_id": "",
8
- "start_time": 1415019.132282249,
9
- "end_time": 1415409.621590862,
10
- "total_evaluation_time_secondes": "390.48930861311965",
11
- "model_name": "AI-MO/CodeLlama-13b-Python-hf-sft",
12
- "model_sha": "894b60bbafd1de6b91cd582ad8015544ed8bf4ad",
13
- "model_dtype": "torch.bfloat16",
14
- "model_size": "24.56 GB",
15
- "config": null
16
- },
17
- "results": {
18
- "custom|aimo_kaggle_hard:v0|0": {
19
- "qem": 0.02,
20
- "qem_stderr": 0.01999999999999998
21
- },
22
- "all": {
23
- "qem": 0.02,
24
- "qem_stderr": 0.01999999999999998
25
- }
26
- },
27
- "versions": {
28
- "custom|aimo_kaggle_hard:v0|0": 0
29
- },
30
- "config_tasks": {
31
- "custom|aimo_kaggle_hard:v0": {
32
- "name": "aimo_kaggle_hard:v0",
33
- "prompt_function": "kaggle_hard_prompt_fn",
34
- "hf_repo": "AI-MO/kaggle-validation-set-hard",
35
- "hf_subset": "v0",
36
- "metric": [
37
- "quasi_exact_match_math"
38
- ],
39
- "hf_avail_splits": [
40
- "train"
41
- ],
42
- "evaluation_splits": [
43
- "train"
44
- ],
45
- "few_shots_split": null,
46
- "few_shots_select": null,
47
- "generation_size": 2048,
48
- "stop_sequence": null,
49
- "output_regex": null,
50
- "num_samples": null,
51
- "frozen": false,
52
- "suite": [
53
- "custom"
54
- ],
55
- "original_num_docs": 50,
56
- "effective_num_docs": 50,
57
- "trust_dataset": null,
58
- "must_remove_duplicate_docs": null,
59
- "version": 0
60
- }
61
- },
62
- "summary_tasks": {
63
- "custom|aimo_kaggle_hard:v0|0": {
64
- "hashes": {
65
- "hash_examples": "b40b6a493a95bf77",
66
- "hash_full_prompts": "2c0f18269c13af34",
67
- "hash_input_tokens": "b477f3b96ee9e7ba",
68
- "hash_cont_tokens": "a45f6fe2abfcdc12"
69
- },
70
- "truncated": 50,
71
- "non_truncated": 0,
72
- "padded": 37,
73
- "non_padded": 13,
74
- "effective_few_shots": 0.0,
75
- "num_truncated_few_shots": 0
76
- }
77
- },
78
- "summary_general": {
79
- "hashes": {
80
- "hash_examples": "79dbebcff6acad9e",
81
- "hash_full_prompts": "739e6bdec3671fc6",
82
- "hash_input_tokens": "16e2e8b5ca238d88",
83
- "hash_cont_tokens": "b47a38415b441e77"
84
- },
85
- "truncated": 50,
86
- "non_truncated": 0,
87
- "padded": 37,
88
- "non_padded": 13,
89
- "num_truncated_few_shots": 0
90
- }
91
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.24/aimo_kaggle_medium/results_2024-06-01T21-21-15.404900.json DELETED
@@ -1,91 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": 4,
6
- "max_samples": null,
7
- "job_id": "",
8
- "start_time": 892991.386037297,
9
- "end_time": 893355.6378942,
10
- "total_evaluation_time_secondes": "364.2518569030799",
11
- "model_name": "AI-MO/CodeLlama-13b-Python-hf-sft",
12
- "model_sha": "894b60bbafd1de6b91cd582ad8015544ed8bf4ad",
13
- "model_dtype": "torch.bfloat16",
14
- "model_size": "24.56 GB",
15
- "config": null
16
- },
17
- "results": {
18
- "custom|aimo_kaggle_medium:v0|0": {
19
- "qem": 0.2,
20
- "qem_stderr": 0.06405126152203487
21
- },
22
- "all": {
23
- "qem": 0.2,
24
- "qem_stderr": 0.06405126152203487
25
- }
26
- },
27
- "versions": {
28
- "custom|aimo_kaggle_medium:v0|0": 0
29
- },
30
- "config_tasks": {
31
- "custom|aimo_kaggle_medium:v0": {
32
- "name": "aimo_kaggle_medium:v0",
33
- "prompt_function": "kaggle_medium_prompt_fn",
34
- "hf_repo": "AI-MO/kaggle-validation-set-medium",
35
- "hf_subset": "v0",
36
- "metric": [
37
- "quasi_exact_match_math"
38
- ],
39
- "hf_avail_splits": [
40
- "train"
41
- ],
42
- "evaluation_splits": [
43
- "train"
44
- ],
45
- "few_shots_split": null,
46
- "few_shots_select": null,
47
- "generation_size": 2048,
48
- "stop_sequence": null,
49
- "output_regex": null,
50
- "num_samples": null,
51
- "frozen": false,
52
- "suite": [
53
- "custom"
54
- ],
55
- "original_num_docs": 40,
56
- "effective_num_docs": 40,
57
- "trust_dataset": null,
58
- "must_remove_duplicate_docs": null,
59
- "version": 0
60
- }
61
- },
62
- "summary_tasks": {
63
- "custom|aimo_kaggle_medium:v0|0": {
64
- "hashes": {
65
- "hash_examples": "3401efda8b0cbcb5",
66
- "hash_full_prompts": "14cb6646c78f810c",
67
- "hash_input_tokens": "d1d016040eb15856",
68
- "hash_cont_tokens": "3f7b84c283243aa2"
69
- },
70
- "truncated": 40,
71
- "non_truncated": 0,
72
- "padded": 32,
73
- "non_padded": 8,
74
- "effective_few_shots": 0.0,
75
- "num_truncated_few_shots": 0
76
- }
77
- },
78
- "summary_general": {
79
- "hashes": {
80
- "hash_examples": "4c81d27cfdb9d737",
81
- "hash_full_prompts": "0d4787c840fc98b7",
82
- "hash_input_tokens": "7e52a8d7f3e15dcc",
83
- "hash_cont_tokens": "1ff5a5fbaaccab2f"
84
- },
85
- "truncated": 40,
86
- "non_truncated": 0,
87
- "padded": 32,
88
- "non_padded": 8,
89
- "num_truncated_few_shots": 0
90
- }
91
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.25/aimo_kaggle/results_2024-06-02T18-50-16.244304.json DELETED
@@ -1,91 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": 4,
6
- "max_samples": null,
7
- "job_id": "",
8
- "start_time": 276485.929851092,
9
- "end_time": 276724.204976521,
10
- "total_evaluation_time_secondes": "238.27512542903423",
11
- "model_name": "AI-MO/CodeLlama-13b-Python-hf-sft",
12
- "model_sha": "5803ef46ffcbaf9680dbe32c261bf2ce1f32f1c3",
13
- "model_dtype": "torch.bfloat16",
14
- "model_size": "24.56 GB",
15
- "config": null
16
- },
17
- "results": {
18
- "custom|aimo_kaggle:v0|0": {
19
- "qem": 0.08,
20
- "qem_stderr": 0.03875617133214439
21
- },
22
- "all": {
23
- "qem": 0.08,
24
- "qem_stderr": 0.03875617133214439
25
- }
26
- },
27
- "versions": {
28
- "custom|aimo_kaggle:v0|0": 0
29
- },
30
- "config_tasks": {
31
- "custom|aimo_kaggle:v0": {
32
- "name": "aimo_kaggle:v0",
33
- "prompt_function": "kaggle_prompt_fn",
34
- "hf_repo": "AI-MO/kaggle-validation-set",
35
- "hf_subset": "v0",
36
- "metric": [
37
- "quasi_exact_match_math"
38
- ],
39
- "hf_avail_splits": [
40
- "train"
41
- ],
42
- "evaluation_splits": [
43
- "train"
44
- ],
45
- "few_shots_split": null,
46
- "few_shots_select": null,
47
- "generation_size": 2048,
48
- "stop_sequence": null,
49
- "output_regex": null,
50
- "num_samples": null,
51
- "frozen": false,
52
- "suite": [
53
- "custom"
54
- ],
55
- "original_num_docs": 50,
56
- "effective_num_docs": 50,
57
- "trust_dataset": null,
58
- "must_remove_duplicate_docs": null,
59
- "version": 0
60
- }
61
- },
62
- "summary_tasks": {
63
- "custom|aimo_kaggle:v0|0": {
64
- "hashes": {
65
- "hash_examples": "2a919e6b839e921a",
66
- "hash_full_prompts": "e0424ade9e31a0fe",
67
- "hash_input_tokens": "437c7d81b73a6e7a",
68
- "hash_cont_tokens": "3d1548c398e66753"
69
- },
70
- "truncated": 47,
71
- "non_truncated": 3,
72
- "padded": 34,
73
- "non_padded": 16,
74
- "effective_few_shots": 0.0,
75
- "num_truncated_few_shots": 0
76
- }
77
- },
78
- "summary_general": {
79
- "hashes": {
80
- "hash_examples": "f59ec6c4e24c1226",
81
- "hash_full_prompts": "35e1e25418ccb6f7",
82
- "hash_input_tokens": "c227167cd34ee1ce",
83
- "hash_cont_tokens": "2d552d90e907933d"
84
- },
85
- "truncated": 47,
86
- "non_truncated": 3,
87
- "padded": 34,
88
- "non_padded": 16,
89
- "num_truncated_few_shots": 0
90
- }
91
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.25/aimo_kaggle_hard/results_2024-06-02T18-55-06.099176.json DELETED
@@ -1,91 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": 4,
6
- "max_samples": null,
7
- "job_id": "",
8
- "start_time": 517062.738187587,
9
- "end_time": 517381.957801859,
10
- "total_evaluation_time_secondes": "319.21961427200586",
11
- "model_name": "AI-MO/CodeLlama-13b-Python-hf-sft",
12
- "model_sha": "5803ef46ffcbaf9680dbe32c261bf2ce1f32f1c3",
13
- "model_dtype": "torch.bfloat16",
14
- "model_size": "24.56 GB",
15
- "config": null
16
- },
17
- "results": {
18
- "custom|aimo_kaggle_hard:v0|0": {
19
- "qem": 0.02,
20
- "qem_stderr": 0.01999999999999998
21
- },
22
- "all": {
23
- "qem": 0.02,
24
- "qem_stderr": 0.01999999999999998
25
- }
26
- },
27
- "versions": {
28
- "custom|aimo_kaggle_hard:v0|0": 0
29
- },
30
- "config_tasks": {
31
- "custom|aimo_kaggle_hard:v0": {
32
- "name": "aimo_kaggle_hard:v0",
33
- "prompt_function": "kaggle_hard_prompt_fn",
34
- "hf_repo": "AI-MO/kaggle-validation-set-hard",
35
- "hf_subset": "v0",
36
- "metric": [
37
- "quasi_exact_match_math"
38
- ],
39
- "hf_avail_splits": [
40
- "train"
41
- ],
42
- "evaluation_splits": [
43
- "train"
44
- ],
45
- "few_shots_split": null,
46
- "few_shots_select": null,
47
- "generation_size": 2048,
48
- "stop_sequence": null,
49
- "output_regex": null,
50
- "num_samples": null,
51
- "frozen": false,
52
- "suite": [
53
- "custom"
54
- ],
55
- "original_num_docs": 50,
56
- "effective_num_docs": 50,
57
- "trust_dataset": null,
58
- "must_remove_duplicate_docs": null,
59
- "version": 0
60
- }
61
- },
62
- "summary_tasks": {
63
- "custom|aimo_kaggle_hard:v0|0": {
64
- "hashes": {
65
- "hash_examples": "b40b6a493a95bf77",
66
- "hash_full_prompts": "2c0f18269c13af34",
67
- "hash_input_tokens": "b477f3b96ee9e7ba",
68
- "hash_cont_tokens": "4ddc90bc9d802107"
69
- },
70
- "truncated": 50,
71
- "non_truncated": 0,
72
- "padded": 37,
73
- "non_padded": 13,
74
- "effective_few_shots": 0.0,
75
- "num_truncated_few_shots": 0
76
- }
77
- },
78
- "summary_general": {
79
- "hashes": {
80
- "hash_examples": "79dbebcff6acad9e",
81
- "hash_full_prompts": "739e6bdec3671fc6",
82
- "hash_input_tokens": "16e2e8b5ca238d88",
83
- "hash_cont_tokens": "93760eb0e8c49d67"
84
- },
85
- "truncated": 50,
86
- "non_truncated": 0,
87
- "padded": 37,
88
- "non_padded": 13,
89
- "num_truncated_few_shots": 0
90
- }
91
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.25/aimo_kaggle_medium/results_2024-06-02T18-53-27.832845.json DELETED
@@ -1,91 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": 4,
6
- "max_samples": null,
7
- "job_id": "",
8
- "start_time": 406197.396604711,
9
- "end_time": 406527.808383541,
10
- "total_evaluation_time_secondes": "330.4117788299918",
11
- "model_name": "AI-MO/CodeLlama-13b-Python-hf-sft",
12
- "model_sha": "5803ef46ffcbaf9680dbe32c261bf2ce1f32f1c3",
13
- "model_dtype": "torch.bfloat16",
14
- "model_size": "24.56 GB",
15
- "config": null
16
- },
17
- "results": {
18
- "custom|aimo_kaggle_medium:v0|0": {
19
- "qem": 0.1,
20
- "qem_stderr": 0.04803844614152612
21
- },
22
- "all": {
23
- "qem": 0.1,
24
- "qem_stderr": 0.04803844614152612
25
- }
26
- },
27
- "versions": {
28
- "custom|aimo_kaggle_medium:v0|0": 0
29
- },
30
- "config_tasks": {
31
- "custom|aimo_kaggle_medium:v0": {
32
- "name": "aimo_kaggle_medium:v0",
33
- "prompt_function": "kaggle_medium_prompt_fn",
34
- "hf_repo": "AI-MO/kaggle-validation-set-medium",
35
- "hf_subset": "v0",
36
- "metric": [
37
- "quasi_exact_match_math"
38
- ],
39
- "hf_avail_splits": [
40
- "train"
41
- ],
42
- "evaluation_splits": [
43
- "train"
44
- ],
45
- "few_shots_split": null,
46
- "few_shots_select": null,
47
- "generation_size": 2048,
48
- "stop_sequence": null,
49
- "output_regex": null,
50
- "num_samples": null,
51
- "frozen": false,
52
- "suite": [
53
- "custom"
54
- ],
55
- "original_num_docs": 40,
56
- "effective_num_docs": 40,
57
- "trust_dataset": null,
58
- "must_remove_duplicate_docs": null,
59
- "version": 0
60
- }
61
- },
62
- "summary_tasks": {
63
- "custom|aimo_kaggle_medium:v0|0": {
64
- "hashes": {
65
- "hash_examples": "3401efda8b0cbcb5",
66
- "hash_full_prompts": "14cb6646c78f810c",
67
- "hash_input_tokens": "d1d016040eb15856",
68
- "hash_cont_tokens": "47262795fd1ed671"
69
- },
70
- "truncated": 40,
71
- "non_truncated": 0,
72
- "padded": 32,
73
- "non_padded": 8,
74
- "effective_few_shots": 0.0,
75
- "num_truncated_few_shots": 0
76
- }
77
- },
78
- "summary_general": {
79
- "hashes": {
80
- "hash_examples": "4c81d27cfdb9d737",
81
- "hash_full_prompts": "0d4787c840fc98b7",
82
- "hash_input_tokens": "7e52a8d7f3e15dcc",
83
- "hash_cont_tokens": "c12a9376b8d939d5"
84
- },
85
- "truncated": 40,
86
- "non_truncated": 0,
87
- "padded": 32,
88
- "non_padded": 8,
89
- "num_truncated_few_shots": 0
90
- }
91
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.26/aimo_kaggle/results_2024-06-01T21-39-31.404005.json DELETED
@@ -1,91 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": 4,
6
- "max_samples": null,
7
- "job_id": "",
8
- "start_time": 1601532.508414207,
9
- "end_time": 1601848.441672974,
10
- "total_evaluation_time_secondes": "315.9332587670069",
11
- "model_name": "AI-MO/CodeLlama-13b-Python-hf-sft",
12
- "model_sha": "d4222e8c590d6c31fda1b4b63ed5dbe6a28a1f0c",
13
- "model_dtype": "torch.bfloat16",
14
- "model_size": "24.56 GB",
15
- "config": null
16
- },
17
- "results": {
18
- "custom|aimo_kaggle:v0|0": {
19
- "qem": 0.16,
20
- "qem_stderr": 0.052372293656638154
21
- },
22
- "all": {
23
- "qem": 0.16,
24
- "qem_stderr": 0.052372293656638154
25
- }
26
- },
27
- "versions": {
28
- "custom|aimo_kaggle:v0|0": 0
29
- },
30
- "config_tasks": {
31
- "custom|aimo_kaggle:v0": {
32
- "name": "aimo_kaggle:v0",
33
- "prompt_function": "kaggle_prompt_fn",
34
- "hf_repo": "AI-MO/kaggle-validation-set",
35
- "hf_subset": "v0",
36
- "metric": [
37
- "quasi_exact_match_math"
38
- ],
39
- "hf_avail_splits": [
40
- "train"
41
- ],
42
- "evaluation_splits": [
43
- "train"
44
- ],
45
- "few_shots_split": null,
46
- "few_shots_select": null,
47
- "generation_size": 2048,
48
- "stop_sequence": null,
49
- "output_regex": null,
50
- "num_samples": null,
51
- "frozen": false,
52
- "suite": [
53
- "custom"
54
- ],
55
- "original_num_docs": 50,
56
- "effective_num_docs": 50,
57
- "trust_dataset": null,
58
- "must_remove_duplicate_docs": null,
59
- "version": 0
60
- }
61
- },
62
- "summary_tasks": {
63
- "custom|aimo_kaggle:v0|0": {
64
- "hashes": {
65
- "hash_examples": "2a919e6b839e921a",
66
- "hash_full_prompts": "e0424ade9e31a0fe",
67
- "hash_input_tokens": "437c7d81b73a6e7a",
68
- "hash_cont_tokens": "c0a5c2dbd8d43de0"
69
- },
70
- "truncated": 47,
71
- "non_truncated": 3,
72
- "padded": 34,
73
- "non_padded": 16,
74
- "effective_few_shots": 0.0,
75
- "num_truncated_few_shots": 0
76
- }
77
- },
78
- "summary_general": {
79
- "hashes": {
80
- "hash_examples": "f59ec6c4e24c1226",
81
- "hash_full_prompts": "35e1e25418ccb6f7",
82
- "hash_input_tokens": "c227167cd34ee1ce",
83
- "hash_cont_tokens": "ec9051d94c2aad79"
84
- },
85
- "truncated": 47,
86
- "non_truncated": 3,
87
- "padded": 34,
88
- "non_padded": 16,
89
- "num_truncated_few_shots": 0
90
- }
91
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/AI-MO/CodeLlama-13b-Python-hf-sft/aimo_v00.26/aimo_kaggle_hard/results_2024-06-01T21-43-22.900752.json DELETED
@@ -1,91 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": 4,
6
- "max_samples": null,
7
- "job_id": "",
8
- "start_time": 542430.398596583,
9
- "end_time": 542785.055968121,
10
- "total_evaluation_time_secondes": "354.65737153799273",
11
- "model_name": "AI-MO/CodeLlama-13b-Python-hf-sft",
12
- "model_sha": "d4222e8c590d6c31fda1b4b63ed5dbe6a28a1f0c",
13
- "model_dtype": "torch.bfloat16",
14
- "model_size": "24.56 GB",
15
- "config": null
16
- },
17
- "results": {
18
- "custom|aimo_kaggle_hard:v0|0": {
19
- "qem": 0.02,
20
- "qem_stderr": 0.019999999999999976
21
- },
22
- "all": {
23
- "qem": 0.02,
24
- "qem_stderr": 0.019999999999999976
25
- }
26
- },
27
- "versions": {
28
- "custom|aimo_kaggle_hard:v0|0": 0
29
- },
30
- "config_tasks": {
31
- "custom|aimo_kaggle_hard:v0": {
32
- "name": "aimo_kaggle_hard:v0",
33
- "prompt_function": "kaggle_hard_prompt_fn",
34
- "hf_repo": "AI-MO/kaggle-validation-set-hard",
35
- "hf_subset": "v0",
36
- "metric": [
37
- "quasi_exact_match_math"
38
- ],
39
- "hf_avail_splits": [
40
- "train"
41
- ],
42
- "evaluation_splits": [
43
- "train"
44
- ],
45
- "few_shots_split": null,
46
- "few_shots_select": null,
47
- "generation_size": 2048,
48
- "stop_sequence": null,
49
- "output_regex": null,
50
- "num_samples": null,
51
- "frozen": false,
52
- "suite": [
53
- "custom"
54
- ],
55
- "original_num_docs": 50,
56
- "effective_num_docs": 50,
57
- "trust_dataset": null,
58
- "must_remove_duplicate_docs": null,
59
- "version": 0
60
- }
61
- },
62
- "summary_tasks": {
63
- "custom|aimo_kaggle_hard:v0|0": {
64
- "hashes": {
65
- "hash_examples": "b40b6a493a95bf77",
66
- "hash_full_prompts": "2c0f18269c13af34",
67
- "hash_input_tokens": "b477f3b96ee9e7ba",
68
- "hash_cont_tokens": "685d74fcb7bc0954"
69
- },
70
- "truncated": 50,
71
- "non_truncated": 0,
72
- "padded": 37,
73
- "non_padded": 13,
74
- "effective_few_shots": 0.0,
75
- "num_truncated_few_shots": 0
76
- }
77
- },
78
- "summary_general": {
79
- "hashes": {
80
- "hash_examples": "79dbebcff6acad9e",
81
- "hash_full_prompts": "739e6bdec3671fc6",
82
- "hash_input_tokens": "16e2e8b5ca238d88",
83
- "hash_cont_tokens": "9ddec76b260758c3"
84
- },
85
- "truncated": 50,
86
- "non_truncated": 0,
87
- "padded": 37,
88
- "non_padded": 13,
89
- "num_truncated_few_shots": 0
90
- }
91
- }