diff --git a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v01.09/aimo_kaggle_hard_mmos/results_2024-05-24T09-30-06.964431.json b/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v01.09/aimo_kaggle_hard_mmos/results_2024-05-24T09-30-06.964431.json deleted file mode 100644 index 17db3cb7ec54d3ee08133fbb5cd59e734c7443a5..0000000000000000000000000000000000000000 --- a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v01.09/aimo_kaggle_hard_mmos/results_2024-05-24T09-30-06.964431.json +++ /dev/null @@ -1,91 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": 4, - "max_samples": null, - "job_id": "", - "start_time": 857446.223097524, - "end_time": 857817.902641332, - "total_evaluation_time_secondes": "371.6795438079862", - "model_name": "AI-MO/deepseek-math-7b-rl-sft", - "model_sha": "e9f1f0034b3f35397651713c744779b27c7b0d2f", - "model_dtype": "torch.bfloat16", - "model_size": "12.93 GB", - "config": null - }, - "results": { - "custom|aimo_kaggle_hard_mmos:v0|0": { - "qem": 0.04, - "qem_stderr": 0.027994168488950606 - }, - "all": { - "qem": 0.04, - "qem_stderr": 0.027994168488950606 - } - }, - "versions": { - "custom|aimo_kaggle_hard_mmos:v0|0": 0 - }, - "config_tasks": { - "custom|aimo_kaggle_hard_mmos:v0": { - "name": "aimo_kaggle_hard_mmos:v0", - "prompt_function": "mmos_hard_prompt_fn", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - } - }, - "summary_tasks": { - "custom|aimo_kaggle_hard_mmos:v0|0": { - "hashes": { - "hash_examples": "b40b6a493a95bf77", - "hash_full_prompts": "bf4ae1359925ab3b", - "hash_input_tokens": "5f11181c1839f9c3", - "hash_cont_tokens": "52b2b59de598fdd3" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 35, - "non_padded": 15, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "79dbebcff6acad9e", - "hash_full_prompts": "86a15c8410a494ef", - "hash_input_tokens": "28eff78f8cb1ac9f", - "hash_cont_tokens": "cc9bd188ea6976bc" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 35, - "non_padded": 15, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v01.09/aimo_kaggle_hard_pot/results_2024-05-27T14-37-04.090977.json b/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v01.09/aimo_kaggle_hard_pot/results_2024-05-27T14-37-04.090977.json deleted file mode 100644 index 2e62166ed37f11e77aec7640fcfc2bc2dcda34d6..0000000000000000000000000000000000000000 --- a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v01.09/aimo_kaggle_hard_pot/results_2024-05-27T14-37-04.090977.json +++ /dev/null @@ -1,193 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": 4, - "max_samples": null, - "job_id": "", - "start_time": 1143715.699540001, - "end_time": 1144501.128154096, - "total_evaluation_time_secondes": "785.4286140948534", - "model_name": "AI-MO/deepseek-math-7b-rl-sft", - "model_sha": "e9f1f0034b3f35397651713c744779b27c7b0d2f", - "model_dtype": "torch.bfloat16", - "model_size": "12.93 GB", - "config": null - }, - "results": { - "custom|aimo_kaggle_hard_pot:v0|0": { - "qem": 0.06, - "qem_stderr": 0.0339266916772512 - }, - "custom|aimo_kaggle_hard_pot:v1|0": { - "qem": 0.48, - "qem_stderr": 0.07137140569598172 - }, - "custom|aimo_kaggle_hard_pot:v2|0": { - "qem": 0.56, - "qem_stderr": 0.07091242083423345 - }, - "custom|aimo_kaggle_hard_pot:_average|0": { - "qem": 0.3666666666666667, - "qem_stderr": 0.058736839402488784 - }, - "all": { - "qem": 0.3666666666666667, - "qem_stderr": 0.058736839402488784 - } - }, - "versions": { - "custom|aimo_kaggle_hard_pot:v0|0": 0, - "custom|aimo_kaggle_hard_pot:v1|0": 0, - "custom|aimo_kaggle_hard_pot:v2|0": 0 - }, - "config_tasks": { - "custom|aimo_kaggle_hard_pot:v0": { - "name": "aimo_kaggle_hard_pot:v0", - "prompt_function": "kaggle_hard_pot_prompt_fn_v0", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_hard_pot:v1": { - "name": "aimo_kaggle_hard_pot:v1", - "prompt_function": "kaggle_hard_pot_prompt_fn_v1", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_hard_pot:v2": { - "name": "aimo_kaggle_hard_pot:v2", - "prompt_function": "kaggle_hard_pot_prompt_fn_v2", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - } - }, - "summary_tasks": { - "custom|aimo_kaggle_hard_pot:v0|0": { - "hashes": { - "hash_examples": "303213a38d9f7512", - "hash_full_prompts": "3c670fa0e80bc301", - "hash_input_tokens": "1bd4187ad6963415", - "hash_cont_tokens": "b73fa39a0f0c8082" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 37, - "non_padded": 13, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_hard_pot:v1|0": { - "hashes": { - "hash_examples": "e4234b97ad92862f", - "hash_full_prompts": "09e7759a96f64e59", - "hash_input_tokens": "dc7f68cf14be3d61", - "hash_cont_tokens": "1b2f354c5d65f92d" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 32, - "non_padded": 18, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_hard_pot:v2|0": { - "hashes": { - "hash_examples": "6396eb8833e13ba0", - "hash_full_prompts": "65f8b95ea99c7087", - "hash_input_tokens": "bddb67459e7601c5", - "hash_cont_tokens": "ddd3e2af9db6590b" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 30, - "non_padded": 20, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "648c9a107d279e1e", - "hash_full_prompts": "7f4c38eb08b3bb41", - "hash_input_tokens": "16f0fd76e0a2a165", - "hash_cont_tokens": "a225423cb032047d" - }, - "truncated": 150, - "non_truncated": 0, - "padded": 99, - "non_padded": 51, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v01.09/aimo_kaggle_medium_mmos/results_2024-05-24T09-29-31.249987.json b/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v01.09/aimo_kaggle_medium_mmos/results_2024-05-24T09-29-31.249987.json deleted file mode 100644 index b9e94c9263c105d4ade0539c76fe56b8f50a4410..0000000000000000000000000000000000000000 --- a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v01.09/aimo_kaggle_medium_mmos/results_2024-05-24T09-29-31.249987.json +++ /dev/null @@ -1,91 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": 4, - "max_samples": null, - "job_id": "", - "start_time": 727962.553320119, - "end_time": 728298.349595936, - "total_evaluation_time_secondes": "335.79627581697423", - "model_name": "AI-MO/deepseek-math-7b-rl-sft", - "model_sha": "e9f1f0034b3f35397651713c744779b27c7b0d2f", - "model_dtype": "torch.bfloat16", - "model_size": "12.93 GB", - "config": null - }, - "results": { - "custom|aimo_kaggle_medium_mmos:v0|0": { - "qem": 0.075, - "qem_stderr": 0.04217636961434869 - }, - "all": { - "qem": 0.075, - "qem_stderr": 0.04217636961434869 - } - }, - "versions": { - "custom|aimo_kaggle_medium_mmos:v0|0": 0 - }, - "config_tasks": { - "custom|aimo_kaggle_medium_mmos:v0": { - "name": "aimo_kaggle_medium_mmos:v0", - "prompt_function": "mmos_medium_prompt_fn", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - } - }, - "summary_tasks": { - "custom|aimo_kaggle_medium_mmos:v0|0": { - "hashes": { - "hash_examples": "3401efda8b0cbcb5", - "hash_full_prompts": "9553e2c8c7c87406", - "hash_input_tokens": "4f9c9b13fc9c2736", - "hash_cont_tokens": "60bf7b05082d037a" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 31, - "non_padded": 9, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "4c81d27cfdb9d737", - "hash_full_prompts": "466bf00fd3e3d325", - "hash_input_tokens": "f3be8baebf6e5c0d", - "hash_cont_tokens": "572a44bb3cfcb49a" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 31, - "non_padded": 9, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v01.09/aimo_kaggle_medium_pot/results_2024-05-27T14-31-39.435993.json b/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v01.09/aimo_kaggle_medium_pot/results_2024-05-27T14-31-39.435993.json deleted file mode 100644 index 4529f889c4cb68fbf0de67123f88f6e70a88089a..0000000000000000000000000000000000000000 --- a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v01.09/aimo_kaggle_medium_pot/results_2024-05-27T14-31-39.435993.json +++ /dev/null @@ -1,193 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": 4, - "max_samples": null, - "job_id": "", - "start_time": 438052.255329886, - "end_time": 438511.635606925, - "total_evaluation_time_secondes": "459.3802770390175", - "model_name": "AI-MO/deepseek-math-7b-rl-sft", - "model_sha": "e9f1f0034b3f35397651713c744779b27c7b0d2f", - "model_dtype": "torch.bfloat16", - "model_size": "12.93 GB", - "config": null - }, - "results": { - "custom|aimo_kaggle_medium_pot:v0|0": { - "qem": 0.125, - "qem_stderr": 0.05295740910852021 - }, - "custom|aimo_kaggle_medium_pot:v1|0": { - "qem": 0.15, - "qem_stderr": 0.05717718748968655 - }, - "custom|aimo_kaggle_medium_pot:v2|0": { - "qem": 0.2, - "qem_stderr": 0.06405126152203487 - }, - "custom|aimo_kaggle_medium_pot:_average|0": { - "qem": 0.15833333333333335, - "qem_stderr": 0.05806195270674721 - }, - "all": { - "qem": 0.15833333333333335, - "qem_stderr": 0.05806195270674721 - } - }, - "versions": { - "custom|aimo_kaggle_medium_pot:v0|0": 0, - "custom|aimo_kaggle_medium_pot:v1|0": 0, - "custom|aimo_kaggle_medium_pot:v2|0": 0 - }, - "config_tasks": { - "custom|aimo_kaggle_medium_pot:v0": { - "name": "aimo_kaggle_medium_pot:v0", - "prompt_function": "kaggle_medium_pot_prompt_fn_v0", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_medium_pot:v1": { - "name": "aimo_kaggle_medium_pot:v1", - "prompt_function": "kaggle_medium_pot_prompt_fn_v1", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_medium_pot:v2": { - "name": "aimo_kaggle_medium_pot:v2", - "prompt_function": "kaggle_medium_pot_prompt_fn_v2", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - } - }, - "summary_tasks": { - "custom|aimo_kaggle_medium_pot:v0|0": { - "hashes": { - "hash_examples": "2799c24461029dc3", - "hash_full_prompts": "de6572b914d5bba6", - "hash_input_tokens": "5af82ee284ccce29", - "hash_cont_tokens": "d3d0c2f5914eea85" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 29, - "non_padded": 11, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_medium_pot:v1|0": { - "hashes": { - "hash_examples": "806b2e2056b41f84", - "hash_full_prompts": "427d5de6e4d90df2", - "hash_input_tokens": "be3dcf9f8a2350d0", - "hash_cont_tokens": "70097c50fd00a4cd" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 26, - "non_padded": 14, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_medium_pot:v2|0": { - "hashes": { - "hash_examples": "d8534375acc5d427", - "hash_full_prompts": "6bb90c129d6d6123", - "hash_input_tokens": "22eeaedbeb3f56f0", - "hash_cont_tokens": "a052161a23e608a2" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 31, - "non_padded": 9, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "623505a45a4910c2", - "hash_full_prompts": "3aca0fad61a42921", - "hash_input_tokens": "c7d2c8bb76f8ecb1", - "hash_cont_tokens": "480ba89e1c3b08f4" - }, - "truncated": 120, - "non_truncated": 0, - "padded": 86, - "non_padded": 34, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v01.10/aimo_kaggle_hard_mmos/results_2024-05-24T09-38-01.891596.json b/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v01.10/aimo_kaggle_hard_mmos/results_2024-05-24T09-38-01.891596.json deleted file mode 100644 index 554bdf9936fc236d58af8b992070dec370c998b9..0000000000000000000000000000000000000000 --- a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v01.10/aimo_kaggle_hard_mmos/results_2024-05-24T09-38-01.891596.json +++ /dev/null @@ -1,91 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": 4, - "max_samples": null, - "job_id": "", - "start_time": 2115203.534425676, - "end_time": 2115616.753860881, - "total_evaluation_time_secondes": "413.21943520521745", - "model_name": "AI-MO/deepseek-math-7b-rl-sft", - "model_sha": "61a9aad6e4c0fc64852cc86f9381183b734be0e2", - "model_dtype": "torch.bfloat16", - "model_size": "12.93 GB", - "config": null - }, - "results": { - "custom|aimo_kaggle_hard_mmos:v0|0": { - "qem": 0.04, - "qem_stderr": 0.027994168488950612 - }, - "all": { - "qem": 0.04, - "qem_stderr": 0.027994168488950612 - } - }, - "versions": { - "custom|aimo_kaggle_hard_mmos:v0|0": 0 - }, - "config_tasks": { - "custom|aimo_kaggle_hard_mmos:v0": { - "name": "aimo_kaggle_hard_mmos:v0", - "prompt_function": "mmos_hard_prompt_fn", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - } - }, - "summary_tasks": { - "custom|aimo_kaggle_hard_mmos:v0|0": { - "hashes": { - "hash_examples": "b40b6a493a95bf77", - "hash_full_prompts": "bf4ae1359925ab3b", - "hash_input_tokens": "5f11181c1839f9c3", - "hash_cont_tokens": "70ab15813457a0e7" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 35, - "non_padded": 15, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "79dbebcff6acad9e", - "hash_full_prompts": "86a15c8410a494ef", - "hash_input_tokens": "28eff78f8cb1ac9f", - "hash_cont_tokens": "932433668b9d10d1" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 35, - "non_padded": 15, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v01.10/aimo_kaggle_hard_pot/results_2024-05-27T12-10-10.985244.json b/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v01.10/aimo_kaggle_hard_pot/results_2024-05-27T12-10-10.985244.json deleted file mode 100644 index 17dd8a267b529f6de0186038c2f605167988999c..0000000000000000000000000000000000000000 --- a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v01.10/aimo_kaggle_hard_pot/results_2024-05-27T12-10-10.985244.json +++ /dev/null @@ -1,91 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": 4, - "max_samples": null, - "job_id": "", - "start_time": 4049391.660600598, - "end_time": 4049852.921236317, - "total_evaluation_time_secondes": "461.26063571916893", - "model_name": "AI-MO/deepseek-math-7b-rl-sft", - "model_sha": "61a9aad6e4c0fc64852cc86f9381183b734be0e2", - "model_dtype": "torch.bfloat16", - "model_size": "12.93 GB", - "config": null - }, - "results": { - "custom|aimo_kaggle_hard_pot:v0|0": { - "qem": 0.08, - "qem_stderr": 0.038756171332144415 - }, - "all": { - "qem": 0.08, - "qem_stderr": 0.038756171332144415 - } - }, - "versions": { - "custom|aimo_kaggle_hard_pot:v0|0": 0 - }, - "config_tasks": { - "custom|aimo_kaggle_hard_pot:v0": { - "name": "aimo_kaggle_hard_pot:v0", - "prompt_function": "kaggle_hard_pot_prompt_fn_v0", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - } - }, - "summary_tasks": { - "custom|aimo_kaggle_hard_pot:v0|0": { - "hashes": { - "hash_examples": "303213a38d9f7512", - "hash_full_prompts": "3c670fa0e80bc301", - "hash_input_tokens": "69dabd3aaebed332", - "hash_cont_tokens": "38540998e511c7fa" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 35, - "non_padded": 15, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "cae48890a7c47904", - "hash_full_prompts": "6f120f8d8bb938e6", - "hash_input_tokens": "ff7094ff98f5f808", - "hash_cont_tokens": "d081df183e2788f4" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 35, - "non_padded": 15, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v01.10/aimo_kaggle_hard_pot/results_2024-05-27T12-38-20.637098.json b/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v01.10/aimo_kaggle_hard_pot/results_2024-05-27T12-38-20.637098.json deleted file mode 100644 index a3808549f4784aa61963cbe695a78b243c44faf1..0000000000000000000000000000000000000000 --- a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v01.10/aimo_kaggle_hard_pot/results_2024-05-27T12-38-20.637098.json +++ /dev/null @@ -1,91 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": 4, - "max_samples": null, - "job_id": "", - "start_time": 9223358.910966022, - "end_time": 9223818.340137284, - "total_evaluation_time_secondes": "459.42917126230896", - "model_name": "AI-MO/deepseek-math-7b-rl-sft", - "model_sha": "61a9aad6e4c0fc64852cc86f9381183b734be0e2", - "model_dtype": "torch.bfloat16", - "model_size": "12.93 GB", - "config": null - }, - "results": { - "custom|aimo_kaggle_hard_pot:v0|0": { - "qem": 0.08, - "qem_stderr": 0.038756171332144415 - }, - "all": { - "qem": 0.08, - "qem_stderr": 0.038756171332144415 - } - }, - "versions": { - "custom|aimo_kaggle_hard_pot:v0|0": 0 - }, - "config_tasks": { - "custom|aimo_kaggle_hard_pot:v0": { - "name": "aimo_kaggle_hard_pot:v0", - "prompt_function": "kaggle_hard_pot_prompt_fn_v0", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - } - }, - "summary_tasks": { - "custom|aimo_kaggle_hard_pot:v0|0": { - "hashes": { - "hash_examples": "303213a38d9f7512", - "hash_full_prompts": "3c670fa0e80bc301", - "hash_input_tokens": "69dabd3aaebed332", - "hash_cont_tokens": "38540998e511c7fa" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 35, - "non_padded": 15, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "cae48890a7c47904", - "hash_full_prompts": "6f120f8d8bb938e6", - "hash_input_tokens": "ff7094ff98f5f808", - "hash_cont_tokens": "d081df183e2788f4" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 35, - "non_padded": 15, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v01.10/aimo_kaggle_hard_pot/results_2024-05-27T13-19-13.846655.json b/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v01.10/aimo_kaggle_hard_pot/results_2024-05-27T13-19-13.846655.json deleted file mode 100644 index 8bfcd6afcccfaba2b2020c10be0a95fe8a31ed60..0000000000000000000000000000000000000000 --- a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v01.10/aimo_kaggle_hard_pot/results_2024-05-27T13-19-13.846655.json +++ /dev/null @@ -1,193 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": 4, - "max_samples": null, - "job_id": "", - "start_time": 2072215.893674265, - "end_time": 2073130.940811355, - "total_evaluation_time_secondes": "915.047137090005", - "model_name": "AI-MO/deepseek-math-7b-rl-sft", - "model_sha": "61a9aad6e4c0fc64852cc86f9381183b734be0e2", - "model_dtype": "torch.bfloat16", - "model_size": "12.93 GB", - "config": null - }, - "results": { - "custom|aimo_kaggle_hard_pot:v0|0": { - "qem": 0.04, - "qem_stderr": 0.02799416848895062 - }, - "custom|aimo_kaggle_hard_pot:v1|0": { - "qem": 0.48, - "qem_stderr": 0.07137140569598172 - }, - "custom|aimo_kaggle_hard_pot:v2|0": { - "qem": 0.58, - "qem_stderr": 0.07050835816716033 - }, - "custom|aimo_kaggle_hard_pot:_average|0": { - "qem": 0.3666666666666667, - "qem_stderr": 0.056624644117364224 - }, - "all": { - "qem": 0.3666666666666667, - "qem_stderr": 0.056624644117364224 - } - }, - "versions": { - "custom|aimo_kaggle_hard_pot:v0|0": 0, - "custom|aimo_kaggle_hard_pot:v1|0": 0, - "custom|aimo_kaggle_hard_pot:v2|0": 0 - }, - "config_tasks": { - "custom|aimo_kaggle_hard_pot:v0": { - "name": "aimo_kaggle_hard_pot:v0", - "prompt_function": "kaggle_hard_pot_prompt_fn_v0", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_hard_pot:v1": { - "name": "aimo_kaggle_hard_pot:v1", - "prompt_function": "kaggle_hard_pot_prompt_fn_v1", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_hard_pot:v2": { - "name": "aimo_kaggle_hard_pot:v2", - "prompt_function": "kaggle_hard_pot_prompt_fn_v2", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - } - }, - "summary_tasks": { - "custom|aimo_kaggle_hard_pot:v0|0": { - "hashes": { - "hash_examples": "303213a38d9f7512", - "hash_full_prompts": "3c670fa0e80bc301", - "hash_input_tokens": "1bd4187ad6963415", - "hash_cont_tokens": "1847a73ee97b84cf" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 37, - "non_padded": 13, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_hard_pot:v1|0": { - "hashes": { - "hash_examples": "e4234b97ad92862f", - "hash_full_prompts": "09e7759a96f64e59", - "hash_input_tokens": "dc7f68cf14be3d61", - "hash_cont_tokens": "07a33d50dbb61bc0" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 32, - "non_padded": 18, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_hard_pot:v2|0": { - "hashes": { - "hash_examples": "6396eb8833e13ba0", - "hash_full_prompts": "65f8b95ea99c7087", - "hash_input_tokens": "bddb67459e7601c5", - "hash_cont_tokens": "81be89d98a712a48" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 30, - "non_padded": 20, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "648c9a107d279e1e", - "hash_full_prompts": "7f4c38eb08b3bb41", - "hash_input_tokens": "16f0fd76e0a2a165", - "hash_cont_tokens": "be0f2fadf1d23c89" - }, - "truncated": 150, - "non_truncated": 0, - "padded": 99, - "non_padded": 51, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v01.10/aimo_kaggle_medium_mmos/results_2024-05-24T09-36-20.897469.json b/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v01.10/aimo_kaggle_medium_mmos/results_2024-05-24T09-36-20.897469.json deleted file mode 100644 index b7a3b8de8e10eb4e0bd59f39091dc40cc1c37fec..0000000000000000000000000000000000000000 --- a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v01.10/aimo_kaggle_medium_mmos/results_2024-05-24T09-36-20.897469.json +++ /dev/null @@ -1,91 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": 4, - "max_samples": null, - "job_id": "", - "start_time": 2117005.70480802, - "end_time": 2117317.858500111, - "total_evaluation_time_secondes": "312.15369209088385", - "model_name": "AI-MO/deepseek-math-7b-rl-sft", - "model_sha": "61a9aad6e4c0fc64852cc86f9381183b734be0e2", - "model_dtype": "torch.bfloat16", - "model_size": "12.93 GB", - "config": null - }, - "results": { - "custom|aimo_kaggle_medium_mmos:v0|0": { - "qem": 0.175, - "qem_stderr": 0.060843430844447585 - }, - "all": { - "qem": 0.175, - "qem_stderr": 0.060843430844447585 - } - }, - "versions": { - "custom|aimo_kaggle_medium_mmos:v0|0": 0 - }, - "config_tasks": { - "custom|aimo_kaggle_medium_mmos:v0": { - "name": "aimo_kaggle_medium_mmos:v0", - "prompt_function": "mmos_medium_prompt_fn", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - } - }, - "summary_tasks": { - "custom|aimo_kaggle_medium_mmos:v0|0": { - "hashes": { - "hash_examples": "3401efda8b0cbcb5", - "hash_full_prompts": "9553e2c8c7c87406", - "hash_input_tokens": "4f9c9b13fc9c2736", - "hash_cont_tokens": "dcf1a76b60e91789" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 31, - "non_padded": 9, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "4c81d27cfdb9d737", - "hash_full_prompts": "466bf00fd3e3d325", - "hash_input_tokens": "f3be8baebf6e5c0d", - "hash_cont_tokens": "4e52fc07aed5d27d" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 31, - "non_padded": 9, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v01.10/aimo_kaggle_medium_pot/results_2024-05-27T12-08-02.630027.json b/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v01.10/aimo_kaggle_medium_pot/results_2024-05-27T12-08-02.630027.json deleted file mode 100644 index f28c30b3e10aae436fdd5462aeb298859b115436..0000000000000000000000000000000000000000 --- a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v01.10/aimo_kaggle_medium_pot/results_2024-05-27T12-08-02.630027.json +++ /dev/null @@ -1,91 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": 4, - "max_samples": null, - "job_id": "", - "start_time": 2108115.015177626, - "end_time": 2108455.320385662, - "total_evaluation_time_secondes": "340.3052080357447", - "model_name": "AI-MO/deepseek-math-7b-rl-sft", - "model_sha": "61a9aad6e4c0fc64852cc86f9381183b734be0e2", - "model_dtype": "torch.bfloat16", - "model_size": "12.93 GB", - "config": null - }, - "results": { - "custom|aimo_kaggle_medium_pot:v0|0": { - "qem": 0.175, - "qem_stderr": 0.06084343084444758 - }, - "all": { - "qem": 0.175, - "qem_stderr": 0.06084343084444758 - } - }, - "versions": { - "custom|aimo_kaggle_medium_pot:v0|0": 0 - }, - "config_tasks": { - "custom|aimo_kaggle_medium_pot:v0": { - "name": "aimo_kaggle_medium_pot:v0", - "prompt_function": "kaggle_medium_pot_prompt_fn_v0", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - } - }, - "summary_tasks": { - "custom|aimo_kaggle_medium_pot:v0|0": { - "hashes": { - "hash_examples": "2799c24461029dc3", - "hash_full_prompts": "de6572b914d5bba6", - "hash_input_tokens": "bb5427e5227cc6b4", - "hash_cont_tokens": "d8fff7d9380f6289" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 31, - "non_padded": 9, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "c72202b0a18ef5f9", - "hash_full_prompts": "a0429b0dea3e33db", - "hash_input_tokens": "8e6ef9684e80b71f", - "hash_cont_tokens": "4b3f922cf74d16d1" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 31, - "non_padded": 9, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v01.10/aimo_kaggle_medium_pot/results_2024-05-27T12-36-15.235885.json b/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v01.10/aimo_kaggle_medium_pot/results_2024-05-27T12-36-15.235885.json deleted file mode 100644 index cfcb91c0493247adb03cec7eac2818478393be04..0000000000000000000000000000000000000000 --- a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v01.10/aimo_kaggle_medium_pot/results_2024-05-27T12-36-15.235885.json +++ /dev/null @@ -1,91 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": 4, - "max_samples": null, - "job_id": "", - "start_time": 2106210.420319901, - "end_time": 2106551.181892837, - "total_evaluation_time_secondes": "340.7615729360841", - "model_name": "AI-MO/deepseek-math-7b-rl-sft", - "model_sha": "61a9aad6e4c0fc64852cc86f9381183b734be0e2", - "model_dtype": "torch.bfloat16", - "model_size": "12.93 GB", - "config": null - }, - "results": { - "custom|aimo_kaggle_medium_pot:v0|0": { - "qem": 0.175, - "qem_stderr": 0.06084343084444758 - }, - "all": { - "qem": 0.175, - "qem_stderr": 0.06084343084444758 - } - }, - "versions": { - "custom|aimo_kaggle_medium_pot:v0|0": 0 - }, - "config_tasks": { - "custom|aimo_kaggle_medium_pot:v0": { - "name": "aimo_kaggle_medium_pot:v0", - "prompt_function": "kaggle_medium_pot_prompt_fn_v0", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - } - }, - "summary_tasks": { - "custom|aimo_kaggle_medium_pot:v0|0": { - "hashes": { - "hash_examples": "2799c24461029dc3", - "hash_full_prompts": "de6572b914d5bba6", - "hash_input_tokens": "bb5427e5227cc6b4", - "hash_cont_tokens": "d8fff7d9380f6289" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 31, - "non_padded": 9, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "c72202b0a18ef5f9", - "hash_full_prompts": "a0429b0dea3e33db", - "hash_input_tokens": "8e6ef9684e80b71f", - "hash_cont_tokens": "4b3f922cf74d16d1" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 31, - "non_padded": 9, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v01.10/aimo_kaggle_medium_pot/results_2024-05-27T13-10-03.543903.json b/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v01.10/aimo_kaggle_medium_pot/results_2024-05-27T13-10-03.543903.json deleted file mode 100644 index e277440c82a45f97c713be5509a02fd703b86132..0000000000000000000000000000000000000000 --- a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v01.10/aimo_kaggle_medium_pot/results_2024-05-27T13-10-03.543903.json +++ /dev/null @@ -1,193 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": 4, - "max_samples": null, - "job_id": "", - "start_time": 9225308.278480008, - "end_time": 9225721.246559735, - "total_evaluation_time_secondes": "412.96807972714305", - "model_name": "AI-MO/deepseek-math-7b-rl-sft", - "model_sha": "61a9aad6e4c0fc64852cc86f9381183b734be0e2", - "model_dtype": "torch.bfloat16", - "model_size": "12.93 GB", - "config": null - }, - "results": { - "custom|aimo_kaggle_medium_pot:v0|0": { - "qem": 0.175, - "qem_stderr": 0.060843430844447585 - }, - "custom|aimo_kaggle_medium_pot:v1|0": { - "qem": 0.15, - "qem_stderr": 0.05717718748968655 - }, - "custom|aimo_kaggle_medium_pot:v2|0": { - "qem": 0.225, - "qem_stderr": 0.06686668711812965 - }, - "custom|aimo_kaggle_medium_pot:_average|0": { - "qem": 0.18333333333333332, - "qem_stderr": 0.06162910181742126 - }, - "all": { - "qem": 0.18333333333333332, - "qem_stderr": 0.06162910181742126 - } - }, - "versions": { - "custom|aimo_kaggle_medium_pot:v0|0": 0, - "custom|aimo_kaggle_medium_pot:v1|0": 0, - "custom|aimo_kaggle_medium_pot:v2|0": 0 - }, - "config_tasks": { - "custom|aimo_kaggle_medium_pot:v0": { - "name": "aimo_kaggle_medium_pot:v0", - "prompt_function": "kaggle_medium_pot_prompt_fn_v0", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_medium_pot:v1": { - "name": "aimo_kaggle_medium_pot:v1", - "prompt_function": "kaggle_medium_pot_prompt_fn_v1", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_medium_pot:v2": { - "name": "aimo_kaggle_medium_pot:v2", - "prompt_function": "kaggle_medium_pot_prompt_fn_v2", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - } - }, - "summary_tasks": { - "custom|aimo_kaggle_medium_pot:v0|0": { - "hashes": { - "hash_examples": "2799c24461029dc3", - "hash_full_prompts": "de6572b914d5bba6", - "hash_input_tokens": "5af82ee284ccce29", - "hash_cont_tokens": "7bb95d2c217c1b5c" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 29, - "non_padded": 11, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_medium_pot:v1|0": { - "hashes": { - "hash_examples": "806b2e2056b41f84", - "hash_full_prompts": "427d5de6e4d90df2", - "hash_input_tokens": "be3dcf9f8a2350d0", - "hash_cont_tokens": "6c0f57c20d743daf" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 26, - "non_padded": 14, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_medium_pot:v2|0": { - "hashes": { - "hash_examples": "d8534375acc5d427", - "hash_full_prompts": "6bb90c129d6d6123", - "hash_input_tokens": "22eeaedbeb3f56f0", - "hash_cont_tokens": "8adfed93bcb46450" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 31, - "non_padded": 9, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "623505a45a4910c2", - "hash_full_prompts": "3aca0fad61a42921", - "hash_input_tokens": "c7d2c8bb76f8ecb1", - "hash_cont_tokens": "bd6abfe65da8351f" - }, - "truncated": 120, - "non_truncated": 0, - "padded": 86, - "non_padded": 34, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v01.11/aimo_kaggle_hard_mmos/results_2024-05-24T09-36-06.066222.json b/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v01.11/aimo_kaggle_hard_mmos/results_2024-05-24T09-36-06.066222.json deleted file mode 100644 index a892606e4e6cf1a22242e78f41c6203ccf565a16..0000000000000000000000000000000000000000 --- a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v01.11/aimo_kaggle_hard_mmos/results_2024-05-24T09-36-06.066222.json +++ /dev/null @@ -1,91 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": 4, - "max_samples": null, - "job_id": "", - "start_time": 1870349.343120157, - "end_time": 1870680.547112474, - "total_evaluation_time_secondes": "331.203992316965", - "model_name": "AI-MO/deepseek-math-7b-rl-sft", - "model_sha": "0b525f62d709e3bb0fed36d4055acadaeffb2fea", - "model_dtype": "torch.bfloat16", - "model_size": "12.93 GB", - "config": null - }, - "results": { - "custom|aimo_kaggle_hard_mmos:v0|0": { - "qem": 0.0, - "qem_stderr": 0.0 - }, - "all": { - "qem": 0.0, - "qem_stderr": 0.0 - } - }, - "versions": { - "custom|aimo_kaggle_hard_mmos:v0|0": 0 - }, - "config_tasks": { - "custom|aimo_kaggle_hard_mmos:v0": { - "name": "aimo_kaggle_hard_mmos:v0", - "prompt_function": "mmos_hard_prompt_fn", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - } - }, - "summary_tasks": { - "custom|aimo_kaggle_hard_mmos:v0|0": { - "hashes": { - "hash_examples": "b40b6a493a95bf77", - "hash_full_prompts": "bf4ae1359925ab3b", - "hash_input_tokens": "5f11181c1839f9c3", - "hash_cont_tokens": "747646d721b689b3" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 35, - "non_padded": 15, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "79dbebcff6acad9e", - "hash_full_prompts": "86a15c8410a494ef", - "hash_input_tokens": "28eff78f8cb1ac9f", - "hash_cont_tokens": "6df83d700244dcf8" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 35, - "non_padded": 15, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v01.11/aimo_kaggle_hard_pot/results_2024-05-27T14-35-43.671086.json b/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v01.11/aimo_kaggle_hard_pot/results_2024-05-27T14-35-43.671086.json deleted file mode 100644 index cc7df02418bcd327d922f97a528715b720c0ecec..0000000000000000000000000000000000000000 --- a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v01.11/aimo_kaggle_hard_pot/results_2024-05-27T14-35-43.671086.json +++ /dev/null @@ -1,193 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": 4, - "max_samples": null, - "job_id": "", - "start_time": 342864.500085301, - "end_time": 343553.631159885, - "total_evaluation_time_secondes": "689.1310745839728", - "model_name": "AI-MO/deepseek-math-7b-rl-sft", - "model_sha": "0b525f62d709e3bb0fed36d4055acadaeffb2fea", - "model_dtype": "torch.bfloat16", - "model_size": "12.93 GB", - "config": null - }, - "results": { - "custom|aimo_kaggle_hard_pot:v0|0": { - "qem": 0.04, - "qem_stderr": 0.02799416848895062 - }, - "custom|aimo_kaggle_hard_pot:v1|0": { - "qem": 0.22, - "qem_stderr": 0.05917804336345136 - }, - "custom|aimo_kaggle_hard_pot:v2|0": { - "qem": 0.5, - "qem_stderr": 0.07142857142857142 - }, - "custom|aimo_kaggle_hard_pot:_average|0": { - "qem": 0.25333333333333335, - "qem_stderr": 0.05286692776032447 - }, - "all": { - "qem": 0.25333333333333335, - "qem_stderr": 0.05286692776032447 - } - }, - "versions": { - "custom|aimo_kaggle_hard_pot:v0|0": 0, - "custom|aimo_kaggle_hard_pot:v1|0": 0, - "custom|aimo_kaggle_hard_pot:v2|0": 0 - }, - "config_tasks": { - "custom|aimo_kaggle_hard_pot:v0": { - "name": "aimo_kaggle_hard_pot:v0", - "prompt_function": "kaggle_hard_pot_prompt_fn_v0", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_hard_pot:v1": { - "name": "aimo_kaggle_hard_pot:v1", - "prompt_function": "kaggle_hard_pot_prompt_fn_v1", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_hard_pot:v2": { - "name": "aimo_kaggle_hard_pot:v2", - "prompt_function": "kaggle_hard_pot_prompt_fn_v2", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - } - }, - "summary_tasks": { - "custom|aimo_kaggle_hard_pot:v0|0": { - "hashes": { - "hash_examples": "303213a38d9f7512", - "hash_full_prompts": "3c670fa0e80bc301", - "hash_input_tokens": "1bd4187ad6963415", - "hash_cont_tokens": "954b24cc2bdc4e6c" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 37, - "non_padded": 13, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_hard_pot:v1|0": { - "hashes": { - "hash_examples": "e4234b97ad92862f", - "hash_full_prompts": "09e7759a96f64e59", - "hash_input_tokens": "dc7f68cf14be3d61", - "hash_cont_tokens": "3a75a70b0409cc6d" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 32, - "non_padded": 18, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_hard_pot:v2|0": { - "hashes": { - "hash_examples": "6396eb8833e13ba0", - "hash_full_prompts": "65f8b95ea99c7087", - "hash_input_tokens": "bddb67459e7601c5", - "hash_cont_tokens": "277be9de6dc4234e" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 30, - "non_padded": 20, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "648c9a107d279e1e", - "hash_full_prompts": "7f4c38eb08b3bb41", - "hash_input_tokens": "16f0fd76e0a2a165", - "hash_cont_tokens": "5297800052500dc2" - }, - "truncated": 150, - "non_truncated": 0, - "padded": 99, - "non_padded": 51, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v01.11/aimo_kaggle_medium_mmos/results_2024-05-24T09-36-32.170044.json b/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v01.11/aimo_kaggle_medium_mmos/results_2024-05-24T09-36-32.170044.json deleted file mode 100644 index 150155474d619756c56f50ec7eb0b3439df406af..0000000000000000000000000000000000000000 --- a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v01.11/aimo_kaggle_medium_mmos/results_2024-05-24T09-36-32.170044.json +++ /dev/null @@ -1,91 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": 4, - "max_samples": null, - "job_id": "", - "start_time": 2420257.809069017, - "end_time": 2420609.398094652, - "total_evaluation_time_secondes": "351.58902563527226", - "model_name": "AI-MO/deepseek-math-7b-rl-sft", - "model_sha": "0b525f62d709e3bb0fed36d4055acadaeffb2fea", - "model_dtype": "torch.bfloat16", - "model_size": "12.93 GB", - "config": null - }, - "results": { - "custom|aimo_kaggle_medium_mmos:v0|0": { - "qem": 0.0, - "qem_stderr": 0.0 - }, - "all": { - "qem": 0.0, - "qem_stderr": 0.0 - } - }, - "versions": { - "custom|aimo_kaggle_medium_mmos:v0|0": 0 - }, - "config_tasks": { - "custom|aimo_kaggle_medium_mmos:v0": { - "name": "aimo_kaggle_medium_mmos:v0", - "prompt_function": "mmos_medium_prompt_fn", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - } - }, - "summary_tasks": { - "custom|aimo_kaggle_medium_mmos:v0|0": { - "hashes": { - "hash_examples": "3401efda8b0cbcb5", - "hash_full_prompts": "9553e2c8c7c87406", - "hash_input_tokens": "4f9c9b13fc9c2736", - "hash_cont_tokens": "e0e2d443b31f87e8" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 31, - "non_padded": 9, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "4c81d27cfdb9d737", - "hash_full_prompts": "466bf00fd3e3d325", - "hash_input_tokens": "f3be8baebf6e5c0d", - "hash_cont_tokens": "67c15a3d33fd85be" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 31, - "non_padded": 9, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v01.11/aimo_kaggle_medium_pot/results_2024-05-27T14-31-11.406681.json b/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v01.11/aimo_kaggle_medium_pot/results_2024-05-27T14-31-11.406681.json deleted file mode 100644 index 51b41154d07aaddca68264e038d981b80a282778..0000000000000000000000000000000000000000 --- a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v01.11/aimo_kaggle_medium_pot/results_2024-05-27T14-31-11.406681.json +++ /dev/null @@ -1,193 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": 4, - "max_samples": null, - "job_id": "", - "start_time": 277940.525956996, - "end_time": 278355.570149596, - "total_evaluation_time_secondes": "415.0441926000058", - "model_name": "AI-MO/deepseek-math-7b-rl-sft", - "model_sha": "0b525f62d709e3bb0fed36d4055acadaeffb2fea", - "model_dtype": "torch.bfloat16", - "model_size": "12.93 GB", - "config": null - }, - "results": { - "custom|aimo_kaggle_medium_pot:v0|0": { - "qem": 0.1, - "qem_stderr": 0.04803844614152612 - }, - "custom|aimo_kaggle_medium_pot:v1|0": { - "qem": 0.1, - "qem_stderr": 0.04803844614152612 - }, - "custom|aimo_kaggle_medium_pot:v2|0": { - "qem": 0.125, - "qem_stderr": 0.05295740910852021 - }, - "custom|aimo_kaggle_medium_pot:_average|0": { - "qem": 0.10833333333333334, - "qem_stderr": 0.04967810046385748 - }, - "all": { - "qem": 0.10833333333333334, - "qem_stderr": 0.04967810046385748 - } - }, - "versions": { - "custom|aimo_kaggle_medium_pot:v0|0": 0, - "custom|aimo_kaggle_medium_pot:v1|0": 0, - "custom|aimo_kaggle_medium_pot:v2|0": 0 - }, - "config_tasks": { - "custom|aimo_kaggle_medium_pot:v0": { - "name": "aimo_kaggle_medium_pot:v0", - "prompt_function": "kaggle_medium_pot_prompt_fn_v0", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_medium_pot:v1": { - "name": "aimo_kaggle_medium_pot:v1", - "prompt_function": "kaggle_medium_pot_prompt_fn_v1", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_medium_pot:v2": { - "name": "aimo_kaggle_medium_pot:v2", - "prompt_function": "kaggle_medium_pot_prompt_fn_v2", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - } - }, - "summary_tasks": { - "custom|aimo_kaggle_medium_pot:v0|0": { - "hashes": { - "hash_examples": "2799c24461029dc3", - "hash_full_prompts": "de6572b914d5bba6", - "hash_input_tokens": "5af82ee284ccce29", - "hash_cont_tokens": "8aca826f529dccef" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 29, - "non_padded": 11, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_medium_pot:v1|0": { - "hashes": { - "hash_examples": "806b2e2056b41f84", - "hash_full_prompts": "427d5de6e4d90df2", - "hash_input_tokens": "be3dcf9f8a2350d0", - "hash_cont_tokens": "2beb929a9d1f02a0" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 26, - "non_padded": 14, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_medium_pot:v2|0": { - "hashes": { - "hash_examples": "d8534375acc5d427", - "hash_full_prompts": "6bb90c129d6d6123", - "hash_input_tokens": "22eeaedbeb3f56f0", - "hash_cont_tokens": "547712b40f6f642f" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 31, - "non_padded": 9, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "623505a45a4910c2", - "hash_full_prompts": "3aca0fad61a42921", - "hash_input_tokens": "c7d2c8bb76f8ecb1", - "hash_cont_tokens": "7494ce6ef42cbe4e" - }, - "truncated": 120, - "non_truncated": 0, - "padded": 86, - "non_padded": 34, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v01.12/aimo_kaggle_hard_mmos/results_2024-05-24T09-48-10.979290.json b/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v01.12/aimo_kaggle_hard_mmos/results_2024-05-24T09-48-10.979290.json deleted file mode 100644 index 709408a330d3d79056a05fbb4c36a5edbbbdb105..0000000000000000000000000000000000000000 --- a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v01.12/aimo_kaggle_hard_mmos/results_2024-05-24T09-48-10.979290.json +++ /dev/null @@ -1,91 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": 4, - "max_samples": null, - "job_id": "", - "start_time": 1871072.599385947, - "end_time": 1871405.4602378, - "total_evaluation_time_secondes": "332.86085185292177", - "model_name": "AI-MO/deepseek-math-7b-rl-sft", - "model_sha": "feac1ee8323888504974e4c56f4dd29dec9f5eea", - "model_dtype": "torch.bfloat16", - "model_size": "12.93 GB", - "config": null - }, - "results": { - "custom|aimo_kaggle_hard_mmos:v0|0": { - "qem": 0.0, - "qem_stderr": 0.0 - }, - "all": { - "qem": 0.0, - "qem_stderr": 0.0 - } - }, - "versions": { - "custom|aimo_kaggle_hard_mmos:v0|0": 0 - }, - "config_tasks": { - "custom|aimo_kaggle_hard_mmos:v0": { - "name": "aimo_kaggle_hard_mmos:v0", - "prompt_function": "mmos_hard_prompt_fn", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - } - }, - "summary_tasks": { - "custom|aimo_kaggle_hard_mmos:v0|0": { - "hashes": { - "hash_examples": "b40b6a493a95bf77", - "hash_full_prompts": "bf4ae1359925ab3b", - "hash_input_tokens": "5f11181c1839f9c3", - "hash_cont_tokens": "6431505c900c5db1" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 35, - "non_padded": 15, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "79dbebcff6acad9e", - "hash_full_prompts": "86a15c8410a494ef", - "hash_input_tokens": "28eff78f8cb1ac9f", - "hash_cont_tokens": "6b37585ba81ea148" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 35, - "non_padded": 15, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v01.12/aimo_kaggle_hard_pot/results_2024-05-27T14-35-53.533059.json b/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v01.12/aimo_kaggle_hard_pot/results_2024-05-27T14-35-53.533059.json deleted file mode 100644 index cdbec4816cd8c08e72fc4c14394a3e528ae7fe8e..0000000000000000000000000000000000000000 --- a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v01.12/aimo_kaggle_hard_pot/results_2024-05-27T14-35-53.533059.json +++ /dev/null @@ -1,193 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": 4, - "max_samples": null, - "job_id": "", - "start_time": 9230225.89268124, - "end_time": 9230871.207996167, - "total_evaluation_time_secondes": "645.3153149280697", - "model_name": "AI-MO/deepseek-math-7b-rl-sft", - "model_sha": "feac1ee8323888504974e4c56f4dd29dec9f5eea", - "model_dtype": "torch.bfloat16", - "model_size": "12.93 GB", - "config": null - }, - "results": { - "custom|aimo_kaggle_hard_pot:v0|0": { - "qem": 0.02, - "qem_stderr": 0.02 - }, - "custom|aimo_kaggle_hard_pot:v1|0": { - "qem": 0.06, - "qem_stderr": 0.033926691677251195 - }, - "custom|aimo_kaggle_hard_pot:v2|0": { - "qem": 0.42, - "qem_stderr": 0.07050835816716035 - }, - "custom|aimo_kaggle_hard_pot:_average|0": { - "qem": 0.16666666666666666, - "qem_stderr": 0.041478349948137185 - }, - "all": { - "qem": 0.16666666666666666, - "qem_stderr": 0.041478349948137185 - } - }, - "versions": { - "custom|aimo_kaggle_hard_pot:v0|0": 0, - "custom|aimo_kaggle_hard_pot:v1|0": 0, - "custom|aimo_kaggle_hard_pot:v2|0": 0 - }, - "config_tasks": { - "custom|aimo_kaggle_hard_pot:v0": { - "name": "aimo_kaggle_hard_pot:v0", - "prompt_function": "kaggle_hard_pot_prompt_fn_v0", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_hard_pot:v1": { - "name": "aimo_kaggle_hard_pot:v1", - "prompt_function": "kaggle_hard_pot_prompt_fn_v1", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_hard_pot:v2": { - "name": "aimo_kaggle_hard_pot:v2", - "prompt_function": "kaggle_hard_pot_prompt_fn_v2", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - } - }, - "summary_tasks": { - "custom|aimo_kaggle_hard_pot:v0|0": { - "hashes": { - "hash_examples": "303213a38d9f7512", - "hash_full_prompts": "3c670fa0e80bc301", - "hash_input_tokens": "1bd4187ad6963415", - "hash_cont_tokens": "dc39a3565f517db9" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 37, - "non_padded": 13, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_hard_pot:v1|0": { - "hashes": { - "hash_examples": "e4234b97ad92862f", - "hash_full_prompts": "09e7759a96f64e59", - "hash_input_tokens": "dc7f68cf14be3d61", - "hash_cont_tokens": "318061967e30b047" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 32, - "non_padded": 18, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_hard_pot:v2|0": { - "hashes": { - "hash_examples": "6396eb8833e13ba0", - "hash_full_prompts": "65f8b95ea99c7087", - "hash_input_tokens": "bddb67459e7601c5", - "hash_cont_tokens": "a2d575236416e3ca" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 30, - "non_padded": 20, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "648c9a107d279e1e", - "hash_full_prompts": "7f4c38eb08b3bb41", - "hash_input_tokens": "16f0fd76e0a2a165", - "hash_cont_tokens": "9a548ef00ebb9273" - }, - "truncated": 150, - "non_truncated": 0, - "padded": 99, - "non_padded": 51, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v01.12/aimo_kaggle_medium_mmos/results_2024-05-24T09-48-12.415661.json b/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v01.12/aimo_kaggle_medium_mmos/results_2024-05-24T09-48-12.415661.json deleted file mode 100644 index 97163554e79f49b114fe069836185d445fc85d9f..0000000000000000000000000000000000000000 --- a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v01.12/aimo_kaggle_medium_mmos/results_2024-05-24T09-48-12.415661.json +++ /dev/null @@ -1,91 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": 4, - "max_samples": null, - "job_id": "", - "start_time": 2115893.229903704, - "end_time": 2116227.277975012, - "total_evaluation_time_secondes": "334.0480713080615", - "model_name": "AI-MO/deepseek-math-7b-rl-sft", - "model_sha": "feac1ee8323888504974e4c56f4dd29dec9f5eea", - "model_dtype": "torch.bfloat16", - "model_size": "12.93 GB", - "config": null - }, - "results": { - "custom|aimo_kaggle_medium_mmos:v0|0": { - "qem": 0.0, - "qem_stderr": 0.0 - }, - "all": { - "qem": 0.0, - "qem_stderr": 0.0 - } - }, - "versions": { - "custom|aimo_kaggle_medium_mmos:v0|0": 0 - }, - "config_tasks": { - "custom|aimo_kaggle_medium_mmos:v0": { - "name": "aimo_kaggle_medium_mmos:v0", - "prompt_function": "mmos_medium_prompt_fn", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - } - }, - "summary_tasks": { - "custom|aimo_kaggle_medium_mmos:v0|0": { - "hashes": { - "hash_examples": "3401efda8b0cbcb5", - "hash_full_prompts": "9553e2c8c7c87406", - "hash_input_tokens": "4f9c9b13fc9c2736", - "hash_cont_tokens": "22010b252d6c7aee" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 31, - "non_padded": 9, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "4c81d27cfdb9d737", - "hash_full_prompts": "466bf00fd3e3d325", - "hash_input_tokens": "f3be8baebf6e5c0d", - "hash_cont_tokens": "97d4e941c9cdda6b" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 31, - "non_padded": 9, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v01.12/aimo_kaggle_medium_pot/results_2024-05-27T14-30-36.611690.json b/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v01.12/aimo_kaggle_medium_pot/results_2024-05-27T14-30-36.611690.json deleted file mode 100644 index 80a6773b6304c41b44e4490fa0eb4411301462b3..0000000000000000000000000000000000000000 --- a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v01.12/aimo_kaggle_medium_pot/results_2024-05-27T14-30-36.611690.json +++ /dev/null @@ -1,193 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": 4, - "max_samples": null, - "job_id": "", - "start_time": 4057927.003502535, - "end_time": 4058278.547280695, - "total_evaluation_time_secondes": "351.54377815965563", - "model_name": "AI-MO/deepseek-math-7b-rl-sft", - "model_sha": "feac1ee8323888504974e4c56f4dd29dec9f5eea", - "model_dtype": "torch.bfloat16", - "model_size": "12.93 GB", - "config": null - }, - "results": { - "custom|aimo_kaggle_medium_pot:v0|0": { - "qem": 0.025, - "qem_stderr": 0.024999999999999998 - }, - "custom|aimo_kaggle_medium_pot:v1|0": { - "qem": 0.0, - "qem_stderr": 0.0 - }, - "custom|aimo_kaggle_medium_pot:v2|0": { - "qem": 0.15, - "qem_stderr": 0.05717718748968655 - }, - "custom|aimo_kaggle_medium_pot:_average|0": { - "qem": 0.05833333333333333, - "qem_stderr": 0.027392395829895517 - }, - "all": { - "qem": 0.05833333333333333, - "qem_stderr": 0.027392395829895517 - } - }, - "versions": { - "custom|aimo_kaggle_medium_pot:v0|0": 0, - "custom|aimo_kaggle_medium_pot:v1|0": 0, - "custom|aimo_kaggle_medium_pot:v2|0": 0 - }, - "config_tasks": { - "custom|aimo_kaggle_medium_pot:v0": { - "name": "aimo_kaggle_medium_pot:v0", - "prompt_function": "kaggle_medium_pot_prompt_fn_v0", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_medium_pot:v1": { - "name": "aimo_kaggle_medium_pot:v1", - "prompt_function": "kaggle_medium_pot_prompt_fn_v1", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_medium_pot:v2": { - "name": "aimo_kaggle_medium_pot:v2", - "prompt_function": "kaggle_medium_pot_prompt_fn_v2", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - } - }, - "summary_tasks": { - "custom|aimo_kaggle_medium_pot:v0|0": { - "hashes": { - "hash_examples": "2799c24461029dc3", - "hash_full_prompts": "de6572b914d5bba6", - "hash_input_tokens": "5af82ee284ccce29", - "hash_cont_tokens": "dea1ea4cd88ac25c" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 29, - "non_padded": 11, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_medium_pot:v1|0": { - "hashes": { - "hash_examples": "806b2e2056b41f84", - "hash_full_prompts": "427d5de6e4d90df2", - "hash_input_tokens": "be3dcf9f8a2350d0", - "hash_cont_tokens": "c30f828e6479e03f" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 26, - "non_padded": 14, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_medium_pot:v2|0": { - "hashes": { - "hash_examples": "d8534375acc5d427", - "hash_full_prompts": "6bb90c129d6d6123", - "hash_input_tokens": "22eeaedbeb3f56f0", - "hash_cont_tokens": "4631fc712773916c" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 31, - "non_padded": 9, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "623505a45a4910c2", - "hash_full_prompts": "3aca0fad61a42921", - "hash_input_tokens": "c7d2c8bb76f8ecb1", - "hash_cont_tokens": "f61db1e864829049" - }, - "truncated": 120, - "non_truncated": 0, - "padded": 86, - "non_padded": 34, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v01.13/aimo_kaggle_hard_mmos/results_2024-05-24T10-07-40.764237.json b/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v01.13/aimo_kaggle_hard_mmos/results_2024-05-24T10-07-40.764237.json deleted file mode 100644 index 3b438081e3118940850474673d0f539f3829143d..0000000000000000000000000000000000000000 --- a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v01.13/aimo_kaggle_hard_mmos/results_2024-05-24T10-07-40.764237.json +++ /dev/null @@ -1,91 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": 4, - "max_samples": null, - "job_id": "", - "start_time": 1060.262766582, - "end_time": 1408.920575274, - "total_evaluation_time_secondes": "348.6578086919999", - "model_name": "AI-MO/deepseek-math-7b-rl-sft", - "model_sha": "f2387bf0ced9308beb238e535b4d8334e7487ab2", - "model_dtype": "torch.bfloat16", - "model_size": "12.93 GB", - "config": null - }, - "results": { - "custom|aimo_kaggle_hard_mmos:v0|0": { - "qem": 0.0, - "qem_stderr": 0.0 - }, - "all": { - "qem": 0.0, - "qem_stderr": 0.0 - } - }, - "versions": { - "custom|aimo_kaggle_hard_mmos:v0|0": 0 - }, - "config_tasks": { - "custom|aimo_kaggle_hard_mmos:v0": { - "name": "aimo_kaggle_hard_mmos:v0", - "prompt_function": "mmos_hard_prompt_fn", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - } - }, - "summary_tasks": { - "custom|aimo_kaggle_hard_mmos:v0|0": { - "hashes": { - "hash_examples": "b40b6a493a95bf77", - "hash_full_prompts": "bf4ae1359925ab3b", - "hash_input_tokens": "5f11181c1839f9c3", - "hash_cont_tokens": "4f6b08399af03ffd" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 35, - "non_padded": 15, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "79dbebcff6acad9e", - "hash_full_prompts": "86a15c8410a494ef", - "hash_input_tokens": "28eff78f8cb1ac9f", - "hash_cont_tokens": "575c9bad78c37155" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 35, - "non_padded": 15, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v01.13/aimo_kaggle_medium_mmos/results_2024-05-24T10-07-11.135359.json b/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v01.13/aimo_kaggle_medium_mmos/results_2024-05-24T10-07-11.135359.json deleted file mode 100644 index 2d2bd27186cf5ba5d69d1516d72f7e53095f87db..0000000000000000000000000000000000000000 --- a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v01.13/aimo_kaggle_medium_mmos/results_2024-05-24T10-07-11.135359.json +++ /dev/null @@ -1,91 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": 4, - "max_samples": null, - "job_id": "", - "start_time": 1251998.579556463, - "end_time": 1252318.380801528, - "total_evaluation_time_secondes": "319.8012450649403", - "model_name": "AI-MO/deepseek-math-7b-rl-sft", - "model_sha": "f2387bf0ced9308beb238e535b4d8334e7487ab2", - "model_dtype": "torch.bfloat16", - "model_size": "12.93 GB", - "config": null - }, - "results": { - "custom|aimo_kaggle_medium_mmos:v0|0": { - "qem": 0.0, - "qem_stderr": 0.0 - }, - "all": { - "qem": 0.0, - "qem_stderr": 0.0 - } - }, - "versions": { - "custom|aimo_kaggle_medium_mmos:v0|0": 0 - }, - "config_tasks": { - "custom|aimo_kaggle_medium_mmos:v0": { - "name": "aimo_kaggle_medium_mmos:v0", - "prompt_function": "mmos_medium_prompt_fn", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - } - }, - "summary_tasks": { - "custom|aimo_kaggle_medium_mmos:v0|0": { - "hashes": { - "hash_examples": "3401efda8b0cbcb5", - "hash_full_prompts": "9553e2c8c7c87406", - "hash_input_tokens": "4f9c9b13fc9c2736", - "hash_cont_tokens": "cb917267759cd0c9" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 31, - "non_padded": 9, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "4c81d27cfdb9d737", - "hash_full_prompts": "466bf00fd3e3d325", - "hash_input_tokens": "f3be8baebf6e5c0d", - "hash_cont_tokens": "b39be9d73f59030c" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 31, - "non_padded": 9, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v01.14/aimo_kaggle_hard_mmos/results_2024-05-24T10-16-49.966984.json b/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v01.14/aimo_kaggle_hard_mmos/results_2024-05-24T10-16-49.966984.json deleted file mode 100644 index fc3d2924146bb4b4583dba70ec23d45599c51f90..0000000000000000000000000000000000000000 --- a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v01.14/aimo_kaggle_hard_mmos/results_2024-05-24T10-16-49.966984.json +++ /dev/null @@ -1,91 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": 4, - "max_samples": null, - "job_id": "", - "start_time": 1579.416964497, - "end_time": 1958.123144323, - "total_evaluation_time_secondes": "378.7061798259999", - "model_name": "AI-MO/deepseek-math-7b-rl-sft", - "model_sha": "868636f68df95b6db35b4113753003b88836a86a", - "model_dtype": "torch.bfloat16", - "model_size": "12.93 GB", - "config": null - }, - "results": { - "custom|aimo_kaggle_hard_mmos:v0|0": { - "qem": 0.02, - "qem_stderr": 0.02 - }, - "all": { - "qem": 0.02, - "qem_stderr": 0.02 - } - }, - "versions": { - "custom|aimo_kaggle_hard_mmos:v0|0": 0 - }, - "config_tasks": { - "custom|aimo_kaggle_hard_mmos:v0": { - "name": "aimo_kaggle_hard_mmos:v0", - "prompt_function": "mmos_hard_prompt_fn", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - } - }, - "summary_tasks": { - "custom|aimo_kaggle_hard_mmos:v0|0": { - "hashes": { - "hash_examples": "b40b6a493a95bf77", - "hash_full_prompts": "bf4ae1359925ab3b", - "hash_input_tokens": "5f11181c1839f9c3", - "hash_cont_tokens": "0592c8e0d76ec685" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 35, - "non_padded": 15, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "79dbebcff6acad9e", - "hash_full_prompts": "86a15c8410a494ef", - "hash_input_tokens": "28eff78f8cb1ac9f", - "hash_cont_tokens": "6cc5093a5dfe5f31" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 35, - "non_padded": 15, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v01.14/aimo_kaggle_medium_mmos/results_2024-05-24T10-16-38.488727.json b/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v01.14/aimo_kaggle_medium_mmos/results_2024-05-24T10-16-38.488727.json deleted file mode 100644 index 2f423f87962edb468d243af6ef97fa05cd7a82c8..0000000000000000000000000000000000000000 --- a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v01.14/aimo_kaggle_medium_mmos/results_2024-05-24T10-16-38.488727.json +++ /dev/null @@ -1,91 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": 4, - "max_samples": null, - "job_id": "", - "start_time": 1252518.683488359, - "end_time": 1252885.734003984, - "total_evaluation_time_secondes": "367.05051562492736", - "model_name": "AI-MO/deepseek-math-7b-rl-sft", - "model_sha": "868636f68df95b6db35b4113753003b88836a86a", - "model_dtype": "torch.bfloat16", - "model_size": "12.93 GB", - "config": null - }, - "results": { - "custom|aimo_kaggle_medium_mmos:v0|0": { - "qem": 0.075, - "qem_stderr": 0.042176369614348695 - }, - "all": { - "qem": 0.075, - "qem_stderr": 0.042176369614348695 - } - }, - "versions": { - "custom|aimo_kaggle_medium_mmos:v0|0": 0 - }, - "config_tasks": { - "custom|aimo_kaggle_medium_mmos:v0": { - "name": "aimo_kaggle_medium_mmos:v0", - "prompt_function": "mmos_medium_prompt_fn", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - } - }, - "summary_tasks": { - "custom|aimo_kaggle_medium_mmos:v0|0": { - "hashes": { - "hash_examples": "3401efda8b0cbcb5", - "hash_full_prompts": "9553e2c8c7c87406", - "hash_input_tokens": "4f9c9b13fc9c2736", - "hash_cont_tokens": "5bd7b16ad1cce5f7" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 31, - "non_padded": 9, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "4c81d27cfdb9d737", - "hash_full_prompts": "466bf00fd3e3d325", - "hash_input_tokens": "f3be8baebf6e5c0d", - "hash_cont_tokens": "32fd13771d23980f" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 31, - "non_padded": 9, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v01.15/aimo_kaggle_hard_mmos/results_2024-05-24T10-25-09.540279.json b/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v01.15/aimo_kaggle_hard_mmos/results_2024-05-24T10-25-09.540279.json deleted file mode 100644 index b59ce54db6f9b0eeef8c11c50ada0a3179150bf9..0000000000000000000000000000000000000000 --- a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v01.15/aimo_kaggle_hard_mmos/results_2024-05-24T10-25-09.540279.json +++ /dev/null @@ -1,91 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": 4, - "max_samples": null, - "job_id": "", - "start_time": 466.001152748, - "end_time": 788.580502107, - "total_evaluation_time_secondes": "322.57934935900005", - "model_name": "AI-MO/deepseek-math-7b-rl-sft", - "model_sha": "c599676fdc00a821d271b74201c28a69b8dae2e7", - "model_dtype": "torch.bfloat16", - "model_size": "12.93 GB", - "config": null - }, - "results": { - "custom|aimo_kaggle_hard_mmos:v0|0": { - "qem": 0.0, - "qem_stderr": 0.0 - }, - "all": { - "qem": 0.0, - "qem_stderr": 0.0 - } - }, - "versions": { - "custom|aimo_kaggle_hard_mmos:v0|0": 0 - }, - "config_tasks": { - "custom|aimo_kaggle_hard_mmos:v0": { - "name": "aimo_kaggle_hard_mmos:v0", - "prompt_function": "mmos_hard_prompt_fn", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - } - }, - "summary_tasks": { - "custom|aimo_kaggle_hard_mmos:v0|0": { - "hashes": { - "hash_examples": "b40b6a493a95bf77", - "hash_full_prompts": "bf4ae1359925ab3b", - "hash_input_tokens": "5f11181c1839f9c3", - "hash_cont_tokens": "d220e4e743de8a9a" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 35, - "non_padded": 15, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "79dbebcff6acad9e", - "hash_full_prompts": "86a15c8410a494ef", - "hash_input_tokens": "28eff78f8cb1ac9f", - "hash_cont_tokens": "426a138c8b85ec49" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 35, - "non_padded": 15, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v01.15/aimo_kaggle_medium_mmos/results_2024-05-24T10-25-14.707607.json b/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v01.15/aimo_kaggle_medium_mmos/results_2024-05-24T10-25-14.707607.json deleted file mode 100644 index cb57fb73ef8a28fc0b8ef493ad37dacdba860ed6..0000000000000000000000000000000000000000 --- a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v01.15/aimo_kaggle_medium_mmos/results_2024-05-24T10-25-14.707607.json +++ /dev/null @@ -1,91 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": 4, - "max_samples": null, - "job_id": "", - "start_time": 2423204.564062136, - "end_time": 2423531.935635801, - "total_evaluation_time_secondes": "327.37157366471365", - "model_name": "AI-MO/deepseek-math-7b-rl-sft", - "model_sha": "c599676fdc00a821d271b74201c28a69b8dae2e7", - "model_dtype": "torch.bfloat16", - "model_size": "12.93 GB", - "config": null - }, - "results": { - "custom|aimo_kaggle_medium_mmos:v0|0": { - "qem": 0.0, - "qem_stderr": 0.0 - }, - "all": { - "qem": 0.0, - "qem_stderr": 0.0 - } - }, - "versions": { - "custom|aimo_kaggle_medium_mmos:v0|0": 0 - }, - "config_tasks": { - "custom|aimo_kaggle_medium_mmos:v0": { - "name": "aimo_kaggle_medium_mmos:v0", - "prompt_function": "mmos_medium_prompt_fn", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - } - }, - "summary_tasks": { - "custom|aimo_kaggle_medium_mmos:v0|0": { - "hashes": { - "hash_examples": "3401efda8b0cbcb5", - "hash_full_prompts": "9553e2c8c7c87406", - "hash_input_tokens": "4f9c9b13fc9c2736", - "hash_cont_tokens": "b3bfc1c6165ee81c" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 31, - "non_padded": 9, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "4c81d27cfdb9d737", - "hash_full_prompts": "466bf00fd3e3d325", - "hash_input_tokens": "f3be8baebf6e5c0d", - "hash_cont_tokens": "6e3f84e0c25bbd6c" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 31, - "non_padded": 9, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.01/aimo_kaggle_hard_pot/results_2024-05-27T22-52-32.480755.json b/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.01/aimo_kaggle_hard_pot/results_2024-05-27T22-52-32.480755.json deleted file mode 100644 index 2379401910d22a1d8f2a85cd3ba89ca83f5ecc5e..0000000000000000000000000000000000000000 --- a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.01/aimo_kaggle_hard_pot/results_2024-05-27T22-52-32.480755.json +++ /dev/null @@ -1,193 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": 4, - "max_samples": null, - "job_id": "", - "start_time": 10719.748736187, - "end_time": 11433.49436362, - "total_evaluation_time_secondes": "713.7456274329998", - "model_name": "AI-MO/deepseek-math-7b-rl-sft", - "model_sha": "ab83077ac83ffbbb8abe282b3dd8f4228cbbdcba", - "model_dtype": "torch.bfloat16", - "model_size": "12.93 GB", - "config": null - }, - "results": { - "custom|aimo_kaggle_hard_pot:v0|0": { - "qem": 0.06, - "qem_stderr": 0.0339266916772512 - }, - "custom|aimo_kaggle_hard_pot:v1|0": { - "qem": 0.38, - "qem_stderr": 0.06934092056863767 - }, - "custom|aimo_kaggle_hard_pot:v2|0": { - "qem": 0.5, - "qem_stderr": 0.07142857142857142 - }, - "custom|aimo_kaggle_hard_pot:_average|0": { - "qem": 0.3133333333333333, - "qem_stderr": 0.0582320612248201 - }, - "all": { - "qem": 0.3133333333333333, - "qem_stderr": 0.0582320612248201 - } - }, - "versions": { - "custom|aimo_kaggle_hard_pot:v0|0": 0, - "custom|aimo_kaggle_hard_pot:v1|0": 0, - "custom|aimo_kaggle_hard_pot:v2|0": 0 - }, - "config_tasks": { - "custom|aimo_kaggle_hard_pot:v0": { - "name": "aimo_kaggle_hard_pot:v0", - "prompt_function": "kaggle_hard_pot_prompt_fn_v0", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_hard_pot:v1": { - "name": "aimo_kaggle_hard_pot:v1", - "prompt_function": "kaggle_hard_pot_prompt_fn_v1", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_hard_pot:v2": { - "name": "aimo_kaggle_hard_pot:v2", - "prompt_function": "kaggle_hard_pot_prompt_fn_v2", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - } - }, - "summary_tasks": { - "custom|aimo_kaggle_hard_pot:v0|0": { - "hashes": { - "hash_examples": "303213a38d9f7512", - "hash_full_prompts": "3c670fa0e80bc301", - "hash_input_tokens": "1bd4187ad6963415", - "hash_cont_tokens": "c2831c3f8adfd2a5" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 37, - "non_padded": 13, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_hard_pot:v1|0": { - "hashes": { - "hash_examples": "e4234b97ad92862f", - "hash_full_prompts": "09e7759a96f64e59", - "hash_input_tokens": "dc7f68cf14be3d61", - "hash_cont_tokens": "e37a489b906d968e" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 32, - "non_padded": 18, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_hard_pot:v2|0": { - "hashes": { - "hash_examples": "6396eb8833e13ba0", - "hash_full_prompts": "65f8b95ea99c7087", - "hash_input_tokens": "bddb67459e7601c5", - "hash_cont_tokens": "43a13ab9e5e2cd6c" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 30, - "non_padded": 20, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "648c9a107d279e1e", - "hash_full_prompts": "7f4c38eb08b3bb41", - "hash_input_tokens": "16f0fd76e0a2a165", - "hash_cont_tokens": "9ca954caaded26b5" - }, - "truncated": 150, - "non_truncated": 0, - "padded": 99, - "non_padded": 51, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.01/aimo_kaggle_medium_pot/results_2024-05-27T22-49-32.235858.json b/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.01/aimo_kaggle_medium_pot/results_2024-05-27T22-49-32.235858.json deleted file mode 100644 index 6d1f6e9368084b482a32bb8143955a8d6863b162..0000000000000000000000000000000000000000 --- a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.01/aimo_kaggle_medium_pot/results_2024-05-27T22-49-32.235858.json +++ /dev/null @@ -1,193 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": 4, - "max_samples": null, - "job_id": "", - "start_time": 41425.043779027, - "end_time": 41783.252532695, - "total_evaluation_time_secondes": "358.20875366799737", - "model_name": "AI-MO/deepseek-math-7b-rl-sft", - "model_sha": "ab83077ac83ffbbb8abe282b3dd8f4228cbbdcba", - "model_dtype": "torch.bfloat16", - "model_size": "12.93 GB", - "config": null - }, - "results": { - "custom|aimo_kaggle_medium_pot:v0|0": { - "qem": 0.15, - "qem_stderr": 0.05717718748968655 - }, - "custom|aimo_kaggle_medium_pot:v1|0": { - "qem": 0.225, - "qem_stderr": 0.06686668711812965 - }, - "custom|aimo_kaggle_medium_pot:v2|0": { - "qem": 0.2, - "qem_stderr": 0.06405126152203487 - }, - "custom|aimo_kaggle_medium_pot:_average|0": { - "qem": 0.19166666666666665, - "qem_stderr": 0.06269837870995036 - }, - "all": { - "qem": 0.19166666666666665, - "qem_stderr": 0.06269837870995036 - } - }, - "versions": { - "custom|aimo_kaggle_medium_pot:v0|0": 0, - "custom|aimo_kaggle_medium_pot:v1|0": 0, - "custom|aimo_kaggle_medium_pot:v2|0": 0 - }, - "config_tasks": { - "custom|aimo_kaggle_medium_pot:v0": { - "name": "aimo_kaggle_medium_pot:v0", - "prompt_function": "kaggle_medium_pot_prompt_fn_v0", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_medium_pot:v1": { - "name": "aimo_kaggle_medium_pot:v1", - "prompt_function": "kaggle_medium_pot_prompt_fn_v1", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_medium_pot:v2": { - "name": "aimo_kaggle_medium_pot:v2", - "prompt_function": "kaggle_medium_pot_prompt_fn_v2", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - } - }, - "summary_tasks": { - "custom|aimo_kaggle_medium_pot:v0|0": { - "hashes": { - "hash_examples": "2799c24461029dc3", - "hash_full_prompts": "de6572b914d5bba6", - "hash_input_tokens": "5af82ee284ccce29", - "hash_cont_tokens": "129f73f798992c2d" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 29, - "non_padded": 11, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_medium_pot:v1|0": { - "hashes": { - "hash_examples": "806b2e2056b41f84", - "hash_full_prompts": "427d5de6e4d90df2", - "hash_input_tokens": "be3dcf9f8a2350d0", - "hash_cont_tokens": "c213703222b7b9d6" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 26, - "non_padded": 14, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_medium_pot:v2|0": { - "hashes": { - "hash_examples": "d8534375acc5d427", - "hash_full_prompts": "6bb90c129d6d6123", - "hash_input_tokens": "22eeaedbeb3f56f0", - "hash_cont_tokens": "710c61a10724eb58" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 31, - "non_padded": 9, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "623505a45a4910c2", - "hash_full_prompts": "3aca0fad61a42921", - "hash_input_tokens": "c7d2c8bb76f8ecb1", - "hash_cont_tokens": "46841f1d968bb396" - }, - "truncated": 120, - "non_truncated": 0, - "padded": 86, - "non_padded": 34, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.02/aimo_kaggle_hard_pot/results_2024-05-27T23-01-26.617019.json b/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.02/aimo_kaggle_hard_pot/results_2024-05-27T23-01-26.617019.json deleted file mode 100644 index 1c302dda1646f244cbf8fbbd815c66ab543d6ce4..0000000000000000000000000000000000000000 --- a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.02/aimo_kaggle_hard_pot/results_2024-05-27T23-01-26.617019.json +++ /dev/null @@ -1,193 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": 4, - "max_samples": null, - "job_id": "", - "start_time": 9260481.786180781, - "end_time": 9261204.31955451, - "total_evaluation_time_secondes": "722.5333737283945", - "model_name": "AI-MO/deepseek-math-7b-rl-sft", - "model_sha": "c579155dded92356530d24203be65dd06b745a34", - "model_dtype": "torch.bfloat16", - "model_size": "12.93 GB", - "config": null - }, - "results": { - "custom|aimo_kaggle_hard_pot:v0|0": { - "qem": 0.06, - "qem_stderr": 0.0339266916772512 - }, - "custom|aimo_kaggle_hard_pot:v1|0": { - "qem": 0.36, - "qem_stderr": 0.06857142857142856 - }, - "custom|aimo_kaggle_hard_pot:v2|0": { - "qem": 0.5, - "qem_stderr": 0.07142857142857142 - }, - "custom|aimo_kaggle_hard_pot:_average|0": { - "qem": 0.30666666666666664, - "qem_stderr": 0.057975563892417065 - }, - "all": { - "qem": 0.30666666666666664, - "qem_stderr": 0.057975563892417065 - } - }, - "versions": { - "custom|aimo_kaggle_hard_pot:v0|0": 0, - "custom|aimo_kaggle_hard_pot:v1|0": 0, - "custom|aimo_kaggle_hard_pot:v2|0": 0 - }, - "config_tasks": { - "custom|aimo_kaggle_hard_pot:v0": { - "name": "aimo_kaggle_hard_pot:v0", - "prompt_function": "kaggle_hard_pot_prompt_fn_v0", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_hard_pot:v1": { - "name": "aimo_kaggle_hard_pot:v1", - "prompt_function": "kaggle_hard_pot_prompt_fn_v1", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_hard_pot:v2": { - "name": "aimo_kaggle_hard_pot:v2", - "prompt_function": "kaggle_hard_pot_prompt_fn_v2", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - } - }, - "summary_tasks": { - "custom|aimo_kaggle_hard_pot:v0|0": { - "hashes": { - "hash_examples": "303213a38d9f7512", - "hash_full_prompts": "3c670fa0e80bc301", - "hash_input_tokens": "1bd4187ad6963415", - "hash_cont_tokens": "9aabf9d4ff47b254" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 37, - "non_padded": 13, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_hard_pot:v1|0": { - "hashes": { - "hash_examples": "e4234b97ad92862f", - "hash_full_prompts": "09e7759a96f64e59", - "hash_input_tokens": "dc7f68cf14be3d61", - "hash_cont_tokens": "9bf8175c55528051" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 32, - "non_padded": 18, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_hard_pot:v2|0": { - "hashes": { - "hash_examples": "6396eb8833e13ba0", - "hash_full_prompts": "65f8b95ea99c7087", - "hash_input_tokens": "bddb67459e7601c5", - "hash_cont_tokens": "3f123f6b38b6ffd5" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 30, - "non_padded": 20, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "648c9a107d279e1e", - "hash_full_prompts": "7f4c38eb08b3bb41", - "hash_input_tokens": "16f0fd76e0a2a165", - "hash_cont_tokens": "afa8c82042b153d3" - }, - "truncated": 150, - "non_truncated": 0, - "padded": 99, - "non_padded": 51, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.02/aimo_kaggle_medium_pot/results_2024-05-27T22-56-40.113739.json b/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.02/aimo_kaggle_medium_pot/results_2024-05-27T22-56-40.113739.json deleted file mode 100644 index 272c9923e8517d698e33502e0d2809d7ec731ab7..0000000000000000000000000000000000000000 --- a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.02/aimo_kaggle_medium_pot/results_2024-05-27T22-56-40.113739.json +++ /dev/null @@ -1,193 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": 4, - "max_samples": null, - "job_id": "", - "start_time": 2107342.128751614, - "end_time": 2107777.207887237, - "total_evaluation_time_secondes": "435.0791356228292", - "model_name": "AI-MO/deepseek-math-7b-rl-sft", - "model_sha": "c579155dded92356530d24203be65dd06b745a34", - "model_dtype": "torch.bfloat16", - "model_size": "12.93 GB", - "config": null - }, - "results": { - "custom|aimo_kaggle_medium_pot:v0|0": { - "qem": 0.15, - "qem_stderr": 0.05717718748968655 - }, - "custom|aimo_kaggle_medium_pot:v1|0": { - "qem": 0.15, - "qem_stderr": 0.05717718748968655 - }, - "custom|aimo_kaggle_medium_pot:v2|0": { - "qem": 0.15, - "qem_stderr": 0.05717718748968655 - }, - "custom|aimo_kaggle_medium_pot:_average|0": { - "qem": 0.15, - "qem_stderr": 0.05717718748968655 - }, - "all": { - "qem": 0.15, - "qem_stderr": 0.05717718748968655 - } - }, - "versions": { - "custom|aimo_kaggle_medium_pot:v0|0": 0, - "custom|aimo_kaggle_medium_pot:v1|0": 0, - "custom|aimo_kaggle_medium_pot:v2|0": 0 - }, - "config_tasks": { - "custom|aimo_kaggle_medium_pot:v0": { - "name": "aimo_kaggle_medium_pot:v0", - "prompt_function": "kaggle_medium_pot_prompt_fn_v0", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_medium_pot:v1": { - "name": "aimo_kaggle_medium_pot:v1", - "prompt_function": "kaggle_medium_pot_prompt_fn_v1", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_medium_pot:v2": { - "name": "aimo_kaggle_medium_pot:v2", - "prompt_function": "kaggle_medium_pot_prompt_fn_v2", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - } - }, - "summary_tasks": { - "custom|aimo_kaggle_medium_pot:v0|0": { - "hashes": { - "hash_examples": "2799c24461029dc3", - "hash_full_prompts": "de6572b914d5bba6", - "hash_input_tokens": "5af82ee284ccce29", - "hash_cont_tokens": "9ed765ebd58d3013" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 29, - "non_padded": 11, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_medium_pot:v1|0": { - "hashes": { - "hash_examples": "806b2e2056b41f84", - "hash_full_prompts": "427d5de6e4d90df2", - "hash_input_tokens": "be3dcf9f8a2350d0", - "hash_cont_tokens": "2f085c109bc10f7d" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 26, - "non_padded": 14, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_medium_pot:v2|0": { - "hashes": { - "hash_examples": "d8534375acc5d427", - "hash_full_prompts": "6bb90c129d6d6123", - "hash_input_tokens": "22eeaedbeb3f56f0", - "hash_cont_tokens": "1aedc4a3ff7a75a1" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 31, - "non_padded": 9, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "623505a45a4910c2", - "hash_full_prompts": "3aca0fad61a42921", - "hash_input_tokens": "c7d2c8bb76f8ecb1", - "hash_cont_tokens": "70d3921740f7ab02" - }, - "truncated": 120, - "non_truncated": 0, - "padded": 86, - "non_padded": 34, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.03/aimo_kaggle_hard_pot/results_2024-05-27T22-50-33.482970.json b/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.03/aimo_kaggle_hard_pot/results_2024-05-27T22-50-33.482970.json deleted file mode 100644 index ff5838f9494cf993a11c24bd7e8db5db16036ee7..0000000000000000000000000000000000000000 --- a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.03/aimo_kaggle_hard_pot/results_2024-05-27T22-50-33.482970.json +++ /dev/null @@ -1,193 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": 4, - "max_samples": null, - "job_id": "", - "start_time": 14259.254878757, - "end_time": 14914.427603002, - "total_evaluation_time_secondes": "655.1727242449997", - "model_name": "AI-MO/deepseek-math-7b-rl-sft", - "model_sha": "8ccdf6776522c8613835892a71912fe2d8366885", - "model_dtype": "torch.bfloat16", - "model_size": "12.93 GB", - "config": null - }, - "results": { - "custom|aimo_kaggle_hard_pot:v0|0": { - "qem": 0.04, - "qem_stderr": 0.02799416848895062 - }, - "custom|aimo_kaggle_hard_pot:v1|0": { - "qem": 0.12, - "qem_stderr": 0.04642307659791979 - }, - "custom|aimo_kaggle_hard_pot:v2|0": { - "qem": 0.54, - "qem_stderr": 0.07119963311072637 - }, - "custom|aimo_kaggle_hard_pot:_average|0": { - "qem": 0.23333333333333336, - "qem_stderr": 0.048538959399198923 - }, - "all": { - "qem": 0.23333333333333336, - "qem_stderr": 0.048538959399198923 - } - }, - "versions": { - "custom|aimo_kaggle_hard_pot:v0|0": 0, - "custom|aimo_kaggle_hard_pot:v1|0": 0, - "custom|aimo_kaggle_hard_pot:v2|0": 0 - }, - "config_tasks": { - "custom|aimo_kaggle_hard_pot:v0": { - "name": "aimo_kaggle_hard_pot:v0", - "prompt_function": "kaggle_hard_pot_prompt_fn_v0", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_hard_pot:v1": { - "name": "aimo_kaggle_hard_pot:v1", - "prompt_function": "kaggle_hard_pot_prompt_fn_v1", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_hard_pot:v2": { - "name": "aimo_kaggle_hard_pot:v2", - "prompt_function": "kaggle_hard_pot_prompt_fn_v2", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - } - }, - "summary_tasks": { - "custom|aimo_kaggle_hard_pot:v0|0": { - "hashes": { - "hash_examples": "303213a38d9f7512", - "hash_full_prompts": "3c670fa0e80bc301", - "hash_input_tokens": "1bd4187ad6963415", - "hash_cont_tokens": "8fa9e18efe49021a" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 37, - "non_padded": 13, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_hard_pot:v1|0": { - "hashes": { - "hash_examples": "e4234b97ad92862f", - "hash_full_prompts": "09e7759a96f64e59", - "hash_input_tokens": "dc7f68cf14be3d61", - "hash_cont_tokens": "c0e55e5137310b5c" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 32, - "non_padded": 18, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_hard_pot:v2|0": { - "hashes": { - "hash_examples": "6396eb8833e13ba0", - "hash_full_prompts": "65f8b95ea99c7087", - "hash_input_tokens": "bddb67459e7601c5", - "hash_cont_tokens": "a1da9b9deee659aa" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 30, - "non_padded": 20, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "648c9a107d279e1e", - "hash_full_prompts": "7f4c38eb08b3bb41", - "hash_input_tokens": "16f0fd76e0a2a165", - "hash_cont_tokens": "75cb29ad60e745ed" - }, - "truncated": 150, - "non_truncated": 0, - "padded": 99, - "non_padded": 51, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.03/aimo_kaggle_medium_pot/results_2024-05-27T22-46-20.885308.json b/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.03/aimo_kaggle_medium_pot/results_2024-05-27T22-46-20.885308.json deleted file mode 100644 index 919292d5fb5137880def26f6de7c60501f34a7a0..0000000000000000000000000000000000000000 --- a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.03/aimo_kaggle_medium_pot/results_2024-05-27T22-46-20.885308.json +++ /dev/null @@ -1,193 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": 4, - "max_samples": null, - "job_id": "", - "start_time": 106996.846626916, - "end_time": 107359.816964207, - "total_evaluation_time_secondes": "362.97033729100076", - "model_name": "AI-MO/deepseek-math-7b-rl-sft", - "model_sha": "8ccdf6776522c8613835892a71912fe2d8366885", - "model_dtype": "torch.bfloat16", - "model_size": "12.93 GB", - "config": null - }, - "results": { - "custom|aimo_kaggle_medium_pot:v0|0": { - "qem": 0.025, - "qem_stderr": 0.024999999999999994 - }, - "custom|aimo_kaggle_medium_pot:v1|0": { - "qem": 0.05, - "qem_stderr": 0.03489912202260563 - }, - "custom|aimo_kaggle_medium_pot:v2|0": { - "qem": 0.125, - "qem_stderr": 0.05295740910852021 - }, - "custom|aimo_kaggle_medium_pot:_average|0": { - "qem": 0.06666666666666667, - "qem_stderr": 0.03761884371037528 - }, - "all": { - "qem": 0.06666666666666667, - "qem_stderr": 0.03761884371037528 - } - }, - "versions": { - "custom|aimo_kaggle_medium_pot:v0|0": 0, - "custom|aimo_kaggle_medium_pot:v1|0": 0, - "custom|aimo_kaggle_medium_pot:v2|0": 0 - }, - "config_tasks": { - "custom|aimo_kaggle_medium_pot:v0": { - "name": "aimo_kaggle_medium_pot:v0", - "prompt_function": "kaggle_medium_pot_prompt_fn_v0", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_medium_pot:v1": { - "name": "aimo_kaggle_medium_pot:v1", - "prompt_function": "kaggle_medium_pot_prompt_fn_v1", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_medium_pot:v2": { - "name": "aimo_kaggle_medium_pot:v2", - "prompt_function": "kaggle_medium_pot_prompt_fn_v2", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - } - }, - "summary_tasks": { - "custom|aimo_kaggle_medium_pot:v0|0": { - "hashes": { - "hash_examples": "2799c24461029dc3", - "hash_full_prompts": "de6572b914d5bba6", - "hash_input_tokens": "5af82ee284ccce29", - "hash_cont_tokens": "398c838fac2a32cb" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 29, - "non_padded": 11, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_medium_pot:v1|0": { - "hashes": { - "hash_examples": "806b2e2056b41f84", - "hash_full_prompts": "427d5de6e4d90df2", - "hash_input_tokens": "be3dcf9f8a2350d0", - "hash_cont_tokens": "ad46c87b88dbba2f" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 26, - "non_padded": 14, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_medium_pot:v2|0": { - "hashes": { - "hash_examples": "d8534375acc5d427", - "hash_full_prompts": "6bb90c129d6d6123", - "hash_input_tokens": "22eeaedbeb3f56f0", - "hash_cont_tokens": "246adbd385dd4579" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 31, - "non_padded": 9, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "623505a45a4910c2", - "hash_full_prompts": "3aca0fad61a42921", - "hash_input_tokens": "c7d2c8bb76f8ecb1", - "hash_cont_tokens": "2f97b0cecd2c9ec6" - }, - "truncated": 120, - "non_truncated": 0, - "padded": 86, - "non_padded": 34, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.04/aimo_kaggle_hard_pot/results_2024-05-27T23-02-16.395548.json b/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.04/aimo_kaggle_hard_pot/results_2024-05-27T23-02-16.395548.json deleted file mode 100644 index dc8c1147d291f7b161e8dcb39d7d6b8643e287bc..0000000000000000000000000000000000000000 --- a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.04/aimo_kaggle_hard_pot/results_2024-05-27T23-02-16.395548.json +++ /dev/null @@ -1,193 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": 4, - "max_samples": null, - "job_id": "", - "start_time": 304641.79705412, - "end_time": 305415.434942523, - "total_evaluation_time_secondes": "773.6378884030273", - "model_name": "AI-MO/deepseek-math-7b-rl-sft", - "model_sha": "9d352e2a08647c995abd486f03c103c328f94b5e", - "model_dtype": "torch.bfloat16", - "model_size": "12.93 GB", - "config": null - }, - "results": { - "custom|aimo_kaggle_hard_pot:v0|0": { - "qem": 0.08, - "qem_stderr": 0.03875617133214439 - }, - "custom|aimo_kaggle_hard_pot:v1|0": { - "qem": 0.44, - "qem_stderr": 0.07091242083423345 - }, - "custom|aimo_kaggle_hard_pot:v2|0": { - "qem": 0.38, - "qem_stderr": 0.06934092056863767 - }, - "custom|aimo_kaggle_hard_pot:_average|0": { - "qem": 0.3, - "qem_stderr": 0.0596698375783385 - }, - "all": { - "qem": 0.3, - "qem_stderr": 0.0596698375783385 - } - }, - "versions": { - "custom|aimo_kaggle_hard_pot:v0|0": 0, - "custom|aimo_kaggle_hard_pot:v1|0": 0, - "custom|aimo_kaggle_hard_pot:v2|0": 0 - }, - "config_tasks": { - "custom|aimo_kaggle_hard_pot:v0": { - "name": "aimo_kaggle_hard_pot:v0", - "prompt_function": "kaggle_hard_pot_prompt_fn_v0", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_hard_pot:v1": { - "name": "aimo_kaggle_hard_pot:v1", - "prompt_function": "kaggle_hard_pot_prompt_fn_v1", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_hard_pot:v2": { - "name": "aimo_kaggle_hard_pot:v2", - "prompt_function": "kaggle_hard_pot_prompt_fn_v2", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - } - }, - "summary_tasks": { - "custom|aimo_kaggle_hard_pot:v0|0": { - "hashes": { - "hash_examples": "303213a38d9f7512", - "hash_full_prompts": "3c670fa0e80bc301", - "hash_input_tokens": "1bd4187ad6963415", - "hash_cont_tokens": "566a96752b9c4a94" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 37, - "non_padded": 13, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_hard_pot:v1|0": { - "hashes": { - "hash_examples": "e4234b97ad92862f", - "hash_full_prompts": "09e7759a96f64e59", - "hash_input_tokens": "dc7f68cf14be3d61", - "hash_cont_tokens": "7ab56adc56fc6b33" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 32, - "non_padded": 18, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_hard_pot:v2|0": { - "hashes": { - "hash_examples": "6396eb8833e13ba0", - "hash_full_prompts": "65f8b95ea99c7087", - "hash_input_tokens": "bddb67459e7601c5", - "hash_cont_tokens": "5c601ac7277feacc" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 30, - "non_padded": 20, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "648c9a107d279e1e", - "hash_full_prompts": "7f4c38eb08b3bb41", - "hash_input_tokens": "16f0fd76e0a2a165", - "hash_cont_tokens": "3f013a410775284d" - }, - "truncated": 150, - "non_truncated": 0, - "padded": 99, - "non_padded": 51, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.04/aimo_kaggle_medium_pot/results_2024-05-27T22-56-39.512900.json b/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.04/aimo_kaggle_medium_pot/results_2024-05-27T22-56-39.512900.json deleted file mode 100644 index c01d192f6600746ca40101ae88231ef210bab7df..0000000000000000000000000000000000000000 --- a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.04/aimo_kaggle_medium_pot/results_2024-05-27T22-56-39.512900.json +++ /dev/null @@ -1,193 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": 4, - "max_samples": null, - "job_id": "", - "start_time": 18377.382677895, - "end_time": 18811.76337702, - "total_evaluation_time_secondes": "434.38069912500214", - "model_name": "AI-MO/deepseek-math-7b-rl-sft", - "model_sha": "9d352e2a08647c995abd486f03c103c328f94b5e", - "model_dtype": "torch.bfloat16", - "model_size": "12.93 GB", - "config": null - }, - "results": { - "custom|aimo_kaggle_medium_pot:v0|0": { - "qem": 0.15, - "qem_stderr": 0.05717718748968655 - }, - "custom|aimo_kaggle_medium_pot:v1|0": { - "qem": 0.15, - "qem_stderr": 0.05717718748968655 - }, - "custom|aimo_kaggle_medium_pot:v2|0": { - "qem": 0.175, - "qem_stderr": 0.060843430844447585 - }, - "custom|aimo_kaggle_medium_pot:_average|0": { - "qem": 0.15833333333333333, - "qem_stderr": 0.05839926860794023 - }, - "all": { - "qem": 0.15833333333333333, - "qem_stderr": 0.05839926860794023 - } - }, - "versions": { - "custom|aimo_kaggle_medium_pot:v0|0": 0, - "custom|aimo_kaggle_medium_pot:v1|0": 0, - "custom|aimo_kaggle_medium_pot:v2|0": 0 - }, - "config_tasks": { - "custom|aimo_kaggle_medium_pot:v0": { - "name": "aimo_kaggle_medium_pot:v0", - "prompt_function": "kaggle_medium_pot_prompt_fn_v0", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_medium_pot:v1": { - "name": "aimo_kaggle_medium_pot:v1", - "prompt_function": "kaggle_medium_pot_prompt_fn_v1", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_medium_pot:v2": { - "name": "aimo_kaggle_medium_pot:v2", - "prompt_function": "kaggle_medium_pot_prompt_fn_v2", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - } - }, - "summary_tasks": { - "custom|aimo_kaggle_medium_pot:v0|0": { - "hashes": { - "hash_examples": "2799c24461029dc3", - "hash_full_prompts": "de6572b914d5bba6", - "hash_input_tokens": "5af82ee284ccce29", - "hash_cont_tokens": "c0bcec869e50357e" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 29, - "non_padded": 11, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_medium_pot:v1|0": { - "hashes": { - "hash_examples": "806b2e2056b41f84", - "hash_full_prompts": "427d5de6e4d90df2", - "hash_input_tokens": "be3dcf9f8a2350d0", - "hash_cont_tokens": "175d7969de36ab1b" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 26, - "non_padded": 14, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_medium_pot:v2|0": { - "hashes": { - "hash_examples": "d8534375acc5d427", - "hash_full_prompts": "6bb90c129d6d6123", - "hash_input_tokens": "22eeaedbeb3f56f0", - "hash_cont_tokens": "f3c3ab633ac76bd0" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 31, - "non_padded": 9, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "623505a45a4910c2", - "hash_full_prompts": "3aca0fad61a42921", - "hash_input_tokens": "c7d2c8bb76f8ecb1", - "hash_cont_tokens": "ed6a5a9ce37315ba" - }, - "truncated": 120, - "non_truncated": 0, - "padded": 86, - "non_padded": 34, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.05/aimo_kaggle_hard_pot/results_2024-05-27T22-47-26.728707.json b/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.05/aimo_kaggle_hard_pot/results_2024-05-27T22-47-26.728707.json deleted file mode 100644 index 7e3e4f1c9b2d080b0c68c0223faefedd3cdb1d78..0000000000000000000000000000000000000000 --- a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.05/aimo_kaggle_hard_pot/results_2024-05-27T22-47-26.728707.json +++ /dev/null @@ -1,193 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": 4, - "max_samples": null, - "job_id": "", - "start_time": 211019.661163806, - "end_time": 211758.414953224, - "total_evaluation_time_secondes": "738.7537894179986", - "model_name": "AI-MO/deepseek-math-7b-rl-sft", - "model_sha": "1b1c05ec5ff47d465eaaa801c306a11637d140a4", - "model_dtype": "torch.bfloat16", - "model_size": "12.93 GB", - "config": null - }, - "results": { - "custom|aimo_kaggle_hard_pot:v0|0": { - "qem": 0.04, - "qem_stderr": 0.02799416848895062 - }, - "custom|aimo_kaggle_hard_pot:v1|0": { - "qem": 0.36, - "qem_stderr": 0.06857142857142856 - }, - "custom|aimo_kaggle_hard_pot:v2|0": { - "qem": 0.6, - "qem_stderr": 0.06998542122237653 - }, - "custom|aimo_kaggle_hard_pot:_average|0": { - "qem": 0.3333333333333333, - "qem_stderr": 0.0555170060942519 - }, - "all": { - "qem": 0.3333333333333333, - "qem_stderr": 0.0555170060942519 - } - }, - "versions": { - "custom|aimo_kaggle_hard_pot:v0|0": 0, - "custom|aimo_kaggle_hard_pot:v1|0": 0, - "custom|aimo_kaggle_hard_pot:v2|0": 0 - }, - "config_tasks": { - "custom|aimo_kaggle_hard_pot:v0": { - "name": "aimo_kaggle_hard_pot:v0", - "prompt_function": "kaggle_hard_pot_prompt_fn_v0", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_hard_pot:v1": { - "name": "aimo_kaggle_hard_pot:v1", - "prompt_function": "kaggle_hard_pot_prompt_fn_v1", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_hard_pot:v2": { - "name": "aimo_kaggle_hard_pot:v2", - "prompt_function": "kaggle_hard_pot_prompt_fn_v2", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - } - }, - "summary_tasks": { - "custom|aimo_kaggle_hard_pot:v0|0": { - "hashes": { - "hash_examples": "303213a38d9f7512", - "hash_full_prompts": "3c670fa0e80bc301", - "hash_input_tokens": "1bd4187ad6963415", - "hash_cont_tokens": "6756ac049a892978" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 37, - "non_padded": 13, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_hard_pot:v1|0": { - "hashes": { - "hash_examples": "e4234b97ad92862f", - "hash_full_prompts": "09e7759a96f64e59", - "hash_input_tokens": "dc7f68cf14be3d61", - "hash_cont_tokens": "dfdc87594f75c999" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 32, - "non_padded": 18, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_hard_pot:v2|0": { - "hashes": { - "hash_examples": "6396eb8833e13ba0", - "hash_full_prompts": "65f8b95ea99c7087", - "hash_input_tokens": "bddb67459e7601c5", - "hash_cont_tokens": "63fc4eabc1edb20f" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 30, - "non_padded": 20, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "648c9a107d279e1e", - "hash_full_prompts": "7f4c38eb08b3bb41", - "hash_input_tokens": "16f0fd76e0a2a165", - "hash_cont_tokens": "5cc4dd7edcddd58a" - }, - "truncated": 150, - "non_truncated": 0, - "padded": 99, - "non_padded": 51, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.05/aimo_kaggle_medium_pot/results_2024-05-27T22-42-48.691321.json b/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.05/aimo_kaggle_medium_pot/results_2024-05-27T22-42-48.691321.json deleted file mode 100644 index 12b897e24aafdc785de24d7cf18de427e85bfe1e..0000000000000000000000000000000000000000 --- a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.05/aimo_kaggle_medium_pot/results_2024-05-27T22-42-48.691321.json +++ /dev/null @@ -1,193 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": 4, - "max_samples": null, - "job_id": "", - "start_time": 40984.915370342, - "end_time": 41379.707945993, - "total_evaluation_time_secondes": "394.79257565100124", - "model_name": "AI-MO/deepseek-math-7b-rl-sft", - "model_sha": "1b1c05ec5ff47d465eaaa801c306a11637d140a4", - "model_dtype": "torch.bfloat16", - "model_size": "12.93 GB", - "config": null - }, - "results": { - "custom|aimo_kaggle_medium_pot:v0|0": { - "qem": 0.225, - "qem_stderr": 0.06686668711812965 - }, - "custom|aimo_kaggle_medium_pot:v1|0": { - "qem": 0.2, - "qem_stderr": 0.06405126152203486 - }, - "custom|aimo_kaggle_medium_pot:v2|0": { - "qem": 0.175, - "qem_stderr": 0.060843430844447585 - }, - "custom|aimo_kaggle_medium_pot:_average|0": { - "qem": 0.20000000000000004, - "qem_stderr": 0.06392045982820403 - }, - "all": { - "qem": 0.20000000000000004, - "qem_stderr": 0.06392045982820403 - } - }, - "versions": { - "custom|aimo_kaggle_medium_pot:v0|0": 0, - "custom|aimo_kaggle_medium_pot:v1|0": 0, - "custom|aimo_kaggle_medium_pot:v2|0": 0 - }, - "config_tasks": { - "custom|aimo_kaggle_medium_pot:v0": { - "name": "aimo_kaggle_medium_pot:v0", - "prompt_function": "kaggle_medium_pot_prompt_fn_v0", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_medium_pot:v1": { - "name": "aimo_kaggle_medium_pot:v1", - "prompt_function": "kaggle_medium_pot_prompt_fn_v1", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_medium_pot:v2": { - "name": "aimo_kaggle_medium_pot:v2", - "prompt_function": "kaggle_medium_pot_prompt_fn_v2", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - } - }, - "summary_tasks": { - "custom|aimo_kaggle_medium_pot:v0|0": { - "hashes": { - "hash_examples": "2799c24461029dc3", - "hash_full_prompts": "de6572b914d5bba6", - "hash_input_tokens": "5af82ee284ccce29", - "hash_cont_tokens": "a2da112f28b7b3dd" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 29, - "non_padded": 11, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_medium_pot:v1|0": { - "hashes": { - "hash_examples": "806b2e2056b41f84", - "hash_full_prompts": "427d5de6e4d90df2", - "hash_input_tokens": "be3dcf9f8a2350d0", - "hash_cont_tokens": "86ee58a3785c67b7" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 26, - "non_padded": 14, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_medium_pot:v2|0": { - "hashes": { - "hash_examples": "d8534375acc5d427", - "hash_full_prompts": "6bb90c129d6d6123", - "hash_input_tokens": "22eeaedbeb3f56f0", - "hash_cont_tokens": "f4b14a0f9c0626a6" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 31, - "non_padded": 9, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "623505a45a4910c2", - "hash_full_prompts": "3aca0fad61a42921", - "hash_input_tokens": "c7d2c8bb76f8ecb1", - "hash_cont_tokens": "5b986fd2338ac693" - }, - "truncated": 120, - "non_truncated": 0, - "padded": 86, - "non_padded": 34, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.06/aimo_kaggle_hard_pot/results_2024-05-27T23-22-45.715568.json b/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.06/aimo_kaggle_hard_pot/results_2024-05-27T23-22-45.715568.json deleted file mode 100644 index 272f2831aee644d5a4a8f7cb23f773c2b11c16f7..0000000000000000000000000000000000000000 --- a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.06/aimo_kaggle_hard_pot/results_2024-05-27T23-22-45.715568.json +++ /dev/null @@ -1,193 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": 4, - "max_samples": null, - "job_id": "", - "start_time": 2382128.191168791, - "end_time": 2382914.608837258, - "total_evaluation_time_secondes": "786.4176684669219", - "model_name": "AI-MO/deepseek-math-7b-rl-sft", - "model_sha": "86c0089055d54cd0922ae0d3b6a0455e83fe19a1", - "model_dtype": "torch.bfloat16", - "model_size": "12.93 GB", - "config": null - }, - "results": { - "custom|aimo_kaggle_hard_pot:v0|0": { - "qem": 0.04, - "qem_stderr": 0.02799416848895062 - }, - "custom|aimo_kaggle_hard_pot:v1|0": { - "qem": 0.36, - "qem_stderr": 0.06857142857142856 - }, - "custom|aimo_kaggle_hard_pot:v2|0": { - "qem": 0.5, - "qem_stderr": 0.07142857142857142 - }, - "custom|aimo_kaggle_hard_pot:_average|0": { - "qem": 0.3, - "qem_stderr": 0.055998056162983534 - }, - "all": { - "qem": 0.3, - "qem_stderr": 0.055998056162983534 - } - }, - "versions": { - "custom|aimo_kaggle_hard_pot:v0|0": 0, - "custom|aimo_kaggle_hard_pot:v1|0": 0, - "custom|aimo_kaggle_hard_pot:v2|0": 0 - }, - "config_tasks": { - "custom|aimo_kaggle_hard_pot:v0": { - "name": "aimo_kaggle_hard_pot:v0", - "prompt_function": "kaggle_hard_pot_prompt_fn_v0", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_hard_pot:v1": { - "name": "aimo_kaggle_hard_pot:v1", - "prompt_function": "kaggle_hard_pot_prompt_fn_v1", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_hard_pot:v2": { - "name": "aimo_kaggle_hard_pot:v2", - "prompt_function": "kaggle_hard_pot_prompt_fn_v2", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - } - }, - "summary_tasks": { - "custom|aimo_kaggle_hard_pot:v0|0": { - "hashes": { - "hash_examples": "303213a38d9f7512", - "hash_full_prompts": "3c670fa0e80bc301", - "hash_input_tokens": "1bd4187ad6963415", - "hash_cont_tokens": "96626341015b155a" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 37, - "non_padded": 13, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_hard_pot:v1|0": { - "hashes": { - "hash_examples": "e4234b97ad92862f", - "hash_full_prompts": "09e7759a96f64e59", - "hash_input_tokens": "dc7f68cf14be3d61", - "hash_cont_tokens": "944cb6439171b0d6" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 32, - "non_padded": 18, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_hard_pot:v2|0": { - "hashes": { - "hash_examples": "6396eb8833e13ba0", - "hash_full_prompts": "65f8b95ea99c7087", - "hash_input_tokens": "bddb67459e7601c5", - "hash_cont_tokens": "e20041a1a89e4258" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 30, - "non_padded": 20, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "648c9a107d279e1e", - "hash_full_prompts": "7f4c38eb08b3bb41", - "hash_input_tokens": "16f0fd76e0a2a165", - "hash_cont_tokens": "81eae61419effeda" - }, - "truncated": 150, - "non_truncated": 0, - "padded": 99, - "non_padded": 51, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.06/aimo_kaggle_medium_pot/results_2024-05-27T23-16-30.855912.json b/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.06/aimo_kaggle_medium_pot/results_2024-05-27T23-16-30.855912.json deleted file mode 100644 index fcc0ba97fd14d055055c4ced52f673b741f4bf0e..0000000000000000000000000000000000000000 --- a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.06/aimo_kaggle_medium_pot/results_2024-05-27T23-16-30.855912.json +++ /dev/null @@ -1,193 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": 4, - "max_samples": null, - "job_id": "", - "start_time": 44794.384758256, - "end_time": 45210.722487146, - "total_evaluation_time_secondes": "416.337728890001", - "model_name": "AI-MO/deepseek-math-7b-rl-sft", - "model_sha": "86c0089055d54cd0922ae0d3b6a0455e83fe19a1", - "model_dtype": "torch.bfloat16", - "model_size": "12.93 GB", - "config": null - }, - "results": { - "custom|aimo_kaggle_medium_pot:v0|0": { - "qem": 0.175, - "qem_stderr": 0.060843430844447585 - }, - "custom|aimo_kaggle_medium_pot:v1|0": { - "qem": 0.125, - "qem_stderr": 0.05295740910852021 - }, - "custom|aimo_kaggle_medium_pot:v2|0": { - "qem": 0.175, - "qem_stderr": 0.060843430844447564 - }, - "custom|aimo_kaggle_medium_pot:_average|0": { - "qem": 0.15833333333333333, - "qem_stderr": 0.058214756932471794 - }, - "all": { - "qem": 0.15833333333333333, - "qem_stderr": 0.058214756932471794 - } - }, - "versions": { - "custom|aimo_kaggle_medium_pot:v0|0": 0, - "custom|aimo_kaggle_medium_pot:v1|0": 0, - "custom|aimo_kaggle_medium_pot:v2|0": 0 - }, - "config_tasks": { - "custom|aimo_kaggle_medium_pot:v0": { - "name": "aimo_kaggle_medium_pot:v0", - "prompt_function": "kaggle_medium_pot_prompt_fn_v0", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_medium_pot:v1": { - "name": "aimo_kaggle_medium_pot:v1", - "prompt_function": "kaggle_medium_pot_prompt_fn_v1", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_medium_pot:v2": { - "name": "aimo_kaggle_medium_pot:v2", - "prompt_function": "kaggle_medium_pot_prompt_fn_v2", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - } - }, - "summary_tasks": { - "custom|aimo_kaggle_medium_pot:v0|0": { - "hashes": { - "hash_examples": "2799c24461029dc3", - "hash_full_prompts": "de6572b914d5bba6", - "hash_input_tokens": "5af82ee284ccce29", - "hash_cont_tokens": "f64111597c7c5f38" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 29, - "non_padded": 11, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_medium_pot:v1|0": { - "hashes": { - "hash_examples": "806b2e2056b41f84", - "hash_full_prompts": "427d5de6e4d90df2", - "hash_input_tokens": "be3dcf9f8a2350d0", - "hash_cont_tokens": "384432275bdfee30" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 26, - "non_padded": 14, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_medium_pot:v2|0": { - "hashes": { - "hash_examples": "d8534375acc5d427", - "hash_full_prompts": "6bb90c129d6d6123", - "hash_input_tokens": "22eeaedbeb3f56f0", - "hash_cont_tokens": "5cd657852f93ead5" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 31, - "non_padded": 9, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "623505a45a4910c2", - "hash_full_prompts": "3aca0fad61a42921", - "hash_input_tokens": "c7d2c8bb76f8ecb1", - "hash_cont_tokens": "1d9394689bab0335" - }, - "truncated": 120, - "non_truncated": 0, - "padded": 86, - "non_padded": 34, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.07/aimo_kaggle_hard_pot/results_2024-05-27T22-43-29.392522.json b/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.07/aimo_kaggle_hard_pot/results_2024-05-27T22-43-29.392522.json deleted file mode 100644 index 3323bdbd94ba96fa3164ce1bd26ed15e6c478fa0..0000000000000000000000000000000000000000 --- a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.07/aimo_kaggle_hard_pot/results_2024-05-27T22-43-29.392522.json +++ /dev/null @@ -1,193 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": 4, - "max_samples": null, - "job_id": "", - "start_time": 15640.261235424, - "end_time": 16281.29244475, - "total_evaluation_time_secondes": "641.0312093260018", - "model_name": "AI-MO/deepseek-math-7b-rl-sft", - "model_sha": "d572ef7608311c43ad723a3ac133e4503e7303a8", - "model_dtype": "torch.bfloat16", - "model_size": "12.93 GB", - "config": null - }, - "results": { - "custom|aimo_kaggle_hard_pot:v0|0": { - "qem": 0.02, - "qem_stderr": 0.02 - }, - "custom|aimo_kaggle_hard_pot:v1|0": { - "qem": 0.22, - "qem_stderr": 0.05917804336345138 - }, - "custom|aimo_kaggle_hard_pot:v2|0": { - "qem": 0.56, - "qem_stderr": 0.07091242083423345 - }, - "custom|aimo_kaggle_hard_pot:_average|0": { - "qem": 0.26666666666666666, - "qem_stderr": 0.050030154732561616 - }, - "all": { - "qem": 0.26666666666666666, - "qem_stderr": 0.050030154732561616 - } - }, - "versions": { - "custom|aimo_kaggle_hard_pot:v0|0": 0, - "custom|aimo_kaggle_hard_pot:v1|0": 0, - "custom|aimo_kaggle_hard_pot:v2|0": 0 - }, - "config_tasks": { - "custom|aimo_kaggle_hard_pot:v0": { - "name": "aimo_kaggle_hard_pot:v0", - "prompt_function": "kaggle_hard_pot_prompt_fn_v0", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_hard_pot:v1": { - "name": "aimo_kaggle_hard_pot:v1", - "prompt_function": "kaggle_hard_pot_prompt_fn_v1", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_hard_pot:v2": { - "name": "aimo_kaggle_hard_pot:v2", - "prompt_function": "kaggle_hard_pot_prompt_fn_v2", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - } - }, - "summary_tasks": { - "custom|aimo_kaggle_hard_pot:v0|0": { - "hashes": { - "hash_examples": "303213a38d9f7512", - "hash_full_prompts": "3c670fa0e80bc301", - "hash_input_tokens": "1bd4187ad6963415", - "hash_cont_tokens": "e8d46b691518bf31" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 37, - "non_padded": 13, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_hard_pot:v1|0": { - "hashes": { - "hash_examples": "e4234b97ad92862f", - "hash_full_prompts": "09e7759a96f64e59", - "hash_input_tokens": "dc7f68cf14be3d61", - "hash_cont_tokens": "96ef4d972ae45fd3" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 32, - "non_padded": 18, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_hard_pot:v2|0": { - "hashes": { - "hash_examples": "6396eb8833e13ba0", - "hash_full_prompts": "65f8b95ea99c7087", - "hash_input_tokens": "bddb67459e7601c5", - "hash_cont_tokens": "de4121a908cc86cf" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 30, - "non_padded": 20, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "648c9a107d279e1e", - "hash_full_prompts": "7f4c38eb08b3bb41", - "hash_input_tokens": "16f0fd76e0a2a165", - "hash_cont_tokens": "974e7c9b71028589" - }, - "truncated": 150, - "non_truncated": 0, - "padded": 99, - "non_padded": 51, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.07/aimo_kaggle_medium_pot/results_2024-05-27T22-39-34.119815.json b/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.07/aimo_kaggle_medium_pot/results_2024-05-27T22-39-34.119815.json deleted file mode 100644 index 32186650463877fba5a2d4cb274bbf6ee0d1755d..0000000000000000000000000000000000000000 --- a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.07/aimo_kaggle_medium_pot/results_2024-05-27T22-39-34.119815.json +++ /dev/null @@ -1,193 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": 4, - "max_samples": null, - "job_id": "", - "start_time": 106589.039780829, - "end_time": 106953.051437736, - "total_evaluation_time_secondes": "364.0116569069942", - "model_name": "AI-MO/deepseek-math-7b-rl-sft", - "model_sha": "d572ef7608311c43ad723a3ac133e4503e7303a8", - "model_dtype": "torch.bfloat16", - "model_size": "12.93 GB", - "config": null - }, - "results": { - "custom|aimo_kaggle_medium_pot:v0|0": { - "qem": 0.0, - "qem_stderr": 0.0 - }, - "custom|aimo_kaggle_medium_pot:v1|0": { - "qem": 0.1, - "qem_stderr": 0.04803844614152612 - }, - "custom|aimo_kaggle_medium_pot:v2|0": { - "qem": 0.075, - "qem_stderr": 0.04217636961434867 - }, - "custom|aimo_kaggle_medium_pot:_average|0": { - "qem": 0.05833333333333333, - "qem_stderr": 0.03007160525195826 - }, - "all": { - "qem": 0.05833333333333333, - "qem_stderr": 0.03007160525195826 - } - }, - "versions": { - "custom|aimo_kaggle_medium_pot:v0|0": 0, - "custom|aimo_kaggle_medium_pot:v1|0": 0, - "custom|aimo_kaggle_medium_pot:v2|0": 0 - }, - "config_tasks": { - "custom|aimo_kaggle_medium_pot:v0": { - "name": "aimo_kaggle_medium_pot:v0", - "prompt_function": "kaggle_medium_pot_prompt_fn_v0", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_medium_pot:v1": { - "name": "aimo_kaggle_medium_pot:v1", - "prompt_function": "kaggle_medium_pot_prompt_fn_v1", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_medium_pot:v2": { - "name": "aimo_kaggle_medium_pot:v2", - "prompt_function": "kaggle_medium_pot_prompt_fn_v2", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - } - }, - "summary_tasks": { - "custom|aimo_kaggle_medium_pot:v0|0": { - "hashes": { - "hash_examples": "2799c24461029dc3", - "hash_full_prompts": "de6572b914d5bba6", - "hash_input_tokens": "5af82ee284ccce29", - "hash_cont_tokens": "af7a2a7c3bf735ce" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 29, - "non_padded": 11, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_medium_pot:v1|0": { - "hashes": { - "hash_examples": "806b2e2056b41f84", - "hash_full_prompts": "427d5de6e4d90df2", - "hash_input_tokens": "be3dcf9f8a2350d0", - "hash_cont_tokens": "3492201abfa537d3" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 26, - "non_padded": 14, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_medium_pot:v2|0": { - "hashes": { - "hash_examples": "d8534375acc5d427", - "hash_full_prompts": "6bb90c129d6d6123", - "hash_input_tokens": "22eeaedbeb3f56f0", - "hash_cont_tokens": "b23881bd7634dfdf" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 31, - "non_padded": 9, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "623505a45a4910c2", - "hash_full_prompts": "3aca0fad61a42921", - "hash_input_tokens": "c7d2c8bb76f8ecb1", - "hash_cont_tokens": "37a711876b56979b" - }, - "truncated": 120, - "non_truncated": 0, - "padded": 86, - "non_padded": 34, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.08/aimo_kaggle_hard_pot/results_2024-05-27T22-45-02.958180.json b/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.08/aimo_kaggle_hard_pot/results_2024-05-27T22-45-02.958180.json deleted file mode 100644 index 23556a2bf5721667558f00fb9d86974c1e7d37f6..0000000000000000000000000000000000000000 --- a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.08/aimo_kaggle_hard_pot/results_2024-05-27T22-45-02.958180.json +++ /dev/null @@ -1,193 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": 4, - "max_samples": null, - "job_id": "", - "start_time": 3181770.247112947, - "end_time": 3182453.504957875, - "total_evaluation_time_secondes": "683.2578449277207", - "model_name": "AI-MO/deepseek-math-7b-rl-sft", - "model_sha": "dc1193beda83f0cca3b7e60a158c36f6804af333", - "model_dtype": "torch.bfloat16", - "model_size": "12.93 GB", - "config": null - }, - "results": { - "custom|aimo_kaggle_hard_pot:v0|0": { - "qem": 0.04, - "qem_stderr": 0.027994168488950606 - }, - "custom|aimo_kaggle_hard_pot:v1|0": { - "qem": 0.1, - "qem_stderr": 0.04285714285714285 - }, - "custom|aimo_kaggle_hard_pot:v2|0": { - "qem": 0.34, - "qem_stderr": 0.0676726816132972 - }, - "custom|aimo_kaggle_hard_pot:_average|0": { - "qem": 0.16, - "qem_stderr": 0.04617466431979689 - }, - "all": { - "qem": 0.16, - "qem_stderr": 0.04617466431979689 - } - }, - "versions": { - "custom|aimo_kaggle_hard_pot:v0|0": 0, - "custom|aimo_kaggle_hard_pot:v1|0": 0, - "custom|aimo_kaggle_hard_pot:v2|0": 0 - }, - "config_tasks": { - "custom|aimo_kaggle_hard_pot:v0": { - "name": "aimo_kaggle_hard_pot:v0", - "prompt_function": "kaggle_hard_pot_prompt_fn_v0", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_hard_pot:v1": { - "name": "aimo_kaggle_hard_pot:v1", - "prompt_function": "kaggle_hard_pot_prompt_fn_v1", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_hard_pot:v2": { - "name": "aimo_kaggle_hard_pot:v2", - "prompt_function": "kaggle_hard_pot_prompt_fn_v2", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - } - }, - "summary_tasks": { - "custom|aimo_kaggle_hard_pot:v0|0": { - "hashes": { - "hash_examples": "303213a38d9f7512", - "hash_full_prompts": "3c670fa0e80bc301", - "hash_input_tokens": "1bd4187ad6963415", - "hash_cont_tokens": "aec6eb9abe870d9c" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 37, - "non_padded": 13, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_hard_pot:v1|0": { - "hashes": { - "hash_examples": "e4234b97ad92862f", - "hash_full_prompts": "09e7759a96f64e59", - "hash_input_tokens": "dc7f68cf14be3d61", - "hash_cont_tokens": "f5a8786338ef07ef" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 32, - "non_padded": 18, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_hard_pot:v2|0": { - "hashes": { - "hash_examples": "6396eb8833e13ba0", - "hash_full_prompts": "65f8b95ea99c7087", - "hash_input_tokens": "bddb67459e7601c5", - "hash_cont_tokens": "19cc51ae4f8a174c" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 30, - "non_padded": 20, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "648c9a107d279e1e", - "hash_full_prompts": "7f4c38eb08b3bb41", - "hash_input_tokens": "16f0fd76e0a2a165", - "hash_cont_tokens": "0bced5d39b0648e2" - }, - "truncated": 150, - "non_truncated": 0, - "padded": 99, - "non_padded": 51, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.08/aimo_kaggle_medium_pot/results_2024-05-27T22-39-55.232091.json b/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.08/aimo_kaggle_medium_pot/results_2024-05-27T22-39-55.232091.json deleted file mode 100644 index 5cb61ba3d46e813abc529fa46541840ecc8995e0..0000000000000000000000000000000000000000 --- a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.08/aimo_kaggle_medium_pot/results_2024-05-27T22-39-55.232091.json +++ /dev/null @@ -1,193 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": 4, - "max_samples": null, - "job_id": "", - "start_time": 10322.250336088, - "end_time": 10676.245850647, - "total_evaluation_time_secondes": "353.9955145589993", - "model_name": "AI-MO/deepseek-math-7b-rl-sft", - "model_sha": "dc1193beda83f0cca3b7e60a158c36f6804af333", - "model_dtype": "torch.bfloat16", - "model_size": "12.93 GB", - "config": null - }, - "results": { - "custom|aimo_kaggle_medium_pot:v0|0": { - "qem": 0.0, - "qem_stderr": 0.0 - }, - "custom|aimo_kaggle_medium_pot:v1|0": { - "qem": 0.0, - "qem_stderr": 0.0 - }, - "custom|aimo_kaggle_medium_pot:v2|0": { - "qem": 0.1, - "qem_stderr": 0.048038446141526144 - }, - "custom|aimo_kaggle_medium_pot:_average|0": { - "qem": 0.03333333333333333, - "qem_stderr": 0.016012815380508715 - }, - "all": { - "qem": 0.03333333333333333, - "qem_stderr": 0.016012815380508715 - } - }, - "versions": { - "custom|aimo_kaggle_medium_pot:v0|0": 0, - "custom|aimo_kaggle_medium_pot:v1|0": 0, - "custom|aimo_kaggle_medium_pot:v2|0": 0 - }, - "config_tasks": { - "custom|aimo_kaggle_medium_pot:v0": { - "name": "aimo_kaggle_medium_pot:v0", - "prompt_function": "kaggle_medium_pot_prompt_fn_v0", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_medium_pot:v1": { - "name": "aimo_kaggle_medium_pot:v1", - "prompt_function": "kaggle_medium_pot_prompt_fn_v1", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_medium_pot:v2": { - "name": "aimo_kaggle_medium_pot:v2", - "prompt_function": "kaggle_medium_pot_prompt_fn_v2", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - } - }, - "summary_tasks": { - "custom|aimo_kaggle_medium_pot:v0|0": { - "hashes": { - "hash_examples": "2799c24461029dc3", - "hash_full_prompts": "de6572b914d5bba6", - "hash_input_tokens": "5af82ee284ccce29", - "hash_cont_tokens": "f6ef25c13e3b9beb" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 29, - "non_padded": 11, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_medium_pot:v1|0": { - "hashes": { - "hash_examples": "806b2e2056b41f84", - "hash_full_prompts": "427d5de6e4d90df2", - "hash_input_tokens": "be3dcf9f8a2350d0", - "hash_cont_tokens": "d2e29a088eb3ef40" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 26, - "non_padded": 14, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_medium_pot:v2|0": { - "hashes": { - "hash_examples": "d8534375acc5d427", - "hash_full_prompts": "6bb90c129d6d6123", - "hash_input_tokens": "22eeaedbeb3f56f0", - "hash_cont_tokens": "05fe807135396e54" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 31, - "non_padded": 9, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "623505a45a4910c2", - "hash_full_prompts": "3aca0fad61a42921", - "hash_input_tokens": "c7d2c8bb76f8ecb1", - "hash_cont_tokens": "42dc377e96ff07f4" - }, - "truncated": 120, - "non_truncated": 0, - "padded": 86, - "non_padded": 34, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.09/aimo_kaggle_hard_pot/results_2024-05-27T23-01-23.916419.json b/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.09/aimo_kaggle_hard_pot/results_2024-05-27T23-01-23.916419.json deleted file mode 100644 index 166851935007b266ac5d3f79f45f83d01edc0c5c..0000000000000000000000000000000000000000 --- a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.09/aimo_kaggle_hard_pot/results_2024-05-27T23-01-23.916419.json +++ /dev/null @@ -1,193 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": 4, - "max_samples": null, - "job_id": "", - "start_time": 45437.752892193, - "end_time": 46158.771446592, - "total_evaluation_time_secondes": "721.018554399001", - "model_name": "AI-MO/deepseek-math-7b-rl-sft", - "model_sha": "4c5a6b0364eeff00e757545ad918edf9938087d7", - "model_dtype": "torch.bfloat16", - "model_size": "12.93 GB", - "config": null - }, - "results": { - "custom|aimo_kaggle_hard_pot:v0|0": { - "qem": 0.04, - "qem_stderr": 0.02799416848895062 - }, - "custom|aimo_kaggle_hard_pot:v1|0": { - "qem": 0.38, - "qem_stderr": 0.06934092056863767 - }, - "custom|aimo_kaggle_hard_pot:v2|0": { - "qem": 0.5, - "qem_stderr": 0.07142857142857142 - }, - "custom|aimo_kaggle_hard_pot:_average|0": { - "qem": 0.30666666666666664, - "qem_stderr": 0.05625455349538657 - }, - "all": { - "qem": 0.30666666666666664, - "qem_stderr": 0.05625455349538657 - } - }, - "versions": { - "custom|aimo_kaggle_hard_pot:v0|0": 0, - "custom|aimo_kaggle_hard_pot:v1|0": 0, - "custom|aimo_kaggle_hard_pot:v2|0": 0 - }, - "config_tasks": { - "custom|aimo_kaggle_hard_pot:v0": { - "name": "aimo_kaggle_hard_pot:v0", - "prompt_function": "kaggle_hard_pot_prompt_fn_v0", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_hard_pot:v1": { - "name": "aimo_kaggle_hard_pot:v1", - "prompt_function": "kaggle_hard_pot_prompt_fn_v1", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_hard_pot:v2": { - "name": "aimo_kaggle_hard_pot:v2", - "prompt_function": "kaggle_hard_pot_prompt_fn_v2", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - } - }, - "summary_tasks": { - "custom|aimo_kaggle_hard_pot:v0|0": { - "hashes": { - "hash_examples": "303213a38d9f7512", - "hash_full_prompts": "3c670fa0e80bc301", - "hash_input_tokens": "1bd4187ad6963415", - "hash_cont_tokens": "ee8d8deb951e1d20" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 37, - "non_padded": 13, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_hard_pot:v1|0": { - "hashes": { - "hash_examples": "e4234b97ad92862f", - "hash_full_prompts": "09e7759a96f64e59", - "hash_input_tokens": "dc7f68cf14be3d61", - "hash_cont_tokens": "4873c8bc9c937aea" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 32, - "non_padded": 18, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_hard_pot:v2|0": { - "hashes": { - "hash_examples": "6396eb8833e13ba0", - "hash_full_prompts": "65f8b95ea99c7087", - "hash_input_tokens": "bddb67459e7601c5", - "hash_cont_tokens": "0f9b2a78a3b11cef" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 30, - "non_padded": 20, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "648c9a107d279e1e", - "hash_full_prompts": "7f4c38eb08b3bb41", - "hash_input_tokens": "16f0fd76e0a2a165", - "hash_cont_tokens": "28102952776d50f9" - }, - "truncated": 150, - "non_truncated": 0, - "padded": 99, - "non_padded": 51, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.09/aimo_kaggle_medium_pot/results_2024-05-27T22-56-49.561125.json b/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.09/aimo_kaggle_medium_pot/results_2024-05-27T22-56-49.561125.json deleted file mode 100644 index 596445d5c4d2d2bb097f8204c0973f11b29dcd71..0000000000000000000000000000000000000000 --- a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.09/aimo_kaggle_medium_pot/results_2024-05-27T22-56-49.561125.json +++ /dev/null @@ -1,193 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": 4, - "max_samples": null, - "job_id": "", - "start_time": 43584.407888815, - "end_time": 44029.427719081, - "total_evaluation_time_secondes": "445.0198302659992", - "model_name": "AI-MO/deepseek-math-7b-rl-sft", - "model_sha": "4c5a6b0364eeff00e757545ad918edf9938087d7", - "model_dtype": "torch.bfloat16", - "model_size": "12.93 GB", - "config": null - }, - "results": { - "custom|aimo_kaggle_medium_pot:v0|0": { - "qem": 0.1, - "qem_stderr": 0.04803844614152612 - }, - "custom|aimo_kaggle_medium_pot:v1|0": { - "qem": 0.1, - "qem_stderr": 0.04803844614152612 - }, - "custom|aimo_kaggle_medium_pot:v2|0": { - "qem": 0.125, - "qem_stderr": 0.05295740910852021 - }, - "custom|aimo_kaggle_medium_pot:_average|0": { - "qem": 0.10833333333333334, - "qem_stderr": 0.04967810046385748 - }, - "all": { - "qem": 0.10833333333333334, - "qem_stderr": 0.04967810046385748 - } - }, - "versions": { - "custom|aimo_kaggle_medium_pot:v0|0": 0, - "custom|aimo_kaggle_medium_pot:v1|0": 0, - "custom|aimo_kaggle_medium_pot:v2|0": 0 - }, - "config_tasks": { - "custom|aimo_kaggle_medium_pot:v0": { - "name": "aimo_kaggle_medium_pot:v0", - "prompt_function": "kaggle_medium_pot_prompt_fn_v0", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_medium_pot:v1": { - "name": "aimo_kaggle_medium_pot:v1", - "prompt_function": "kaggle_medium_pot_prompt_fn_v1", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_medium_pot:v2": { - "name": "aimo_kaggle_medium_pot:v2", - "prompt_function": "kaggle_medium_pot_prompt_fn_v2", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - } - }, - "summary_tasks": { - "custom|aimo_kaggle_medium_pot:v0|0": { - "hashes": { - "hash_examples": "2799c24461029dc3", - "hash_full_prompts": "de6572b914d5bba6", - "hash_input_tokens": "5af82ee284ccce29", - "hash_cont_tokens": "fe53ab7e7343a382" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 29, - "non_padded": 11, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_medium_pot:v1|0": { - "hashes": { - "hash_examples": "806b2e2056b41f84", - "hash_full_prompts": "427d5de6e4d90df2", - "hash_input_tokens": "be3dcf9f8a2350d0", - "hash_cont_tokens": "afec954f4fad91f4" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 26, - "non_padded": 14, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_medium_pot:v2|0": { - "hashes": { - "hash_examples": "d8534375acc5d427", - "hash_full_prompts": "6bb90c129d6d6123", - "hash_input_tokens": "22eeaedbeb3f56f0", - "hash_cont_tokens": "47fe73a6dd818fe2" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 31, - "non_padded": 9, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "623505a45a4910c2", - "hash_full_prompts": "3aca0fad61a42921", - "hash_input_tokens": "c7d2c8bb76f8ecb1", - "hash_cont_tokens": "8efd6f1fe2208bcb" - }, - "truncated": 120, - "non_truncated": 0, - "padded": 86, - "non_padded": 34, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.10/aimo_kaggle_hard_pot/results_2024-05-27T22-56-26.205133.json b/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.10/aimo_kaggle_hard_pot/results_2024-05-27T22-56-26.205133.json deleted file mode 100644 index 0275826d059f8d6bd2c2172e651f40c710073e8f..0000000000000000000000000000000000000000 --- a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.10/aimo_kaggle_hard_pot/results_2024-05-27T22-56-26.205133.json +++ /dev/null @@ -1,193 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": 4, - "max_samples": null, - "job_id": "", - "start_time": 16333.701216751, - "end_time": 17058.105147357, - "total_evaluation_time_secondes": "724.4039306060004", - "model_name": "AI-MO/deepseek-math-7b-rl-sft", - "model_sha": "18af21a11f19e82ae0b5c4807f6061cb2eb1458b", - "model_dtype": "torch.bfloat16", - "model_size": "12.93 GB", - "config": null - }, - "results": { - "custom|aimo_kaggle_hard_pot:v0|0": { - "qem": 0.06, - "qem_stderr": 0.0339266916772512 - }, - "custom|aimo_kaggle_hard_pot:v1|0": { - "qem": 0.44, - "qem_stderr": 0.07091242083423345 - }, - "custom|aimo_kaggle_hard_pot:v2|0": { - "qem": 0.6, - "qem_stderr": 0.06998542122237653 - }, - "custom|aimo_kaggle_hard_pot:_average|0": { - "qem": 0.3666666666666667, - "qem_stderr": 0.058274844577953726 - }, - "all": { - "qem": 0.3666666666666667, - "qem_stderr": 0.058274844577953726 - } - }, - "versions": { - "custom|aimo_kaggle_hard_pot:v0|0": 0, - "custom|aimo_kaggle_hard_pot:v1|0": 0, - "custom|aimo_kaggle_hard_pot:v2|0": 0 - }, - "config_tasks": { - "custom|aimo_kaggle_hard_pot:v0": { - "name": "aimo_kaggle_hard_pot:v0", - "prompt_function": "kaggle_hard_pot_prompt_fn_v0", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_hard_pot:v1": { - "name": "aimo_kaggle_hard_pot:v1", - "prompt_function": "kaggle_hard_pot_prompt_fn_v1", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_hard_pot:v2": { - "name": "aimo_kaggle_hard_pot:v2", - "prompt_function": "kaggle_hard_pot_prompt_fn_v2", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - } - }, - "summary_tasks": { - "custom|aimo_kaggle_hard_pot:v0|0": { - "hashes": { - "hash_examples": "303213a38d9f7512", - "hash_full_prompts": "3c670fa0e80bc301", - "hash_input_tokens": "1bd4187ad6963415", - "hash_cont_tokens": "28667310ce047385" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 37, - "non_padded": 13, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_hard_pot:v1|0": { - "hashes": { - "hash_examples": "e4234b97ad92862f", - "hash_full_prompts": "09e7759a96f64e59", - "hash_input_tokens": "dc7f68cf14be3d61", - "hash_cont_tokens": "20523b0a9736ca9a" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 32, - "non_padded": 18, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_hard_pot:v2|0": { - "hashes": { - "hash_examples": "6396eb8833e13ba0", - "hash_full_prompts": "65f8b95ea99c7087", - "hash_input_tokens": "bddb67459e7601c5", - "hash_cont_tokens": "fa8e50486f1f0438" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 30, - "non_padded": 20, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "648c9a107d279e1e", - "hash_full_prompts": "7f4c38eb08b3bb41", - "hash_input_tokens": "16f0fd76e0a2a165", - "hash_cont_tokens": "f27ba9fc232dd906" - }, - "truncated": 150, - "non_truncated": 0, - "padded": 99, - "non_padded": 51, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.10/aimo_kaggle_medium_pot/results_2024-05-27T22-52-23.993586.json b/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.10/aimo_kaggle_medium_pot/results_2024-05-27T22-52-23.993586.json deleted file mode 100644 index 86913cfe3cdc6150b250722e69a34050cfd7a726..0000000000000000000000000000000000000000 --- a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.10/aimo_kaggle_medium_pot/results_2024-05-27T22-52-23.993586.json +++ /dev/null @@ -1,193 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": 4, - "max_samples": null, - "job_id": "", - "start_time": 3182508.308928725, - "end_time": 3182894.568337251, - "total_evaluation_time_secondes": "386.25940852612257", - "model_name": "AI-MO/deepseek-math-7b-rl-sft", - "model_sha": "18af21a11f19e82ae0b5c4807f6061cb2eb1458b", - "model_dtype": "torch.bfloat16", - "model_size": "12.93 GB", - "config": null - }, - "results": { - "custom|aimo_kaggle_medium_pot:v0|0": { - "qem": 0.25, - "qem_stderr": 0.06933752452815363 - }, - "custom|aimo_kaggle_medium_pot:v1|0": { - "qem": 0.15, - "qem_stderr": 0.05717718748968655 - }, - "custom|aimo_kaggle_medium_pot:v2|0": { - "qem": 0.175, - "qem_stderr": 0.060843430844447585 - }, - "custom|aimo_kaggle_medium_pot:_average|0": { - "qem": 0.19166666666666665, - "qem_stderr": 0.062452714287429247 - }, - "all": { - "qem": 0.19166666666666665, - "qem_stderr": 0.062452714287429247 - } - }, - "versions": { - "custom|aimo_kaggle_medium_pot:v0|0": 0, - "custom|aimo_kaggle_medium_pot:v1|0": 0, - "custom|aimo_kaggle_medium_pot:v2|0": 0 - }, - "config_tasks": { - "custom|aimo_kaggle_medium_pot:v0": { - "name": "aimo_kaggle_medium_pot:v0", - "prompt_function": "kaggle_medium_pot_prompt_fn_v0", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_medium_pot:v1": { - "name": "aimo_kaggle_medium_pot:v1", - "prompt_function": "kaggle_medium_pot_prompt_fn_v1", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_medium_pot:v2": { - "name": "aimo_kaggle_medium_pot:v2", - "prompt_function": "kaggle_medium_pot_prompt_fn_v2", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - } - }, - "summary_tasks": { - "custom|aimo_kaggle_medium_pot:v0|0": { - "hashes": { - "hash_examples": "2799c24461029dc3", - "hash_full_prompts": "de6572b914d5bba6", - "hash_input_tokens": "5af82ee284ccce29", - "hash_cont_tokens": "33e1217843971331" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 29, - "non_padded": 11, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_medium_pot:v1|0": { - "hashes": { - "hash_examples": "806b2e2056b41f84", - "hash_full_prompts": "427d5de6e4d90df2", - "hash_input_tokens": "be3dcf9f8a2350d0", - "hash_cont_tokens": "0317f6719f9f3cd1" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 26, - "non_padded": 14, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_medium_pot:v2|0": { - "hashes": { - "hash_examples": "d8534375acc5d427", - "hash_full_prompts": "6bb90c129d6d6123", - "hash_input_tokens": "22eeaedbeb3f56f0", - "hash_cont_tokens": "847c8e8d9dcacdd4" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 31, - "non_padded": 9, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "623505a45a4910c2", - "hash_full_prompts": "3aca0fad61a42921", - "hash_input_tokens": "c7d2c8bb76f8ecb1", - "hash_cont_tokens": "4d3a9d65f57acd85" - }, - "truncated": 120, - "non_truncated": 0, - "padded": 86, - "non_padded": 34, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.11/aimo_kaggle_hard_pot/results_2024-05-27T23-28-10.769733.json b/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.11/aimo_kaggle_hard_pot/results_2024-05-27T23-28-10.769733.json deleted file mode 100644 index 9aa0ca26c770c46761b9c674864611df1b0c70b5..0000000000000000000000000000000000000000 --- a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.11/aimo_kaggle_hard_pot/results_2024-05-27T23-28-10.769733.json +++ /dev/null @@ -1,193 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": 4, - "max_samples": null, - "job_id": "", - "start_time": 109144.315993371, - "end_time": 109869.701287677, - "total_evaluation_time_secondes": "725.3852943060047", - "model_name": "AI-MO/deepseek-math-7b-rl-sft", - "model_sha": "5fbd6d2b74f2755ce0b32890432d6af62410c3e2", - "model_dtype": "torch.bfloat16", - "model_size": "12.93 GB", - "config": null - }, - "results": { - "custom|aimo_kaggle_hard_pot:v0|0": { - "qem": 0.12, - "qem_stderr": 0.046423076597919784 - }, - "custom|aimo_kaggle_hard_pot:v1|0": { - "qem": 0.34, - "qem_stderr": 0.0676726816132972 - }, - "custom|aimo_kaggle_hard_pot:v2|0": { - "qem": 0.52, - "qem_stderr": 0.07137140569598172 - }, - "custom|aimo_kaggle_hard_pot:_average|0": { - "qem": 0.32666666666666666, - "qem_stderr": 0.06182238796906623 - }, - "all": { - "qem": 0.32666666666666666, - "qem_stderr": 0.06182238796906623 - } - }, - "versions": { - "custom|aimo_kaggle_hard_pot:v0|0": 0, - "custom|aimo_kaggle_hard_pot:v1|0": 0, - "custom|aimo_kaggle_hard_pot:v2|0": 0 - }, - "config_tasks": { - "custom|aimo_kaggle_hard_pot:v0": { - "name": "aimo_kaggle_hard_pot:v0", - "prompt_function": "kaggle_hard_pot_prompt_fn_v0", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_hard_pot:v1": { - "name": "aimo_kaggle_hard_pot:v1", - "prompt_function": "kaggle_hard_pot_prompt_fn_v1", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_hard_pot:v2": { - "name": "aimo_kaggle_hard_pot:v2", - "prompt_function": "kaggle_hard_pot_prompt_fn_v2", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - } - }, - "summary_tasks": { - "custom|aimo_kaggle_hard_pot:v0|0": { - "hashes": { - "hash_examples": "303213a38d9f7512", - "hash_full_prompts": "3c670fa0e80bc301", - "hash_input_tokens": "1bd4187ad6963415", - "hash_cont_tokens": "e730032197dfa045" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 37, - "non_padded": 13, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_hard_pot:v1|0": { - "hashes": { - "hash_examples": "e4234b97ad92862f", - "hash_full_prompts": "09e7759a96f64e59", - "hash_input_tokens": "dc7f68cf14be3d61", - "hash_cont_tokens": "7b27c9de11d4cbd5" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 32, - "non_padded": 18, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_hard_pot:v2|0": { - "hashes": { - "hash_examples": "6396eb8833e13ba0", - "hash_full_prompts": "65f8b95ea99c7087", - "hash_input_tokens": "bddb67459e7601c5", - "hash_cont_tokens": "dc5b74898fd59f58" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 30, - "non_padded": 20, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "648c9a107d279e1e", - "hash_full_prompts": "7f4c38eb08b3bb41", - "hash_input_tokens": "16f0fd76e0a2a165", - "hash_cont_tokens": "b759b18b73f3f7fe" - }, - "truncated": 150, - "non_truncated": 0, - "padded": 99, - "non_padded": 51, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.11/aimo_kaggle_medium_pot/results_2024-05-27T23-24-01.746924.json b/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.11/aimo_kaggle_medium_pot/results_2024-05-27T23-24-01.746924.json deleted file mode 100644 index 3e438b57b9c7b55a7630ccdd37fd30b15255f052..0000000000000000000000000000000000000000 --- a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.11/aimo_kaggle_medium_pot/results_2024-05-27T23-24-01.746924.json +++ /dev/null @@ -1,193 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": 4, - "max_samples": null, - "job_id": "", - "start_time": 45275.122268848, - "end_time": 45661.613526662, - "total_evaluation_time_secondes": "386.49125781399925", - "model_name": "AI-MO/deepseek-math-7b-rl-sft", - "model_sha": "5fbd6d2b74f2755ce0b32890432d6af62410c3e2", - "model_dtype": "torch.bfloat16", - "model_size": "12.93 GB", - "config": null - }, - "results": { - "custom|aimo_kaggle_medium_pot:v0|0": { - "qem": 0.125, - "qem_stderr": 0.05295740910852021 - }, - "custom|aimo_kaggle_medium_pot:v1|0": { - "qem": 0.125, - "qem_stderr": 0.05295740910852021 - }, - "custom|aimo_kaggle_medium_pot:v2|0": { - "qem": 0.225, - "qem_stderr": 0.06686668711812965 - }, - "custom|aimo_kaggle_medium_pot:_average|0": { - "qem": 0.15833333333333333, - "qem_stderr": 0.057593835111723356 - }, - "all": { - "qem": 0.15833333333333333, - "qem_stderr": 0.057593835111723356 - } - }, - "versions": { - "custom|aimo_kaggle_medium_pot:v0|0": 0, - "custom|aimo_kaggle_medium_pot:v1|0": 0, - "custom|aimo_kaggle_medium_pot:v2|0": 0 - }, - "config_tasks": { - "custom|aimo_kaggle_medium_pot:v0": { - "name": "aimo_kaggle_medium_pot:v0", - "prompt_function": "kaggle_medium_pot_prompt_fn_v0", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_medium_pot:v1": { - "name": "aimo_kaggle_medium_pot:v1", - "prompt_function": "kaggle_medium_pot_prompt_fn_v1", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_medium_pot:v2": { - "name": "aimo_kaggle_medium_pot:v2", - "prompt_function": "kaggle_medium_pot_prompt_fn_v2", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - } - }, - "summary_tasks": { - "custom|aimo_kaggle_medium_pot:v0|0": { - "hashes": { - "hash_examples": "2799c24461029dc3", - "hash_full_prompts": "de6572b914d5bba6", - "hash_input_tokens": "5af82ee284ccce29", - "hash_cont_tokens": "fa3b910dfca55474" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 29, - "non_padded": 11, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_medium_pot:v1|0": { - "hashes": { - "hash_examples": "806b2e2056b41f84", - "hash_full_prompts": "427d5de6e4d90df2", - "hash_input_tokens": "be3dcf9f8a2350d0", - "hash_cont_tokens": "b1ae92916dde27d0" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 26, - "non_padded": 14, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_medium_pot:v2|0": { - "hashes": { - "hash_examples": "d8534375acc5d427", - "hash_full_prompts": "6bb90c129d6d6123", - "hash_input_tokens": "22eeaedbeb3f56f0", - "hash_cont_tokens": "2febceafe1d903ce" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 31, - "non_padded": 9, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "623505a45a4910c2", - "hash_full_prompts": "3aca0fad61a42921", - "hash_input_tokens": "c7d2c8bb76f8ecb1", - "hash_cont_tokens": "4e6bfc37d6e77d6a" - }, - "truncated": 120, - "non_truncated": 0, - "padded": 86, - "non_padded": 34, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.12/aimo_kaggle_hard_pot/results_2024-05-27T22-58-24.076261.json b/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.12/aimo_kaggle_hard_pot/results_2024-05-27T22-58-24.076261.json deleted file mode 100644 index b59ef0203f57892a5a93d737276861c0dc753236..0000000000000000000000000000000000000000 --- a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.12/aimo_kaggle_hard_pot/results_2024-05-27T22-58-24.076261.json +++ /dev/null @@ -1,193 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": 4, - "max_samples": null, - "job_id": "", - "start_time": 107404.931245741, - "end_time": 108083.007317716, - "total_evaluation_time_secondes": "678.076071974996", - "model_name": "AI-MO/deepseek-math-7b-rl-sft", - "model_sha": "651246d76fe6c8aedf46454ea474eca76136ab73", - "model_dtype": "torch.bfloat16", - "model_size": "12.93 GB", - "config": null - }, - "results": { - "custom|aimo_kaggle_hard_pot:v0|0": { - "qem": 0.04, - "qem_stderr": 0.02799416848895062 - }, - "custom|aimo_kaggle_hard_pot:v1|0": { - "qem": 0.3, - "qem_stderr": 0.06546536707079771 - }, - "custom|aimo_kaggle_hard_pot:v2|0": { - "qem": 0.52, - "qem_stderr": 0.07137140569598172 - }, - "custom|aimo_kaggle_hard_pot:_average|0": { - "qem": 0.2866666666666667, - "qem_stderr": 0.05494364708524335 - }, - "all": { - "qem": 0.2866666666666667, - "qem_stderr": 0.05494364708524335 - } - }, - "versions": { - "custom|aimo_kaggle_hard_pot:v0|0": 0, - "custom|aimo_kaggle_hard_pot:v1|0": 0, - "custom|aimo_kaggle_hard_pot:v2|0": 0 - }, - "config_tasks": { - "custom|aimo_kaggle_hard_pot:v0": { - "name": "aimo_kaggle_hard_pot:v0", - "prompt_function": "kaggle_hard_pot_prompt_fn_v0", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_hard_pot:v1": { - "name": "aimo_kaggle_hard_pot:v1", - "prompt_function": "kaggle_hard_pot_prompt_fn_v1", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_hard_pot:v2": { - "name": "aimo_kaggle_hard_pot:v2", - "prompt_function": "kaggle_hard_pot_prompt_fn_v2", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - } - }, - "summary_tasks": { - "custom|aimo_kaggle_hard_pot:v0|0": { - "hashes": { - "hash_examples": "303213a38d9f7512", - "hash_full_prompts": "3c670fa0e80bc301", - "hash_input_tokens": "1bd4187ad6963415", - "hash_cont_tokens": "713e9ea3fd7a21cd" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 37, - "non_padded": 13, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_hard_pot:v1|0": { - "hashes": { - "hash_examples": "e4234b97ad92862f", - "hash_full_prompts": "09e7759a96f64e59", - "hash_input_tokens": "dc7f68cf14be3d61", - "hash_cont_tokens": "a24d886da74b19a1" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 32, - "non_padded": 18, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_hard_pot:v2|0": { - "hashes": { - "hash_examples": "6396eb8833e13ba0", - "hash_full_prompts": "65f8b95ea99c7087", - "hash_input_tokens": "bddb67459e7601c5", - "hash_cont_tokens": "81abaeff7a78b548" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 30, - "non_padded": 20, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "648c9a107d279e1e", - "hash_full_prompts": "7f4c38eb08b3bb41", - "hash_input_tokens": "16f0fd76e0a2a165", - "hash_cont_tokens": "417fec7e4d72917f" - }, - "truncated": 150, - "non_truncated": 0, - "padded": 99, - "non_padded": 51, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.12/aimo_kaggle_medium_pot/results_2024-05-27T22-54-41.166514.json b/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.12/aimo_kaggle_medium_pot/results_2024-05-27T22-54-41.166514.json deleted file mode 100644 index 3edac5a99a11c92d75aa0fff1c4079068da62832..0000000000000000000000000000000000000000 --- a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.12/aimo_kaggle_medium_pot/results_2024-05-27T22-54-41.166514.json +++ /dev/null @@ -1,193 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": 4, - "max_samples": null, - "job_id": "", - "start_time": 211814.762291854, - "end_time": 212192.853241965, - "total_evaluation_time_secondes": "378.09095011101454", - "model_name": "AI-MO/deepseek-math-7b-rl-sft", - "model_sha": "651246d76fe6c8aedf46454ea474eca76136ab73", - "model_dtype": "torch.bfloat16", - "model_size": "12.93 GB", - "config": null - }, - "results": { - "custom|aimo_kaggle_medium_pot:v0|0": { - "qem": 0.1, - "qem_stderr": 0.04803844614152612 - }, - "custom|aimo_kaggle_medium_pot:v1|0": { - "qem": 0.125, - "qem_stderr": 0.05295740910852021 - }, - "custom|aimo_kaggle_medium_pot:v2|0": { - "qem": 0.1, - "qem_stderr": 0.04803844614152612 - }, - "custom|aimo_kaggle_medium_pot:_average|0": { - "qem": 0.10833333333333334, - "qem_stderr": 0.04967810046385749 - }, - "all": { - "qem": 0.10833333333333334, - "qem_stderr": 0.04967810046385749 - } - }, - "versions": { - "custom|aimo_kaggle_medium_pot:v0|0": 0, - "custom|aimo_kaggle_medium_pot:v1|0": 0, - "custom|aimo_kaggle_medium_pot:v2|0": 0 - }, - "config_tasks": { - "custom|aimo_kaggle_medium_pot:v0": { - "name": "aimo_kaggle_medium_pot:v0", - "prompt_function": "kaggle_medium_pot_prompt_fn_v0", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_medium_pot:v1": { - "name": "aimo_kaggle_medium_pot:v1", - "prompt_function": "kaggle_medium_pot_prompt_fn_v1", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_medium_pot:v2": { - "name": "aimo_kaggle_medium_pot:v2", - "prompt_function": "kaggle_medium_pot_prompt_fn_v2", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - } - }, - "summary_tasks": { - "custom|aimo_kaggle_medium_pot:v0|0": { - "hashes": { - "hash_examples": "2799c24461029dc3", - "hash_full_prompts": "de6572b914d5bba6", - "hash_input_tokens": "5af82ee284ccce29", - "hash_cont_tokens": "dfc8b121255332df" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 29, - "non_padded": 11, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_medium_pot:v1|0": { - "hashes": { - "hash_examples": "806b2e2056b41f84", - "hash_full_prompts": "427d5de6e4d90df2", - "hash_input_tokens": "be3dcf9f8a2350d0", - "hash_cont_tokens": "5530d0bf94ec3456" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 26, - "non_padded": 14, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_medium_pot:v2|0": { - "hashes": { - "hash_examples": "d8534375acc5d427", - "hash_full_prompts": "6bb90c129d6d6123", - "hash_input_tokens": "22eeaedbeb3f56f0", - "hash_cont_tokens": "e03d999936114bfc" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 31, - "non_padded": 9, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "623505a45a4910c2", - "hash_full_prompts": "3aca0fad61a42921", - "hash_input_tokens": "c7d2c8bb76f8ecb1", - "hash_cont_tokens": "45672d8b1cdafda4" - }, - "truncated": 120, - "non_truncated": 0, - "padded": 86, - "non_padded": 34, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.13/aimo_kaggle_hard_pot/results_2024-05-27T23-01-44.687213.json b/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.13/aimo_kaggle_hard_pot/results_2024-05-27T23-01-44.687213.json deleted file mode 100644 index 077875b9e4fdafc9125eebb140f2983c6477339a..0000000000000000000000000000000000000000 --- a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.13/aimo_kaggle_hard_pot/results_2024-05-27T23-01-44.687213.json +++ /dev/null @@ -1,193 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": 4, - "max_samples": null, - "job_id": "", - "start_time": 308248.14863674, - "end_time": 308988.850497727, - "total_evaluation_time_secondes": "740.7018609869992", - "model_name": "AI-MO/deepseek-math-7b-rl-sft", - "model_sha": "2e9b66be4cc3f849b088081dd00196b761169277", - "model_dtype": "torch.bfloat16", - "model_size": "12.93 GB", - "config": null - }, - "results": { - "custom|aimo_kaggle_hard_pot:v0|0": { - "qem": 0.02, - "qem_stderr": 0.02 - }, - "custom|aimo_kaggle_hard_pot:v1|0": { - "qem": 0.22, - "qem_stderr": 0.05917804336345138 - }, - "custom|aimo_kaggle_hard_pot:v2|0": { - "qem": 0.52, - "qem_stderr": 0.07137140569598172 - }, - "custom|aimo_kaggle_hard_pot:_average|0": { - "qem": 0.25333333333333335, - "qem_stderr": 0.0501831496864777 - }, - "all": { - "qem": 0.25333333333333335, - "qem_stderr": 0.0501831496864777 - } - }, - "versions": { - "custom|aimo_kaggle_hard_pot:v0|0": 0, - "custom|aimo_kaggle_hard_pot:v1|0": 0, - "custom|aimo_kaggle_hard_pot:v2|0": 0 - }, - "config_tasks": { - "custom|aimo_kaggle_hard_pot:v0": { - "name": "aimo_kaggle_hard_pot:v0", - "prompt_function": "kaggle_hard_pot_prompt_fn_v0", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_hard_pot:v1": { - "name": "aimo_kaggle_hard_pot:v1", - "prompt_function": "kaggle_hard_pot_prompt_fn_v1", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_hard_pot:v2": { - "name": "aimo_kaggle_hard_pot:v2", - "prompt_function": "kaggle_hard_pot_prompt_fn_v2", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - } - }, - "summary_tasks": { - "custom|aimo_kaggle_hard_pot:v0|0": { - "hashes": { - "hash_examples": "303213a38d9f7512", - "hash_full_prompts": "3c670fa0e80bc301", - "hash_input_tokens": "1bd4187ad6963415", - "hash_cont_tokens": "7e4010daf0d6af1e" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 37, - "non_padded": 13, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_hard_pot:v1|0": { - "hashes": { - "hash_examples": "e4234b97ad92862f", - "hash_full_prompts": "09e7759a96f64e59", - "hash_input_tokens": "dc7f68cf14be3d61", - "hash_cont_tokens": "92900adecd48bcf6" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 32, - "non_padded": 18, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_hard_pot:v2|0": { - "hashes": { - "hash_examples": "6396eb8833e13ba0", - "hash_full_prompts": "65f8b95ea99c7087", - "hash_input_tokens": "bddb67459e7601c5", - "hash_cont_tokens": "5110e8c8f508f46a" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 30, - "non_padded": 20, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "648c9a107d279e1e", - "hash_full_prompts": "7f4c38eb08b3bb41", - "hash_input_tokens": "16f0fd76e0a2a165", - "hash_cont_tokens": "f7bdebf0bad6a1d0" - }, - "truncated": 150, - "non_truncated": 0, - "padded": 99, - "non_padded": 51, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.13/aimo_kaggle_medium_pot/results_2024-05-27T22-56-25.627286.json b/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.13/aimo_kaggle_medium_pot/results_2024-05-27T22-56-25.627286.json deleted file mode 100644 index e05f918e21f2cfa0854a7d89af3588f4db05ef84..0000000000000000000000000000000000000000 --- a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.13/aimo_kaggle_medium_pot/results_2024-05-27T22-56-25.627286.json +++ /dev/null @@ -1,193 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": 4, - "max_samples": null, - "job_id": "", - "start_time": 369444.480286805, - "end_time": 369867.494755725, - "total_evaluation_time_secondes": "423.0144689200097", - "model_name": "AI-MO/deepseek-math-7b-rl-sft", - "model_sha": "2e9b66be4cc3f849b088081dd00196b761169277", - "model_dtype": "torch.bfloat16", - "model_size": "12.93 GB", - "config": null - }, - "results": { - "custom|aimo_kaggle_medium_pot:v0|0": { - "qem": 0.025, - "qem_stderr": 0.024999999999999994 - }, - "custom|aimo_kaggle_medium_pot:v1|0": { - "qem": 0.125, - "qem_stderr": 0.05295740910852021 - }, - "custom|aimo_kaggle_medium_pot:v2|0": { - "qem": 0.15, - "qem_stderr": 0.05717718748968655 - }, - "custom|aimo_kaggle_medium_pot:_average|0": { - "qem": 0.09999999999999999, - "qem_stderr": 0.04504486553273559 - }, - "all": { - "qem": 0.09999999999999999, - "qem_stderr": 0.04504486553273559 - } - }, - "versions": { - "custom|aimo_kaggle_medium_pot:v0|0": 0, - "custom|aimo_kaggle_medium_pot:v1|0": 0, - "custom|aimo_kaggle_medium_pot:v2|0": 0 - }, - "config_tasks": { - "custom|aimo_kaggle_medium_pot:v0": { - "name": "aimo_kaggle_medium_pot:v0", - "prompt_function": "kaggle_medium_pot_prompt_fn_v0", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_medium_pot:v1": { - "name": "aimo_kaggle_medium_pot:v1", - "prompt_function": "kaggle_medium_pot_prompt_fn_v1", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_medium_pot:v2": { - "name": "aimo_kaggle_medium_pot:v2", - "prompt_function": "kaggle_medium_pot_prompt_fn_v2", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - } - }, - "summary_tasks": { - "custom|aimo_kaggle_medium_pot:v0|0": { - "hashes": { - "hash_examples": "2799c24461029dc3", - "hash_full_prompts": "de6572b914d5bba6", - "hash_input_tokens": "5af82ee284ccce29", - "hash_cont_tokens": "b2e62e7bd35a9461" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 29, - "non_padded": 11, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_medium_pot:v1|0": { - "hashes": { - "hash_examples": "806b2e2056b41f84", - "hash_full_prompts": "427d5de6e4d90df2", - "hash_input_tokens": "be3dcf9f8a2350d0", - "hash_cont_tokens": "3361b91815fed232" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 26, - "non_padded": 14, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_medium_pot:v2|0": { - "hashes": { - "hash_examples": "d8534375acc5d427", - "hash_full_prompts": "6bb90c129d6d6123", - "hash_input_tokens": "22eeaedbeb3f56f0", - "hash_cont_tokens": "4b47324fc9a1ac0f" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 31, - "non_padded": 9, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "623505a45a4910c2", - "hash_full_prompts": "3aca0fad61a42921", - "hash_input_tokens": "c7d2c8bb76f8ecb1", - "hash_cont_tokens": "67498af105fab7c9" - }, - "truncated": 120, - "non_truncated": 0, - "padded": 86, - "non_padded": 34, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.14/aimo_kaggle_hard_pot/results_2024-05-27T23-09-54.472438.json b/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.14/aimo_kaggle_hard_pot/results_2024-05-27T23-09-54.472438.json deleted file mode 100644 index 9c06fb1fe10b20bea947b76dd382d56095589145..0000000000000000000000000000000000000000 --- a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.14/aimo_kaggle_hard_pot/results_2024-05-27T23-09-54.472438.json +++ /dev/null @@ -1,193 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": 4, - "max_samples": null, - "job_id": "", - "start_time": 369935.952147088, - "end_time": 370676.339592748, - "total_evaluation_time_secondes": "740.3874456599588", - "model_name": "AI-MO/deepseek-math-7b-rl-sft", - "model_sha": "139b2c72994e2c58df6e13f9447687e11abf837e", - "model_dtype": "torch.bfloat16", - "model_size": "12.93 GB", - "config": null - }, - "results": { - "custom|aimo_kaggle_hard_pot:v0|0": { - "qem": 0.04, - "qem_stderr": 0.02799416848895062 - }, - "custom|aimo_kaggle_hard_pot:v1|0": { - "qem": 0.3, - "qem_stderr": 0.06546536707079771 - }, - "custom|aimo_kaggle_hard_pot:v2|0": { - "qem": 0.42, - "qem_stderr": 0.07050835816716035 - }, - "custom|aimo_kaggle_hard_pot:_average|0": { - "qem": 0.25333333333333335, - "qem_stderr": 0.05465596457563623 - }, - "all": { - "qem": 0.25333333333333335, - "qem_stderr": 0.05465596457563623 - } - }, - "versions": { - "custom|aimo_kaggle_hard_pot:v0|0": 0, - "custom|aimo_kaggle_hard_pot:v1|0": 0, - "custom|aimo_kaggle_hard_pot:v2|0": 0 - }, - "config_tasks": { - "custom|aimo_kaggle_hard_pot:v0": { - "name": "aimo_kaggle_hard_pot:v0", - "prompt_function": "kaggle_hard_pot_prompt_fn_v0", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_hard_pot:v1": { - "name": "aimo_kaggle_hard_pot:v1", - "prompt_function": "kaggle_hard_pot_prompt_fn_v1", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_hard_pot:v2": { - "name": "aimo_kaggle_hard_pot:v2", - "prompt_function": "kaggle_hard_pot_prompt_fn_v2", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - } - }, - "summary_tasks": { - "custom|aimo_kaggle_hard_pot:v0|0": { - "hashes": { - "hash_examples": "303213a38d9f7512", - "hash_full_prompts": "3c670fa0e80bc301", - "hash_input_tokens": "1bd4187ad6963415", - "hash_cont_tokens": "5f278aeb63a41729" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 37, - "non_padded": 13, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_hard_pot:v1|0": { - "hashes": { - "hash_examples": "e4234b97ad92862f", - "hash_full_prompts": "09e7759a96f64e59", - "hash_input_tokens": "dc7f68cf14be3d61", - "hash_cont_tokens": "c885b5a4725f0230" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 32, - "non_padded": 18, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_hard_pot:v2|0": { - "hashes": { - "hash_examples": "6396eb8833e13ba0", - "hash_full_prompts": "65f8b95ea99c7087", - "hash_input_tokens": "bddb67459e7601c5", - "hash_cont_tokens": "78caebbf757f8038" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 30, - "non_padded": 20, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "648c9a107d279e1e", - "hash_full_prompts": "7f4c38eb08b3bb41", - "hash_input_tokens": "16f0fd76e0a2a165", - "hash_cont_tokens": "7b4ed7724f76adc1" - }, - "truncated": 150, - "non_truncated": 0, - "padded": 99, - "non_padded": 51, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.14/aimo_kaggle_medium_pot/results_2024-05-27T23-04-49.392793.json b/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.14/aimo_kaggle_medium_pot/results_2024-05-27T23-04-49.392793.json deleted file mode 100644 index 01c7f8fa3272e9a4537a211202203502329eae5d..0000000000000000000000000000000000000000 --- a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.14/aimo_kaggle_medium_pot/results_2024-05-27T23-04-49.392793.json +++ /dev/null @@ -1,193 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": 4, - "max_samples": null, - "job_id": "", - "start_time": 18866.508172838, - "end_time": 19301.643280139, - "total_evaluation_time_secondes": "435.13510730100097", - "model_name": "AI-MO/deepseek-math-7b-rl-sft", - "model_sha": "139b2c72994e2c58df6e13f9447687e11abf837e", - "model_dtype": "torch.bfloat16", - "model_size": "12.93 GB", - "config": null - }, - "results": { - "custom|aimo_kaggle_medium_pot:v0|0": { - "qem": 0.125, - "qem_stderr": 0.05295740910852021 - }, - "custom|aimo_kaggle_medium_pot:v1|0": { - "qem": 0.25, - "qem_stderr": 0.06933752452815363 - }, - "custom|aimo_kaggle_medium_pot:v2|0": { - "qem": 0.125, - "qem_stderr": 0.05295740910852021 - }, - "custom|aimo_kaggle_medium_pot:_average|0": { - "qem": 0.16666666666666666, - "qem_stderr": 0.05841744758173135 - }, - "all": { - "qem": 0.16666666666666666, - "qem_stderr": 0.05841744758173135 - } - }, - "versions": { - "custom|aimo_kaggle_medium_pot:v0|0": 0, - "custom|aimo_kaggle_medium_pot:v1|0": 0, - "custom|aimo_kaggle_medium_pot:v2|0": 0 - }, - "config_tasks": { - "custom|aimo_kaggle_medium_pot:v0": { - "name": "aimo_kaggle_medium_pot:v0", - "prompt_function": "kaggle_medium_pot_prompt_fn_v0", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_medium_pot:v1": { - "name": "aimo_kaggle_medium_pot:v1", - "prompt_function": "kaggle_medium_pot_prompt_fn_v1", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_medium_pot:v2": { - "name": "aimo_kaggle_medium_pot:v2", - "prompt_function": "kaggle_medium_pot_prompt_fn_v2", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - } - }, - "summary_tasks": { - "custom|aimo_kaggle_medium_pot:v0|0": { - "hashes": { - "hash_examples": "2799c24461029dc3", - "hash_full_prompts": "de6572b914d5bba6", - "hash_input_tokens": "5af82ee284ccce29", - "hash_cont_tokens": "e110252bb4f0f166" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 29, - "non_padded": 11, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_medium_pot:v1|0": { - "hashes": { - "hash_examples": "806b2e2056b41f84", - "hash_full_prompts": "427d5de6e4d90df2", - "hash_input_tokens": "be3dcf9f8a2350d0", - "hash_cont_tokens": "74f75f395cdeb629" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 26, - "non_padded": 14, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_medium_pot:v2|0": { - "hashes": { - "hash_examples": "d8534375acc5d427", - "hash_full_prompts": "6bb90c129d6d6123", - "hash_input_tokens": "22eeaedbeb3f56f0", - "hash_cont_tokens": "8f5d314d4d0ad642" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 31, - "non_padded": 9, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "623505a45a4910c2", - "hash_full_prompts": "3aca0fad61a42921", - "hash_input_tokens": "c7d2c8bb76f8ecb1", - "hash_cont_tokens": "315b381da6eee2ad" - }, - "truncated": 120, - "non_truncated": 0, - "padded": 86, - "non_padded": 34, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.15/aimo_kaggle_hard_pot/results_2024-05-27T23-14-16.360495.json b/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.15/aimo_kaggle_hard_pot/results_2024-05-27T23-14-16.360495.json deleted file mode 100644 index dc2ba9e069f9e0e8065567b9801b0d7d3792c1f5..0000000000000000000000000000000000000000 --- a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.15/aimo_kaggle_hard_pot/results_2024-05-27T23-14-16.360495.json +++ /dev/null @@ -1,193 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": 4, - "max_samples": null, - "job_id": "", - "start_time": 15584.258641126, - "end_time": 16337.304496648, - "total_evaluation_time_secondes": "753.0458555219993", - "model_name": "AI-MO/deepseek-math-7b-rl-sft", - "model_sha": "762cda3871e521fda04af63ebddd7d08def292ab", - "model_dtype": "torch.bfloat16", - "model_size": "12.93 GB", - "config": null - }, - "results": { - "custom|aimo_kaggle_hard_pot:v0|0": { - "qem": 0.04, - "qem_stderr": 0.027994168488950606 - }, - "custom|aimo_kaggle_hard_pot:v1|0": { - "qem": 0.44, - "qem_stderr": 0.07091242083423345 - }, - "custom|aimo_kaggle_hard_pot:v2|0": { - "qem": 0.5, - "qem_stderr": 0.07142857142857142 - }, - "custom|aimo_kaggle_hard_pot:_average|0": { - "qem": 0.32666666666666666, - "qem_stderr": 0.05677838691725182 - }, - "all": { - "qem": 0.32666666666666666, - "qem_stderr": 0.05677838691725182 - } - }, - "versions": { - "custom|aimo_kaggle_hard_pot:v0|0": 0, - "custom|aimo_kaggle_hard_pot:v1|0": 0, - "custom|aimo_kaggle_hard_pot:v2|0": 0 - }, - "config_tasks": { - "custom|aimo_kaggle_hard_pot:v0": { - "name": "aimo_kaggle_hard_pot:v0", - "prompt_function": "kaggle_hard_pot_prompt_fn_v0", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_hard_pot:v1": { - "name": "aimo_kaggle_hard_pot:v1", - "prompt_function": "kaggle_hard_pot_prompt_fn_v1", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_hard_pot:v2": { - "name": "aimo_kaggle_hard_pot:v2", - "prompt_function": "kaggle_hard_pot_prompt_fn_v2", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - } - }, - "summary_tasks": { - "custom|aimo_kaggle_hard_pot:v0|0": { - "hashes": { - "hash_examples": "303213a38d9f7512", - "hash_full_prompts": "3c670fa0e80bc301", - "hash_input_tokens": "1bd4187ad6963415", - "hash_cont_tokens": "103194d79a59a6ea" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 37, - "non_padded": 13, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_hard_pot:v1|0": { - "hashes": { - "hash_examples": "e4234b97ad92862f", - "hash_full_prompts": "09e7759a96f64e59", - "hash_input_tokens": "dc7f68cf14be3d61", - "hash_cont_tokens": "8ec1edadf4f84280" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 32, - "non_padded": 18, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_hard_pot:v2|0": { - "hashes": { - "hash_examples": "6396eb8833e13ba0", - "hash_full_prompts": "65f8b95ea99c7087", - "hash_input_tokens": "bddb67459e7601c5", - "hash_cont_tokens": "38d26a079a6d3ed0" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 30, - "non_padded": 20, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "648c9a107d279e1e", - "hash_full_prompts": "7f4c38eb08b3bb41", - "hash_input_tokens": "16f0fd76e0a2a165", - "hash_cont_tokens": "f5a6b8900de50867" - }, - "truncated": 150, - "non_truncated": 0, - "padded": 99, - "non_padded": 51, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.15/aimo_kaggle_medium_pot/results_2024-05-27T23-09-30.253918.json b/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.15/aimo_kaggle_medium_pot/results_2024-05-27T23-09-30.253918.json deleted file mode 100644 index 797ccc6f8b03e1d505665239001788d36df28011..0000000000000000000000000000000000000000 --- a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.15/aimo_kaggle_medium_pot/results_2024-05-27T23-09-30.253918.json +++ /dev/null @@ -1,193 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": 4, - "max_samples": null, - "job_id": "", - "start_time": 9261276.777393205, - "end_time": 9261687.95664092, - "total_evaluation_time_secondes": "411.1792477145791", - "model_name": "AI-MO/deepseek-math-7b-rl-sft", - "model_sha": "762cda3871e521fda04af63ebddd7d08def292ab", - "model_dtype": "torch.bfloat16", - "model_size": "12.93 GB", - "config": null - }, - "results": { - "custom|aimo_kaggle_medium_pot:v0|0": { - "qem": 0.15, - "qem_stderr": 0.05717718748968655 - }, - "custom|aimo_kaggle_medium_pot:v1|0": { - "qem": 0.15, - "qem_stderr": 0.05717718748968655 - }, - "custom|aimo_kaggle_medium_pot:v2|0": { - "qem": 0.15, - "qem_stderr": 0.05717718748968655 - }, - "custom|aimo_kaggle_medium_pot:_average|0": { - "qem": 0.15, - "qem_stderr": 0.05717718748968655 - }, - "all": { - "qem": 0.15, - "qem_stderr": 0.05717718748968655 - } - }, - "versions": { - "custom|aimo_kaggle_medium_pot:v0|0": 0, - "custom|aimo_kaggle_medium_pot:v1|0": 0, - "custom|aimo_kaggle_medium_pot:v2|0": 0 - }, - "config_tasks": { - "custom|aimo_kaggle_medium_pot:v0": { - "name": "aimo_kaggle_medium_pot:v0", - "prompt_function": "kaggle_medium_pot_prompt_fn_v0", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_medium_pot:v1": { - "name": "aimo_kaggle_medium_pot:v1", - "prompt_function": "kaggle_medium_pot_prompt_fn_v1", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_medium_pot:v2": { - "name": "aimo_kaggle_medium_pot:v2", - "prompt_function": "kaggle_medium_pot_prompt_fn_v2", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - } - }, - "summary_tasks": { - "custom|aimo_kaggle_medium_pot:v0|0": { - "hashes": { - "hash_examples": "2799c24461029dc3", - "hash_full_prompts": "de6572b914d5bba6", - "hash_input_tokens": "5af82ee284ccce29", - "hash_cont_tokens": "3f6c2437ec2778a5" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 29, - "non_padded": 11, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_medium_pot:v1|0": { - "hashes": { - "hash_examples": "806b2e2056b41f84", - "hash_full_prompts": "427d5de6e4d90df2", - "hash_input_tokens": "be3dcf9f8a2350d0", - "hash_cont_tokens": "fba2ffbf05103564" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 26, - "non_padded": 14, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_medium_pot:v2|0": { - "hashes": { - "hash_examples": "d8534375acc5d427", - "hash_full_prompts": "6bb90c129d6d6123", - "hash_input_tokens": "22eeaedbeb3f56f0", - "hash_cont_tokens": "9eeccaa0dcd9b13c" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 31, - "non_padded": 9, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "623505a45a4910c2", - "hash_full_prompts": "3aca0fad61a42921", - "hash_input_tokens": "c7d2c8bb76f8ecb1", - "hash_cont_tokens": "71e5e9592ec6ff41" - }, - "truncated": 120, - "non_truncated": 0, - "padded": 86, - "non_padded": 34, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.16/aimo_kaggle_hard_pot/results_2024-05-27T23-08-23.297451.json b/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.16/aimo_kaggle_hard_pot/results_2024-05-27T23-08-23.297451.json deleted file mode 100644 index 70919c33e4d40fe87e20ad5a5cc1092b5fc29a2d..0000000000000000000000000000000000000000 --- a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.16/aimo_kaggle_hard_pot/results_2024-05-27T23-08-23.297451.json +++ /dev/null @@ -1,193 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": 4, - "max_samples": null, - "job_id": "", - "start_time": 2381270.403329353, - "end_time": 2382052.191627778, - "total_evaluation_time_secondes": "781.7882984252647", - "model_name": "AI-MO/deepseek-math-7b-rl-sft", - "model_sha": "e781ef6d4bd27617d0350151228b758329b25906", - "model_dtype": "torch.bfloat16", - "model_size": "12.93 GB", - "config": null - }, - "results": { - "custom|aimo_kaggle_hard_pot:v0|0": { - "qem": 0.06, - "qem_stderr": 0.0339266916772512 - }, - "custom|aimo_kaggle_hard_pot:v1|0": { - "qem": 0.42, - "qem_stderr": 0.07050835816716035 - }, - "custom|aimo_kaggle_hard_pot:v2|0": { - "qem": 0.52, - "qem_stderr": 0.07137140569598172 - }, - "custom|aimo_kaggle_hard_pot:_average|0": { - "qem": 0.3333333333333333, - "qem_stderr": 0.058602151846797755 - }, - "all": { - "qem": 0.3333333333333333, - "qem_stderr": 0.058602151846797755 - } - }, - "versions": { - "custom|aimo_kaggle_hard_pot:v0|0": 0, - "custom|aimo_kaggle_hard_pot:v1|0": 0, - "custom|aimo_kaggle_hard_pot:v2|0": 0 - }, - "config_tasks": { - "custom|aimo_kaggle_hard_pot:v0": { - "name": "aimo_kaggle_hard_pot:v0", - "prompt_function": "kaggle_hard_pot_prompt_fn_v0", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_hard_pot:v1": { - "name": "aimo_kaggle_hard_pot:v1", - "prompt_function": "kaggle_hard_pot_prompt_fn_v1", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_hard_pot:v2": { - "name": "aimo_kaggle_hard_pot:v2", - "prompt_function": "kaggle_hard_pot_prompt_fn_v2", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - } - }, - "summary_tasks": { - "custom|aimo_kaggle_hard_pot:v0|0": { - "hashes": { - "hash_examples": "303213a38d9f7512", - "hash_full_prompts": "3c670fa0e80bc301", - "hash_input_tokens": "1bd4187ad6963415", - "hash_cont_tokens": "675d02f16cc029d4" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 37, - "non_padded": 13, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_hard_pot:v1|0": { - "hashes": { - "hash_examples": "e4234b97ad92862f", - "hash_full_prompts": "09e7759a96f64e59", - "hash_input_tokens": "dc7f68cf14be3d61", - "hash_cont_tokens": "f77e246088727761" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 32, - "non_padded": 18, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_hard_pot:v2|0": { - "hashes": { - "hash_examples": "6396eb8833e13ba0", - "hash_full_prompts": "65f8b95ea99c7087", - "hash_input_tokens": "bddb67459e7601c5", - "hash_cont_tokens": "6a2d3c7247898027" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 30, - "non_padded": 20, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "648c9a107d279e1e", - "hash_full_prompts": "7f4c38eb08b3bb41", - "hash_input_tokens": "16f0fd76e0a2a165", - "hash_cont_tokens": "725726e95a8acbd3" - }, - "truncated": 150, - "non_truncated": 0, - "padded": 99, - "non_padded": 51, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.16/aimo_kaggle_medium_pot/results_2024-05-27T23-02-25.540852.json b/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.16/aimo_kaggle_medium_pot/results_2024-05-27T23-02-25.540852.json deleted file mode 100644 index 0596202d9e619e303e14ea6322d9eaa80ac1e1a5..0000000000000000000000000000000000000000 --- a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.16/aimo_kaggle_medium_pot/results_2024-05-27T23-02-25.540852.json +++ /dev/null @@ -1,193 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": 4, - "max_samples": null, - "job_id": "", - "start_time": 212245.513578786, - "end_time": 212657.227623858, - "total_evaluation_time_secondes": "411.7140450720035", - "model_name": "AI-MO/deepseek-math-7b-rl-sft", - "model_sha": "e781ef6d4bd27617d0350151228b758329b25906", - "model_dtype": "torch.bfloat16", - "model_size": "12.93 GB", - "config": null - }, - "results": { - "custom|aimo_kaggle_medium_pot:v0|0": { - "qem": 0.175, - "qem_stderr": 0.06084343084444758 - }, - "custom|aimo_kaggle_medium_pot:v1|0": { - "qem": 0.225, - "qem_stderr": 0.06686668711812965 - }, - "custom|aimo_kaggle_medium_pot:v2|0": { - "qem": 0.1, - "qem_stderr": 0.04803844614152613 - }, - "custom|aimo_kaggle_medium_pot:_average|0": { - "qem": 0.16666666666666666, - "qem_stderr": 0.05858285470136779 - }, - "all": { - "qem": 0.16666666666666666, - "qem_stderr": 0.05858285470136779 - } - }, - "versions": { - "custom|aimo_kaggle_medium_pot:v0|0": 0, - "custom|aimo_kaggle_medium_pot:v1|0": 0, - "custom|aimo_kaggle_medium_pot:v2|0": 0 - }, - "config_tasks": { - "custom|aimo_kaggle_medium_pot:v0": { - "name": "aimo_kaggle_medium_pot:v0", - "prompt_function": "kaggle_medium_pot_prompt_fn_v0", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_medium_pot:v1": { - "name": "aimo_kaggle_medium_pot:v1", - "prompt_function": "kaggle_medium_pot_prompt_fn_v1", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_medium_pot:v2": { - "name": "aimo_kaggle_medium_pot:v2", - "prompt_function": "kaggle_medium_pot_prompt_fn_v2", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - } - }, - "summary_tasks": { - "custom|aimo_kaggle_medium_pot:v0|0": { - "hashes": { - "hash_examples": "2799c24461029dc3", - "hash_full_prompts": "de6572b914d5bba6", - "hash_input_tokens": "5af82ee284ccce29", - "hash_cont_tokens": "d4d86d4386fce35a" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 29, - "non_padded": 11, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_medium_pot:v1|0": { - "hashes": { - "hash_examples": "806b2e2056b41f84", - "hash_full_prompts": "427d5de6e4d90df2", - "hash_input_tokens": "be3dcf9f8a2350d0", - "hash_cont_tokens": "6b03353bd976c2eb" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 26, - "non_padded": 14, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_medium_pot:v2|0": { - "hashes": { - "hash_examples": "d8534375acc5d427", - "hash_full_prompts": "6bb90c129d6d6123", - "hash_input_tokens": "22eeaedbeb3f56f0", - "hash_cont_tokens": "2c5e3b64c6074403" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 31, - "non_padded": 9, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "623505a45a4910c2", - "hash_full_prompts": "3aca0fad61a42921", - "hash_input_tokens": "c7d2c8bb76f8ecb1", - "hash_cont_tokens": "568afee2d36ac488" - }, - "truncated": 120, - "non_truncated": 0, - "padded": 86, - "non_padded": 34, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.17/aimo_kaggle_hard_pot/results_2024-05-27T23-02-04.265926.json b/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.17/aimo_kaggle_hard_pot/results_2024-05-27T23-02-04.265926.json deleted file mode 100644 index 1e74091f6b0f15c5c9b6ad4fdb1aed7439154a2b..0000000000000000000000000000000000000000 --- a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.17/aimo_kaggle_hard_pot/results_2024-05-27T23-02-04.265926.json +++ /dev/null @@ -1,193 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": 4, - "max_samples": null, - "job_id": "", - "start_time": 41828.161369794, - "end_time": 42535.282448973, - "total_evaluation_time_secondes": "707.1210791789999", - "model_name": "AI-MO/deepseek-math-7b-rl-sft", - "model_sha": "a33b7d012bb886cc64d12e1e9c2c160888b6fa4a", - "model_dtype": "torch.bfloat16", - "model_size": "12.93 GB", - "config": null - }, - "results": { - "custom|aimo_kaggle_hard_pot:v0|0": { - "qem": 0.06, - "qem_stderr": 0.0339266916772512 - }, - "custom|aimo_kaggle_hard_pot:v1|0": { - "qem": 0.4, - "qem_stderr": 0.06998542122237653 - }, - "custom|aimo_kaggle_hard_pot:v2|0": { - "qem": 0.48, - "qem_stderr": 0.0713714056959817 - }, - "custom|aimo_kaggle_hard_pot:_average|0": { - "qem": 0.3133333333333333, - "qem_stderr": 0.05842783953186981 - }, - "all": { - "qem": 0.3133333333333333, - "qem_stderr": 0.05842783953186981 - } - }, - "versions": { - "custom|aimo_kaggle_hard_pot:v0|0": 0, - "custom|aimo_kaggle_hard_pot:v1|0": 0, - "custom|aimo_kaggle_hard_pot:v2|0": 0 - }, - "config_tasks": { - "custom|aimo_kaggle_hard_pot:v0": { - "name": "aimo_kaggle_hard_pot:v0", - "prompt_function": "kaggle_hard_pot_prompt_fn_v0", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_hard_pot:v1": { - "name": "aimo_kaggle_hard_pot:v1", - "prompt_function": "kaggle_hard_pot_prompt_fn_v1", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_hard_pot:v2": { - "name": "aimo_kaggle_hard_pot:v2", - "prompt_function": "kaggle_hard_pot_prompt_fn_v2", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - } - }, - "summary_tasks": { - "custom|aimo_kaggle_hard_pot:v0|0": { - "hashes": { - "hash_examples": "303213a38d9f7512", - "hash_full_prompts": "3c670fa0e80bc301", - "hash_input_tokens": "1bd4187ad6963415", - "hash_cont_tokens": "7a5369e061fa5896" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 37, - "non_padded": 13, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_hard_pot:v1|0": { - "hashes": { - "hash_examples": "e4234b97ad92862f", - "hash_full_prompts": "09e7759a96f64e59", - "hash_input_tokens": "dc7f68cf14be3d61", - "hash_cont_tokens": "ff00e5472cacce46" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 32, - "non_padded": 18, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_hard_pot:v2|0": { - "hashes": { - "hash_examples": "6396eb8833e13ba0", - "hash_full_prompts": "65f8b95ea99c7087", - "hash_input_tokens": "bddb67459e7601c5", - "hash_cont_tokens": "5db9f75b73ff9503" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 30, - "non_padded": 20, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "648c9a107d279e1e", - "hash_full_prompts": "7f4c38eb08b3bb41", - "hash_input_tokens": "16f0fd76e0a2a165", - "hash_cont_tokens": "de0cfbea05c70092" - }, - "truncated": 150, - "non_truncated": 0, - "padded": 99, - "non_padded": 51, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.17/aimo_kaggle_medium_pot/results_2024-05-27T22-58-13.210461.json b/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.17/aimo_kaggle_medium_pot/results_2024-05-27T22-58-13.210461.json deleted file mode 100644 index f5f7ede97844cd3b2f1196a7134a4b89c4a942a4..0000000000000000000000000000000000000000 --- a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.17/aimo_kaggle_medium_pot/results_2024-05-27T22-58-13.210461.json +++ /dev/null @@ -1,193 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": 4, - "max_samples": null, - "job_id": "", - "start_time": 14973.072152917, - "end_time": 15374.155190019, - "total_evaluation_time_secondes": "401.08303710200016", - "model_name": "AI-MO/deepseek-math-7b-rl-sft", - "model_sha": "a33b7d012bb886cc64d12e1e9c2c160888b6fa4a", - "model_dtype": "torch.bfloat16", - "model_size": "12.93 GB", - "config": null - }, - "results": { - "custom|aimo_kaggle_medium_pot:v0|0": { - "qem": 0.125, - "qem_stderr": 0.05295740910852021 - }, - "custom|aimo_kaggle_medium_pot:v1|0": { - "qem": 0.175, - "qem_stderr": 0.060843430844447585 - }, - "custom|aimo_kaggle_medium_pot:v2|0": { - "qem": 0.225, - "qem_stderr": 0.06686668711812967 - }, - "custom|aimo_kaggle_medium_pot:_average|0": { - "qem": 0.17500000000000002, - "qem_stderr": 0.060222509023699154 - }, - "all": { - "qem": 0.17500000000000002, - "qem_stderr": 0.060222509023699154 - } - }, - "versions": { - "custom|aimo_kaggle_medium_pot:v0|0": 0, - "custom|aimo_kaggle_medium_pot:v1|0": 0, - "custom|aimo_kaggle_medium_pot:v2|0": 0 - }, - "config_tasks": { - "custom|aimo_kaggle_medium_pot:v0": { - "name": "aimo_kaggle_medium_pot:v0", - "prompt_function": "kaggle_medium_pot_prompt_fn_v0", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_medium_pot:v1": { - "name": "aimo_kaggle_medium_pot:v1", - "prompt_function": "kaggle_medium_pot_prompt_fn_v1", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_medium_pot:v2": { - "name": "aimo_kaggle_medium_pot:v2", - "prompt_function": "kaggle_medium_pot_prompt_fn_v2", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - } - }, - "summary_tasks": { - "custom|aimo_kaggle_medium_pot:v0|0": { - "hashes": { - "hash_examples": "2799c24461029dc3", - "hash_full_prompts": "de6572b914d5bba6", - "hash_input_tokens": "5af82ee284ccce29", - "hash_cont_tokens": "f5b56760fc667041" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 29, - "non_padded": 11, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_medium_pot:v1|0": { - "hashes": { - "hash_examples": "806b2e2056b41f84", - "hash_full_prompts": "427d5de6e4d90df2", - "hash_input_tokens": "be3dcf9f8a2350d0", - "hash_cont_tokens": "bb14f0a2bd06c083" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 26, - "non_padded": 14, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_medium_pot:v2|0": { - "hashes": { - "hash_examples": "d8534375acc5d427", - "hash_full_prompts": "6bb90c129d6d6123", - "hash_input_tokens": "22eeaedbeb3f56f0", - "hash_cont_tokens": "d675ba2958f255f0" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 31, - "non_padded": 9, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "623505a45a4910c2", - "hash_full_prompts": "3aca0fad61a42921", - "hash_input_tokens": "c7d2c8bb76f8ecb1", - "hash_cont_tokens": "8628c1078cddd43a" - }, - "truncated": 120, - "non_truncated": 0, - "padded": 86, - "non_padded": 34, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.18/aimo_kaggle_hard_pot/results_2024-05-27T23-04-27.138051.json b/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.18/aimo_kaggle_hard_pot/results_2024-05-27T23-04-27.138051.json deleted file mode 100644 index e2a55a49aa0ec46a16970ea1c6f57a142012d54c..0000000000000000000000000000000000000000 --- a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.18/aimo_kaggle_hard_pot/results_2024-05-27T23-04-27.138051.json +++ /dev/null @@ -1,193 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": 4, - "max_samples": null, - "job_id": "", - "start_time": 3182937.071528263, - "end_time": 3183617.712269632, - "total_evaluation_time_secondes": "680.6407413692214", - "model_name": "AI-MO/deepseek-math-7b-rl-sft", - "model_sha": "f4852b3b50c1af550e4dd9e22dca71730288bdeb", - "model_dtype": "torch.bfloat16", - "model_size": "12.93 GB", - "config": null - }, - "results": { - "custom|aimo_kaggle_hard_pot:v0|0": { - "qem": 0.02, - "qem_stderr": 0.02 - }, - "custom|aimo_kaggle_hard_pot:v1|0": { - "qem": 0.18, - "qem_stderr": 0.054883922035138706 - }, - "custom|aimo_kaggle_hard_pot:v2|0": { - "qem": 0.56, - "qem_stderr": 0.07091242083423345 - }, - "custom|aimo_kaggle_hard_pot:_average|0": { - "qem": 0.25333333333333335, - "qem_stderr": 0.04859878095645739 - }, - "all": { - "qem": 0.25333333333333335, - "qem_stderr": 0.04859878095645739 - } - }, - "versions": { - "custom|aimo_kaggle_hard_pot:v0|0": 0, - "custom|aimo_kaggle_hard_pot:v1|0": 0, - "custom|aimo_kaggle_hard_pot:v2|0": 0 - }, - "config_tasks": { - "custom|aimo_kaggle_hard_pot:v0": { - "name": "aimo_kaggle_hard_pot:v0", - "prompt_function": "kaggle_hard_pot_prompt_fn_v0", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_hard_pot:v1": { - "name": "aimo_kaggle_hard_pot:v1", - "prompt_function": "kaggle_hard_pot_prompt_fn_v1", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_hard_pot:v2": { - "name": "aimo_kaggle_hard_pot:v2", - "prompt_function": "kaggle_hard_pot_prompt_fn_v2", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - } - }, - "summary_tasks": { - "custom|aimo_kaggle_hard_pot:v0|0": { - "hashes": { - "hash_examples": "303213a38d9f7512", - "hash_full_prompts": "3c670fa0e80bc301", - "hash_input_tokens": "1bd4187ad6963415", - "hash_cont_tokens": "bf8dc832d93af4db" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 37, - "non_padded": 13, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_hard_pot:v1|0": { - "hashes": { - "hash_examples": "e4234b97ad92862f", - "hash_full_prompts": "09e7759a96f64e59", - "hash_input_tokens": "dc7f68cf14be3d61", - "hash_cont_tokens": "eb69798598aa592a" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 32, - "non_padded": 18, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_hard_pot:v2|0": { - "hashes": { - "hash_examples": "6396eb8833e13ba0", - "hash_full_prompts": "65f8b95ea99c7087", - "hash_input_tokens": "bddb67459e7601c5", - "hash_cont_tokens": "c1f3bb59d3cc42af" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 30, - "non_padded": 20, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "648c9a107d279e1e", - "hash_full_prompts": "7f4c38eb08b3bb41", - "hash_input_tokens": "16f0fd76e0a2a165", - "hash_cont_tokens": "7525d4b4b8373fd6" - }, - "truncated": 150, - "non_truncated": 0, - "padded": 99, - "non_padded": 51, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.18/aimo_kaggle_medium_pot/results_2024-05-27T23-00-06.091921.json b/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.18/aimo_kaggle_medium_pot/results_2024-05-27T23-00-06.091921.json deleted file mode 100644 index 3674310d227b0c9fca6554acabb25523f3dc3be8..0000000000000000000000000000000000000000 --- a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.18/aimo_kaggle_medium_pot/results_2024-05-27T23-00-06.091921.json +++ /dev/null @@ -1,193 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": 4, - "max_samples": null, - "job_id": "", - "start_time": 11491.10456842, - "end_time": 11887.104208122, - "total_evaluation_time_secondes": "395.9996397020004", - "model_name": "AI-MO/deepseek-math-7b-rl-sft", - "model_sha": "f4852b3b50c1af550e4dd9e22dca71730288bdeb", - "model_dtype": "torch.bfloat16", - "model_size": "12.93 GB", - "config": null - }, - "results": { - "custom|aimo_kaggle_medium_pot:v0|0": { - "qem": 0.025, - "qem_stderr": 0.024999999999999994 - }, - "custom|aimo_kaggle_medium_pot:v1|0": { - "qem": 0.125, - "qem_stderr": 0.05295740910852021 - }, - "custom|aimo_kaggle_medium_pot:v2|0": { - "qem": 0.05, - "qem_stderr": 0.03489912202260563 - }, - "custom|aimo_kaggle_medium_pot:_average|0": { - "qem": 0.06666666666666667, - "qem_stderr": 0.03761884371037528 - }, - "all": { - "qem": 0.06666666666666667, - "qem_stderr": 0.03761884371037528 - } - }, - "versions": { - "custom|aimo_kaggle_medium_pot:v0|0": 0, - "custom|aimo_kaggle_medium_pot:v1|0": 0, - "custom|aimo_kaggle_medium_pot:v2|0": 0 - }, - "config_tasks": { - "custom|aimo_kaggle_medium_pot:v0": { - "name": "aimo_kaggle_medium_pot:v0", - "prompt_function": "kaggle_medium_pot_prompt_fn_v0", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_medium_pot:v1": { - "name": "aimo_kaggle_medium_pot:v1", - "prompt_function": "kaggle_medium_pot_prompt_fn_v1", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_medium_pot:v2": { - "name": "aimo_kaggle_medium_pot:v2", - "prompt_function": "kaggle_medium_pot_prompt_fn_v2", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - } - }, - "summary_tasks": { - "custom|aimo_kaggle_medium_pot:v0|0": { - "hashes": { - "hash_examples": "2799c24461029dc3", - "hash_full_prompts": "de6572b914d5bba6", - "hash_input_tokens": "5af82ee284ccce29", - "hash_cont_tokens": "2df6998e7eb88500" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 29, - "non_padded": 11, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_medium_pot:v1|0": { - "hashes": { - "hash_examples": "806b2e2056b41f84", - "hash_full_prompts": "427d5de6e4d90df2", - "hash_input_tokens": "be3dcf9f8a2350d0", - "hash_cont_tokens": "b5774c47f6be2a31" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 26, - "non_padded": 14, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_medium_pot:v2|0": { - "hashes": { - "hash_examples": "d8534375acc5d427", - "hash_full_prompts": "6bb90c129d6d6123", - "hash_input_tokens": "22eeaedbeb3f56f0", - "hash_cont_tokens": "8b7b43a48fc2fd53" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 31, - "non_padded": 9, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "623505a45a4910c2", - "hash_full_prompts": "3aca0fad61a42921", - "hash_input_tokens": "c7d2c8bb76f8ecb1", - "hash_cont_tokens": "670a30b6ff17a138" - }, - "truncated": 120, - "non_truncated": 0, - "padded": 86, - "non_padded": 34, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.19/aimo_kaggle_hard_pot/results_2024-05-27T23-07-09.942111.json b/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.19/aimo_kaggle_hard_pot/results_2024-05-27T23-07-09.942111.json deleted file mode 100644 index 2b30b3a1d914bb87f024c60f0b97e5c46cbe1d5f..0000000000000000000000000000000000000000 --- a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.19/aimo_kaggle_hard_pot/results_2024-05-27T23-07-09.942111.json +++ /dev/null @@ -1,193 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": 4, - "max_samples": null, - "job_id": "", - "start_time": 13486.081405334, - "end_time": 14105.767951453, - "total_evaluation_time_secondes": "619.6865461189991", - "model_name": "AI-MO/deepseek-math-7b-rl-sft", - "model_sha": "1cea9f11e58ea3fb086d517a583f5792b2de295b", - "model_dtype": "torch.bfloat16", - "model_size": "12.93 GB", - "config": null - }, - "results": { - "custom|aimo_kaggle_hard_pot:v0|0": { - "qem": 0.0, - "qem_stderr": 0.0 - }, - "custom|aimo_kaggle_hard_pot:v1|0": { - "qem": 0.08, - "qem_stderr": 0.03875617133214441 - }, - "custom|aimo_kaggle_hard_pot:v2|0": { - "qem": 0.38, - "qem_stderr": 0.06934092056863767 - }, - "custom|aimo_kaggle_hard_pot:_average|0": { - "qem": 0.15333333333333335, - "qem_stderr": 0.036032363966927355 - }, - "all": { - "qem": 0.15333333333333335, - "qem_stderr": 0.036032363966927355 - } - }, - "versions": { - "custom|aimo_kaggle_hard_pot:v0|0": 0, - "custom|aimo_kaggle_hard_pot:v1|0": 0, - "custom|aimo_kaggle_hard_pot:v2|0": 0 - }, - "config_tasks": { - "custom|aimo_kaggle_hard_pot:v0": { - "name": "aimo_kaggle_hard_pot:v0", - "prompt_function": "kaggle_hard_pot_prompt_fn_v0", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_hard_pot:v1": { - "name": "aimo_kaggle_hard_pot:v1", - "prompt_function": "kaggle_hard_pot_prompt_fn_v1", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_hard_pot:v2": { - "name": "aimo_kaggle_hard_pot:v2", - "prompt_function": "kaggle_hard_pot_prompt_fn_v2", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - } - }, - "summary_tasks": { - "custom|aimo_kaggle_hard_pot:v0|0": { - "hashes": { - "hash_examples": "303213a38d9f7512", - "hash_full_prompts": "3c670fa0e80bc301", - "hash_input_tokens": "1bd4187ad6963415", - "hash_cont_tokens": "a0a790f62b1388d8" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 37, - "non_padded": 13, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_hard_pot:v1|0": { - "hashes": { - "hash_examples": "e4234b97ad92862f", - "hash_full_prompts": "09e7759a96f64e59", - "hash_input_tokens": "dc7f68cf14be3d61", - "hash_cont_tokens": "5e1bbb9cef526b0e" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 32, - "non_padded": 18, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_hard_pot:v2|0": { - "hashes": { - "hash_examples": "6396eb8833e13ba0", - "hash_full_prompts": "65f8b95ea99c7087", - "hash_input_tokens": "bddb67459e7601c5", - "hash_cont_tokens": "7392530883e39fb5" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 30, - "non_padded": 20, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "648c9a107d279e1e", - "hash_full_prompts": "7f4c38eb08b3bb41", - "hash_input_tokens": "16f0fd76e0a2a165", - "hash_cont_tokens": "5137b0a42317be31" - }, - "truncated": 150, - "non_truncated": 0, - "padded": 99, - "non_padded": 51, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.19/aimo_kaggle_medium_pot/results_2024-05-27T23-03-01.740417.json b/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.19/aimo_kaggle_medium_pot/results_2024-05-27T23-03-01.740417.json deleted file mode 100644 index 80061158f27de8c6d8d7cfb1be92a266f209ea71..0000000000000000000000000000000000000000 --- a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.19/aimo_kaggle_medium_pot/results_2024-05-27T23-03-01.740417.json +++ /dev/null @@ -1,193 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": 4, - "max_samples": null, - "job_id": "", - "start_time": 2107832.022622299, - "end_time": 2108158.834683901, - "total_evaluation_time_secondes": "326.81206160224974", - "model_name": "AI-MO/deepseek-math-7b-rl-sft", - "model_sha": "1cea9f11e58ea3fb086d517a583f5792b2de295b", - "model_dtype": "torch.bfloat16", - "model_size": "12.93 GB", - "config": null - }, - "results": { - "custom|aimo_kaggle_medium_pot:v0|0": { - "qem": 0.0, - "qem_stderr": 0.0 - }, - "custom|aimo_kaggle_medium_pot:v1|0": { - "qem": 0.0, - "qem_stderr": 0.0 - }, - "custom|aimo_kaggle_medium_pot:v2|0": { - "qem": 0.1, - "qem_stderr": 0.048038446141526144 - }, - "custom|aimo_kaggle_medium_pot:_average|0": { - "qem": 0.03333333333333333, - "qem_stderr": 0.016012815380508715 - }, - "all": { - "qem": 0.03333333333333333, - "qem_stderr": 0.016012815380508715 - } - }, - "versions": { - "custom|aimo_kaggle_medium_pot:v0|0": 0, - "custom|aimo_kaggle_medium_pot:v1|0": 0, - "custom|aimo_kaggle_medium_pot:v2|0": 0 - }, - "config_tasks": { - "custom|aimo_kaggle_medium_pot:v0": { - "name": "aimo_kaggle_medium_pot:v0", - "prompt_function": "kaggle_medium_pot_prompt_fn_v0", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_medium_pot:v1": { - "name": "aimo_kaggle_medium_pot:v1", - "prompt_function": "kaggle_medium_pot_prompt_fn_v1", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_medium_pot:v2": { - "name": "aimo_kaggle_medium_pot:v2", - "prompt_function": "kaggle_medium_pot_prompt_fn_v2", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - } - }, - "summary_tasks": { - "custom|aimo_kaggle_medium_pot:v0|0": { - "hashes": { - "hash_examples": "2799c24461029dc3", - "hash_full_prompts": "de6572b914d5bba6", - "hash_input_tokens": "5af82ee284ccce29", - "hash_cont_tokens": "4831552b6aa1f11a" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 29, - "non_padded": 11, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_medium_pot:v1|0": { - "hashes": { - "hash_examples": "806b2e2056b41f84", - "hash_full_prompts": "427d5de6e4d90df2", - "hash_input_tokens": "be3dcf9f8a2350d0", - "hash_cont_tokens": "e010e4d7b878610b" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 26, - "non_padded": 14, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_medium_pot:v2|0": { - "hashes": { - "hash_examples": "d8534375acc5d427", - "hash_full_prompts": "6bb90c129d6d6123", - "hash_input_tokens": "22eeaedbeb3f56f0", - "hash_cont_tokens": "eb307fdcd4d49ac6" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 31, - "non_padded": 9, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "623505a45a4910c2", - "hash_full_prompts": "3aca0fad61a42921", - "hash_input_tokens": "c7d2c8bb76f8ecb1", - "hash_cont_tokens": "72a155160763afaa" - }, - "truncated": 120, - "non_truncated": 0, - "padded": 86, - "non_padded": 34, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.20/aimo_kaggle_hard_pot/results_2024-05-27T23-22-58.427673.json b/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.20/aimo_kaggle_hard_pot/results_2024-05-27T23-22-58.427673.json deleted file mode 100644 index 1776fa13f13d2db500013327e04bd6f37d2562ff..0000000000000000000000000000000000000000 --- a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.20/aimo_kaggle_hard_pot/results_2024-05-27T23-22-58.427673.json +++ /dev/null @@ -1,193 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": 4, - "max_samples": null, - "job_id": "", - "start_time": 9261753.062522544, - "end_time": 9262496.130185273, - "total_evaluation_time_secondes": "743.0676627289504", - "model_name": "AI-MO/deepseek-math-7b-rl-sft", - "model_sha": "f7b5eb8ab6a29410470f160addf72b5070381d9f", - "model_dtype": "torch.bfloat16", - "model_size": "12.93 GB", - "config": null - }, - "results": { - "custom|aimo_kaggle_hard_pot:v0|0": { - "qem": 0.08, - "qem_stderr": 0.038756171332144415 - }, - "custom|aimo_kaggle_hard_pot:v1|0": { - "qem": 0.36, - "qem_stderr": 0.06857142857142856 - }, - "custom|aimo_kaggle_hard_pot:v2|0": { - "qem": 0.52, - "qem_stderr": 0.0713714056959817 - }, - "custom|aimo_kaggle_hard_pot:_average|0": { - "qem": 0.32, - "qem_stderr": 0.05956633519985156 - }, - "all": { - "qem": 0.32, - "qem_stderr": 0.05956633519985156 - } - }, - "versions": { - "custom|aimo_kaggle_hard_pot:v0|0": 0, - "custom|aimo_kaggle_hard_pot:v1|0": 0, - "custom|aimo_kaggle_hard_pot:v2|0": 0 - }, - "config_tasks": { - "custom|aimo_kaggle_hard_pot:v0": { - "name": "aimo_kaggle_hard_pot:v0", - "prompt_function": "kaggle_hard_pot_prompt_fn_v0", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_hard_pot:v1": { - "name": "aimo_kaggle_hard_pot:v1", - "prompt_function": "kaggle_hard_pot_prompt_fn_v1", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_hard_pot:v2": { - "name": "aimo_kaggle_hard_pot:v2", - "prompt_function": "kaggle_hard_pot_prompt_fn_v2", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - } - }, - "summary_tasks": { - "custom|aimo_kaggle_hard_pot:v0|0": { - "hashes": { - "hash_examples": "303213a38d9f7512", - "hash_full_prompts": "3c670fa0e80bc301", - "hash_input_tokens": "1bd4187ad6963415", - "hash_cont_tokens": "e35deeb628d6fe74" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 37, - "non_padded": 13, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_hard_pot:v1|0": { - "hashes": { - "hash_examples": "e4234b97ad92862f", - "hash_full_prompts": "09e7759a96f64e59", - "hash_input_tokens": "dc7f68cf14be3d61", - "hash_cont_tokens": "0182d043896b5e7c" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 32, - "non_padded": 18, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_hard_pot:v2|0": { - "hashes": { - "hash_examples": "6396eb8833e13ba0", - "hash_full_prompts": "65f8b95ea99c7087", - "hash_input_tokens": "bddb67459e7601c5", - "hash_cont_tokens": "36c648e41d85d715" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 30, - "non_padded": 20, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "648c9a107d279e1e", - "hash_full_prompts": "7f4c38eb08b3bb41", - "hash_input_tokens": "16f0fd76e0a2a165", - "hash_cont_tokens": "122038fba51079b0" - }, - "truncated": 150, - "non_truncated": 0, - "padded": 99, - "non_padded": 51, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.20/aimo_kaggle_medium_pot/results_2024-05-27T23-17-58.401823.json b/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.20/aimo_kaggle_medium_pot/results_2024-05-27T23-17-58.401823.json deleted file mode 100644 index 21672c9c50076c94baaf75d3633ea15648dac60f..0000000000000000000000000000000000000000 --- a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.20/aimo_kaggle_medium_pot/results_2024-05-27T23-17-58.401823.json +++ /dev/null @@ -1,193 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": 4, - "max_samples": null, - "job_id": "", - "start_time": 2148224.715635298, - "end_time": 2148651.09183027, - "total_evaluation_time_secondes": "426.37619497207925", - "model_name": "AI-MO/deepseek-math-7b-rl-sft", - "model_sha": "f7b5eb8ab6a29410470f160addf72b5070381d9f", - "model_dtype": "torch.bfloat16", - "model_size": "12.93 GB", - "config": null - }, - "results": { - "custom|aimo_kaggle_medium_pot:v0|0": { - "qem": 0.15, - "qem_stderr": 0.05717718748968655 - }, - "custom|aimo_kaggle_medium_pot:v1|0": { - "qem": 0.15, - "qem_stderr": 0.05717718748968655 - }, - "custom|aimo_kaggle_medium_pot:v2|0": { - "qem": 0.15, - "qem_stderr": 0.05717718748968655 - }, - "custom|aimo_kaggle_medium_pot:_average|0": { - "qem": 0.15, - "qem_stderr": 0.05717718748968655 - }, - "all": { - "qem": 0.15, - "qem_stderr": 0.05717718748968655 - } - }, - "versions": { - "custom|aimo_kaggle_medium_pot:v0|0": 0, - "custom|aimo_kaggle_medium_pot:v1|0": 0, - "custom|aimo_kaggle_medium_pot:v2|0": 0 - }, - "config_tasks": { - "custom|aimo_kaggle_medium_pot:v0": { - "name": "aimo_kaggle_medium_pot:v0", - "prompt_function": "kaggle_medium_pot_prompt_fn_v0", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_medium_pot:v1": { - "name": "aimo_kaggle_medium_pot:v1", - "prompt_function": "kaggle_medium_pot_prompt_fn_v1", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_medium_pot:v2": { - "name": "aimo_kaggle_medium_pot:v2", - "prompt_function": "kaggle_medium_pot_prompt_fn_v2", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - } - }, - "summary_tasks": { - "custom|aimo_kaggle_medium_pot:v0|0": { - "hashes": { - "hash_examples": "2799c24461029dc3", - "hash_full_prompts": "de6572b914d5bba6", - "hash_input_tokens": "5af82ee284ccce29", - "hash_cont_tokens": "fb0b718f572524ae" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 29, - "non_padded": 11, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_medium_pot:v1|0": { - "hashes": { - "hash_examples": "806b2e2056b41f84", - "hash_full_prompts": "427d5de6e4d90df2", - "hash_input_tokens": "be3dcf9f8a2350d0", - "hash_cont_tokens": "71bc7e12f46da913" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 26, - "non_padded": 14, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_medium_pot:v2|0": { - "hashes": { - "hash_examples": "d8534375acc5d427", - "hash_full_prompts": "6bb90c129d6d6123", - "hash_input_tokens": "22eeaedbeb3f56f0", - "hash_cont_tokens": "bb8c28cff5d2fe4c" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 31, - "non_padded": 9, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "623505a45a4910c2", - "hash_full_prompts": "3aca0fad61a42921", - "hash_input_tokens": "c7d2c8bb76f8ecb1", - "hash_cont_tokens": "87450493cb7bfd84" - }, - "truncated": 120, - "non_truncated": 0, - "padded": 86, - "non_padded": 34, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.21/aimo_kaggle_hard_pot/results_2024-05-27T23-35-58.717682.json b/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.21/aimo_kaggle_hard_pot/results_2024-05-27T23-35-58.717682.json deleted file mode 100644 index 1795493da9f8453d5322ae46cbb0f0f0378bc37c..0000000000000000000000000000000000000000 --- a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.21/aimo_kaggle_hard_pot/results_2024-05-27T23-35-58.717682.json +++ /dev/null @@ -1,193 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": 4, - "max_samples": null, - "job_id": "", - "start_time": 2382971.546898631, - "end_time": 2383707.611699418, - "total_evaluation_time_secondes": "736.0648007872514", - "model_name": "AI-MO/deepseek-math-7b-rl-sft", - "model_sha": "7ab2aa592bccd716ed638738b6c9164e46af20b3", - "model_dtype": "torch.bfloat16", - "model_size": "12.93 GB", - "config": null - }, - "results": { - "custom|aimo_kaggle_hard_pot:v0|0": { - "qem": 0.06, - "qem_stderr": 0.033926691677251195 - }, - "custom|aimo_kaggle_hard_pot:v1|0": { - "qem": 0.36, - "qem_stderr": 0.06857142857142856 - }, - "custom|aimo_kaggle_hard_pot:v2|0": { - "qem": 0.36, - "qem_stderr": 0.06857142857142856 - }, - "custom|aimo_kaggle_hard_pot:_average|0": { - "qem": 0.26, - "qem_stderr": 0.05702318294003611 - }, - "all": { - "qem": 0.26, - "qem_stderr": 0.05702318294003611 - } - }, - "versions": { - "custom|aimo_kaggle_hard_pot:v0|0": 0, - "custom|aimo_kaggle_hard_pot:v1|0": 0, - "custom|aimo_kaggle_hard_pot:v2|0": 0 - }, - "config_tasks": { - "custom|aimo_kaggle_hard_pot:v0": { - "name": "aimo_kaggle_hard_pot:v0", - "prompt_function": "kaggle_hard_pot_prompt_fn_v0", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_hard_pot:v1": { - "name": "aimo_kaggle_hard_pot:v1", - "prompt_function": "kaggle_hard_pot_prompt_fn_v1", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_hard_pot:v2": { - "name": "aimo_kaggle_hard_pot:v2", - "prompt_function": "kaggle_hard_pot_prompt_fn_v2", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - } - }, - "summary_tasks": { - "custom|aimo_kaggle_hard_pot:v0|0": { - "hashes": { - "hash_examples": "303213a38d9f7512", - "hash_full_prompts": "3c670fa0e80bc301", - "hash_input_tokens": "1bd4187ad6963415", - "hash_cont_tokens": "1a30a85c38b734ce" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 37, - "non_padded": 13, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_hard_pot:v1|0": { - "hashes": { - "hash_examples": "e4234b97ad92862f", - "hash_full_prompts": "09e7759a96f64e59", - "hash_input_tokens": "dc7f68cf14be3d61", - "hash_cont_tokens": "8c37ee718a5fd5a4" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 32, - "non_padded": 18, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_hard_pot:v2|0": { - "hashes": { - "hash_examples": "6396eb8833e13ba0", - "hash_full_prompts": "65f8b95ea99c7087", - "hash_input_tokens": "bddb67459e7601c5", - "hash_cont_tokens": "733e4c60c9efc7a3" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 30, - "non_padded": 20, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "648c9a107d279e1e", - "hash_full_prompts": "7f4c38eb08b3bb41", - "hash_input_tokens": "16f0fd76e0a2a165", - "hash_cont_tokens": "16b6151a4659c144" - }, - "truncated": 150, - "non_truncated": 0, - "padded": 99, - "non_padded": 51, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.21/aimo_kaggle_medium_pot/results_2024-05-27T23-30-10.326809.json b/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.21/aimo_kaggle_medium_pot/results_2024-05-27T23-30-10.326809.json deleted file mode 100644 index b7b7a3533b1be453da232e6cf5ca2813b684a165..0000000000000000000000000000000000000000 --- a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.21/aimo_kaggle_medium_pot/results_2024-05-27T23-30-10.326809.json +++ /dev/null @@ -1,193 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": 4, - "max_samples": null, - "job_id": "", - "start_time": 43826.907381039, - "end_time": 44221.343392928, - "total_evaluation_time_secondes": "394.43601188900357", - "model_name": "AI-MO/deepseek-math-7b-rl-sft", - "model_sha": "7ab2aa592bccd716ed638738b6c9164e46af20b3", - "model_dtype": "torch.bfloat16", - "model_size": "12.93 GB", - "config": null - }, - "results": { - "custom|aimo_kaggle_medium_pot:v0|0": { - "qem": 0.1, - "qem_stderr": 0.04803844614152613 - }, - "custom|aimo_kaggle_medium_pot:v1|0": { - "qem": 0.1, - "qem_stderr": 0.04803844614152612 - }, - "custom|aimo_kaggle_medium_pot:v2|0": { - "qem": 0.175, - "qem_stderr": 0.06084343084444758 - }, - "custom|aimo_kaggle_medium_pot:_average|0": { - "qem": 0.125, - "qem_stderr": 0.052306774375833275 - }, - "all": { - "qem": 0.125, - "qem_stderr": 0.052306774375833275 - } - }, - "versions": { - "custom|aimo_kaggle_medium_pot:v0|0": 0, - "custom|aimo_kaggle_medium_pot:v1|0": 0, - "custom|aimo_kaggle_medium_pot:v2|0": 0 - }, - "config_tasks": { - "custom|aimo_kaggle_medium_pot:v0": { - "name": "aimo_kaggle_medium_pot:v0", - "prompt_function": "kaggle_medium_pot_prompt_fn_v0", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_medium_pot:v1": { - "name": "aimo_kaggle_medium_pot:v1", - "prompt_function": "kaggle_medium_pot_prompt_fn_v1", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_medium_pot:v2": { - "name": "aimo_kaggle_medium_pot:v2", - "prompt_function": "kaggle_medium_pot_prompt_fn_v2", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - } - }, - "summary_tasks": { - "custom|aimo_kaggle_medium_pot:v0|0": { - "hashes": { - "hash_examples": "2799c24461029dc3", - "hash_full_prompts": "de6572b914d5bba6", - "hash_input_tokens": "5af82ee284ccce29", - "hash_cont_tokens": "c7fd168c1b8f2513" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 29, - "non_padded": 11, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_medium_pot:v1|0": { - "hashes": { - "hash_examples": "806b2e2056b41f84", - "hash_full_prompts": "427d5de6e4d90df2", - "hash_input_tokens": "be3dcf9f8a2350d0", - "hash_cont_tokens": "879b4264909aefa4" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 26, - "non_padded": 14, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_medium_pot:v2|0": { - "hashes": { - "hash_examples": "d8534375acc5d427", - "hash_full_prompts": "6bb90c129d6d6123", - "hash_input_tokens": "22eeaedbeb3f56f0", - "hash_cont_tokens": "879c4228146bc450" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 31, - "non_padded": 9, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "623505a45a4910c2", - "hash_full_prompts": "3aca0fad61a42921", - "hash_input_tokens": "c7d2c8bb76f8ecb1", - "hash_cont_tokens": "26f4ba946de6ee3e" - }, - "truncated": 120, - "non_truncated": 0, - "padded": 86, - "non_padded": 34, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.22/aimo_kaggle_hard_pot/results_2024-05-27T23-23-30.662062.json b/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.22/aimo_kaggle_hard_pot/results_2024-05-27T23-23-30.662062.json deleted file mode 100644 index 173803ab3f776d90c07e2001d9eeb77e31e2352f..0000000000000000000000000000000000000000 --- a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.22/aimo_kaggle_hard_pot/results_2024-05-27T23-23-30.662062.json +++ /dev/null @@ -1,193 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": 4, - "max_samples": null, - "job_id": "", - "start_time": 17973.654498274, - "end_time": 18682.562186908, - "total_evaluation_time_secondes": "708.9076886339972", - "model_name": "AI-MO/deepseek-math-7b-rl-sft", - "model_sha": "bdb1995f9c9eca6d2d0fc76b0f921b143f8e3901", - "model_dtype": "torch.bfloat16", - "model_size": "12.93 GB", - "config": null - }, - "results": { - "custom|aimo_kaggle_hard_pot:v0|0": { - "qem": 0.04, - "qem_stderr": 0.02799416848895062 - }, - "custom|aimo_kaggle_hard_pot:v1|0": { - "qem": 0.34, - "qem_stderr": 0.0676726816132972 - }, - "custom|aimo_kaggle_hard_pot:v2|0": { - "qem": 0.34, - "qem_stderr": 0.06767268161329719 - }, - "custom|aimo_kaggle_hard_pot:_average|0": { - "qem": 0.24, - "qem_stderr": 0.05444651057184833 - }, - "all": { - "qem": 0.24, - "qem_stderr": 0.05444651057184833 - } - }, - "versions": { - "custom|aimo_kaggle_hard_pot:v0|0": 0, - "custom|aimo_kaggle_hard_pot:v1|0": 0, - "custom|aimo_kaggle_hard_pot:v2|0": 0 - }, - "config_tasks": { - "custom|aimo_kaggle_hard_pot:v0": { - "name": "aimo_kaggle_hard_pot:v0", - "prompt_function": "kaggle_hard_pot_prompt_fn_v0", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_hard_pot:v1": { - "name": "aimo_kaggle_hard_pot:v1", - "prompt_function": "kaggle_hard_pot_prompt_fn_v1", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_hard_pot:v2": { - "name": "aimo_kaggle_hard_pot:v2", - "prompt_function": "kaggle_hard_pot_prompt_fn_v2", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - } - }, - "summary_tasks": { - "custom|aimo_kaggle_hard_pot:v0|0": { - "hashes": { - "hash_examples": "303213a38d9f7512", - "hash_full_prompts": "3c670fa0e80bc301", - "hash_input_tokens": "1bd4187ad6963415", - "hash_cont_tokens": "1c5f25a976e2be6c" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 37, - "non_padded": 13, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_hard_pot:v1|0": { - "hashes": { - "hash_examples": "e4234b97ad92862f", - "hash_full_prompts": "09e7759a96f64e59", - "hash_input_tokens": "dc7f68cf14be3d61", - "hash_cont_tokens": "e3af54be9201f585" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 32, - "non_padded": 18, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_hard_pot:v2|0": { - "hashes": { - "hash_examples": "6396eb8833e13ba0", - "hash_full_prompts": "65f8b95ea99c7087", - "hash_input_tokens": "bddb67459e7601c5", - "hash_cont_tokens": "e9b5c05422f79c6d" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 30, - "non_padded": 20, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "648c9a107d279e1e", - "hash_full_prompts": "7f4c38eb08b3bb41", - "hash_input_tokens": "16f0fd76e0a2a165", - "hash_cont_tokens": "defebda25853e21e" - }, - "truncated": 150, - "non_truncated": 0, - "padded": 99, - "non_padded": 51, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.22/aimo_kaggle_medium_pot/results_2024-05-27T23-19-42.336991.json b/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.22/aimo_kaggle_medium_pot/results_2024-05-27T23-19-42.336991.json deleted file mode 100644 index 382ecc9dea321579472d48e37eab20d0e3c9cf9f..0000000000000000000000000000000000000000 --- a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.22/aimo_kaggle_medium_pot/results_2024-05-27T23-19-42.336991.json +++ /dev/null @@ -1,193 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": 4, - "max_samples": null, - "job_id": "", - "start_time": 1066.478085165, - "end_time": 1425.511067846, - "total_evaluation_time_secondes": "359.032982681", - "model_name": "AI-MO/deepseek-math-7b-rl-sft", - "model_sha": "bdb1995f9c9eca6d2d0fc76b0f921b143f8e3901", - "model_dtype": "torch.bfloat16", - "model_size": "12.93 GB", - "config": null - }, - "results": { - "custom|aimo_kaggle_medium_pot:v0|0": { - "qem": 0.1, - "qem_stderr": 0.04803844614152612 - }, - "custom|aimo_kaggle_medium_pot:v1|0": { - "qem": 0.15, - "qem_stderr": 0.05717718748968655 - }, - "custom|aimo_kaggle_medium_pot:v2|0": { - "qem": 0.225, - "qem_stderr": 0.06686668711812967 - }, - "custom|aimo_kaggle_medium_pot:_average|0": { - "qem": 0.15833333333333333, - "qem_stderr": 0.05736077358311411 - }, - "all": { - "qem": 0.15833333333333333, - "qem_stderr": 0.05736077358311411 - } - }, - "versions": { - "custom|aimo_kaggle_medium_pot:v0|0": 0, - "custom|aimo_kaggle_medium_pot:v1|0": 0, - "custom|aimo_kaggle_medium_pot:v2|0": 0 - }, - "config_tasks": { - "custom|aimo_kaggle_medium_pot:v0": { - "name": "aimo_kaggle_medium_pot:v0", - "prompt_function": "kaggle_medium_pot_prompt_fn_v0", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_medium_pot:v1": { - "name": "aimo_kaggle_medium_pot:v1", - "prompt_function": "kaggle_medium_pot_prompt_fn_v1", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_medium_pot:v2": { - "name": "aimo_kaggle_medium_pot:v2", - "prompt_function": "kaggle_medium_pot_prompt_fn_v2", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - } - }, - "summary_tasks": { - "custom|aimo_kaggle_medium_pot:v0|0": { - "hashes": { - "hash_examples": "2799c24461029dc3", - "hash_full_prompts": "de6572b914d5bba6", - "hash_input_tokens": "5af82ee284ccce29", - "hash_cont_tokens": "0d6c4ad199006a4a" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 29, - "non_padded": 11, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_medium_pot:v1|0": { - "hashes": { - "hash_examples": "806b2e2056b41f84", - "hash_full_prompts": "427d5de6e4d90df2", - "hash_input_tokens": "be3dcf9f8a2350d0", - "hash_cont_tokens": "56da26b077779316" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 26, - "non_padded": 14, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_medium_pot:v2|0": { - "hashes": { - "hash_examples": "d8534375acc5d427", - "hash_full_prompts": "6bb90c129d6d6123", - "hash_input_tokens": "22eeaedbeb3f56f0", - "hash_cont_tokens": "7a8da1926ba6da41" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 31, - "non_padded": 9, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "623505a45a4910c2", - "hash_full_prompts": "3aca0fad61a42921", - "hash_input_tokens": "c7d2c8bb76f8ecb1", - "hash_cont_tokens": "2a5d3e88613b08ed" - }, - "truncated": 120, - "non_truncated": 0, - "padded": 86, - "non_padded": 34, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.23/aimo_kaggle_hard_pot/results_2024-05-27T23-25-22.587556.json b/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.23/aimo_kaggle_hard_pot/results_2024-05-27T23-25-22.587556.json deleted file mode 100644 index 03b8b4c700381023def0e1fa2b6fd752fb1cb153..0000000000000000000000000000000000000000 --- a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.23/aimo_kaggle_hard_pot/results_2024-05-27T23-25-22.587556.json +++ /dev/null @@ -1,193 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": 4, - "max_samples": null, - "job_id": "", - "start_time": 19828.403239578, - "end_time": 20534.837643038, - "total_evaluation_time_secondes": "706.4344034599999", - "model_name": "AI-MO/deepseek-math-7b-rl-sft", - "model_sha": "8c5f16dfb4b56e7a9f2114ad755eb3af08d57a77", - "model_dtype": "torch.bfloat16", - "model_size": "12.93 GB", - "config": null - }, - "results": { - "custom|aimo_kaggle_hard_pot:v0|0": { - "qem": 0.06, - "qem_stderr": 0.0339266916772512 - }, - "custom|aimo_kaggle_hard_pot:v1|0": { - "qem": 0.28, - "qem_stderr": 0.06414269805898185 - }, - "custom|aimo_kaggle_hard_pot:v2|0": { - "qem": 0.48, - "qem_stderr": 0.0713714056959817 - }, - "custom|aimo_kaggle_hard_pot:_average|0": { - "qem": 0.2733333333333334, - "qem_stderr": 0.05648026514407158 - }, - "all": { - "qem": 0.2733333333333334, - "qem_stderr": 0.05648026514407158 - } - }, - "versions": { - "custom|aimo_kaggle_hard_pot:v0|0": 0, - "custom|aimo_kaggle_hard_pot:v1|0": 0, - "custom|aimo_kaggle_hard_pot:v2|0": 0 - }, - "config_tasks": { - "custom|aimo_kaggle_hard_pot:v0": { - "name": "aimo_kaggle_hard_pot:v0", - "prompt_function": "kaggle_hard_pot_prompt_fn_v0", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_hard_pot:v1": { - "name": "aimo_kaggle_hard_pot:v1", - "prompt_function": "kaggle_hard_pot_prompt_fn_v1", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_hard_pot:v2": { - "name": "aimo_kaggle_hard_pot:v2", - "prompt_function": "kaggle_hard_pot_prompt_fn_v2", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - } - }, - "summary_tasks": { - "custom|aimo_kaggle_hard_pot:v0|0": { - "hashes": { - "hash_examples": "303213a38d9f7512", - "hash_full_prompts": "3c670fa0e80bc301", - "hash_input_tokens": "1bd4187ad6963415", - "hash_cont_tokens": "c4fe88188f6dbdfd" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 37, - "non_padded": 13, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_hard_pot:v1|0": { - "hashes": { - "hash_examples": "e4234b97ad92862f", - "hash_full_prompts": "09e7759a96f64e59", - "hash_input_tokens": "dc7f68cf14be3d61", - "hash_cont_tokens": "66e07ec6a2f97008" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 32, - "non_padded": 18, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_hard_pot:v2|0": { - "hashes": { - "hash_examples": "6396eb8833e13ba0", - "hash_full_prompts": "65f8b95ea99c7087", - "hash_input_tokens": "bddb67459e7601c5", - "hash_cont_tokens": "f4df881114e46e4e" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 30, - "non_padded": 20, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "648c9a107d279e1e", - "hash_full_prompts": "7f4c38eb08b3bb41", - "hash_input_tokens": "16f0fd76e0a2a165", - "hash_cont_tokens": "f2a00d84a85817a3" - }, - "truncated": 150, - "non_truncated": 0, - "padded": 99, - "non_padded": 51, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.23/aimo_kaggle_medium_pot/results_2024-05-27T23-21-43.566972.json b/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.23/aimo_kaggle_medium_pot/results_2024-05-27T23-21-43.566972.json deleted file mode 100644 index 075d341b430e99c2d30df5c4fdf52cc12623983b..0000000000000000000000000000000000000000 --- a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.23/aimo_kaggle_medium_pot/results_2024-05-27T23-21-43.566972.json +++ /dev/null @@ -1,193 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": 4, - "max_samples": null, - "job_id": "", - "start_time": 16393.297866342, - "end_time": 16784.511684767, - "total_evaluation_time_secondes": "391.2138184249998", - "model_name": "AI-MO/deepseek-math-7b-rl-sft", - "model_sha": "8c5f16dfb4b56e7a9f2114ad755eb3af08d57a77", - "model_dtype": "torch.bfloat16", - "model_size": "12.93 GB", - "config": null - }, - "results": { - "custom|aimo_kaggle_medium_pot:v0|0": { - "qem": 0.175, - "qem_stderr": 0.060843430844447564 - }, - "custom|aimo_kaggle_medium_pot:v1|0": { - "qem": 0.1, - "qem_stderr": 0.04803844614152613 - }, - "custom|aimo_kaggle_medium_pot:v2|0": { - "qem": 0.125, - "qem_stderr": 0.05295740910852021 - }, - "custom|aimo_kaggle_medium_pot:_average|0": { - "qem": 0.13333333333333333, - "qem_stderr": 0.053946428698164635 - }, - "all": { - "qem": 0.13333333333333333, - "qem_stderr": 0.053946428698164635 - } - }, - "versions": { - "custom|aimo_kaggle_medium_pot:v0|0": 0, - "custom|aimo_kaggle_medium_pot:v1|0": 0, - "custom|aimo_kaggle_medium_pot:v2|0": 0 - }, - "config_tasks": { - "custom|aimo_kaggle_medium_pot:v0": { - "name": "aimo_kaggle_medium_pot:v0", - "prompt_function": "kaggle_medium_pot_prompt_fn_v0", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_medium_pot:v1": { - "name": "aimo_kaggle_medium_pot:v1", - "prompt_function": "kaggle_medium_pot_prompt_fn_v1", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_medium_pot:v2": { - "name": "aimo_kaggle_medium_pot:v2", - "prompt_function": "kaggle_medium_pot_prompt_fn_v2", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - } - }, - "summary_tasks": { - "custom|aimo_kaggle_medium_pot:v0|0": { - "hashes": { - "hash_examples": "2799c24461029dc3", - "hash_full_prompts": "de6572b914d5bba6", - "hash_input_tokens": "5af82ee284ccce29", - "hash_cont_tokens": "2128dae826f7ee05" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 29, - "non_padded": 11, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_medium_pot:v1|0": { - "hashes": { - "hash_examples": "806b2e2056b41f84", - "hash_full_prompts": "427d5de6e4d90df2", - "hash_input_tokens": "be3dcf9f8a2350d0", - "hash_cont_tokens": "0cee5a33cfbf4be1" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 26, - "non_padded": 14, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_medium_pot:v2|0": { - "hashes": { - "hash_examples": "d8534375acc5d427", - "hash_full_prompts": "6bb90c129d6d6123", - "hash_input_tokens": "22eeaedbeb3f56f0", - "hash_cont_tokens": "421d4bef3e69a5ce" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 31, - "non_padded": 9, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "623505a45a4910c2", - "hash_full_prompts": "3aca0fad61a42921", - "hash_input_tokens": "c7d2c8bb76f8ecb1", - "hash_cont_tokens": "b8fe67c3e2484808" - }, - "truncated": 120, - "non_truncated": 0, - "padded": 86, - "non_padded": 34, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.24/aimo_kaggle_hard_pot/results_2024-05-27T23-15-02.538862.json b/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.24/aimo_kaggle_hard_pot/results_2024-05-27T23-15-02.538862.json deleted file mode 100644 index 06d7462f992b60b47b7dc4d3c5ec2dcdb65280ed..0000000000000000000000000000000000000000 --- a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.24/aimo_kaggle_hard_pot/results_2024-05-27T23-15-02.538862.json +++ /dev/null @@ -1,193 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": 4, - "max_samples": null, - "job_id": "", - "start_time": 46235.735212576, - "end_time": 46977.393914516, - "total_evaluation_time_secondes": "741.6587019399958", - "model_name": "AI-MO/deepseek-math-7b-rl-sft", - "model_sha": "00c131eb873eb25cafa3dd2ed4f3df2d299ae974", - "model_dtype": "torch.bfloat16", - "model_size": "12.93 GB", - "config": null - }, - "results": { - "custom|aimo_kaggle_hard_pot:v0|0": { - "qem": 0.06, - "qem_stderr": 0.0339266916772512 - }, - "custom|aimo_kaggle_hard_pot:v1|0": { - "qem": 0.34, - "qem_stderr": 0.0676726816132972 - }, - "custom|aimo_kaggle_hard_pot:v2|0": { - "qem": 0.48, - "qem_stderr": 0.07137140569598172 - }, - "custom|aimo_kaggle_hard_pot:_average|0": { - "qem": 0.29333333333333333, - "qem_stderr": 0.05765692632884337 - }, - "all": { - "qem": 0.29333333333333333, - "qem_stderr": 0.05765692632884337 - } - }, - "versions": { - "custom|aimo_kaggle_hard_pot:v0|0": 0, - "custom|aimo_kaggle_hard_pot:v1|0": 0, - "custom|aimo_kaggle_hard_pot:v2|0": 0 - }, - "config_tasks": { - "custom|aimo_kaggle_hard_pot:v0": { - "name": "aimo_kaggle_hard_pot:v0", - "prompt_function": "kaggle_hard_pot_prompt_fn_v0", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_hard_pot:v1": { - "name": "aimo_kaggle_hard_pot:v1", - "prompt_function": "kaggle_hard_pot_prompt_fn_v1", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_hard_pot:v2": { - "name": "aimo_kaggle_hard_pot:v2", - "prompt_function": "kaggle_hard_pot_prompt_fn_v2", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - } - }, - "summary_tasks": { - "custom|aimo_kaggle_hard_pot:v0|0": { - "hashes": { - "hash_examples": "303213a38d9f7512", - "hash_full_prompts": "3c670fa0e80bc301", - "hash_input_tokens": "1bd4187ad6963415", - "hash_cont_tokens": "2ab623f94429894f" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 37, - "non_padded": 13, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_hard_pot:v1|0": { - "hashes": { - "hash_examples": "e4234b97ad92862f", - "hash_full_prompts": "09e7759a96f64e59", - "hash_input_tokens": "dc7f68cf14be3d61", - "hash_cont_tokens": "0b82ff116cbccc23" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 32, - "non_padded": 18, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_hard_pot:v2|0": { - "hashes": { - "hash_examples": "6396eb8833e13ba0", - "hash_full_prompts": "65f8b95ea99c7087", - "hash_input_tokens": "bddb67459e7601c5", - "hash_cont_tokens": "65a24b7b1d474d4a" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 30, - "non_padded": 20, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "648c9a107d279e1e", - "hash_full_prompts": "7f4c38eb08b3bb41", - "hash_input_tokens": "16f0fd76e0a2a165", - "hash_cont_tokens": "66759b6936c0c6e6" - }, - "truncated": 150, - "non_truncated": 0, - "padded": 99, - "non_padded": 51, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.24/aimo_kaggle_medium_pot/results_2024-05-27T23-10-09.952224.json b/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.24/aimo_kaggle_medium_pot/results_2024-05-27T23-10-09.952224.json deleted file mode 100644 index 39162435cfbe9b463e696f4a3483349a3e2b398a..0000000000000000000000000000000000000000 --- a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.24/aimo_kaggle_medium_pot/results_2024-05-27T23-10-09.952224.json +++ /dev/null @@ -1,193 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": 4, - "max_samples": null, - "job_id": "", - "start_time": 309077.641450251, - "end_time": 309494.11537162, - "total_evaluation_time_secondes": "416.47392136900453", - "model_name": "AI-MO/deepseek-math-7b-rl-sft", - "model_sha": "00c131eb873eb25cafa3dd2ed4f3df2d299ae974", - "model_dtype": "torch.bfloat16", - "model_size": "12.93 GB", - "config": null - }, - "results": { - "custom|aimo_kaggle_medium_pot:v0|0": { - "qem": 0.175, - "qem_stderr": 0.06084343084444758 - }, - "custom|aimo_kaggle_medium_pot:v1|0": { - "qem": 0.2, - "qem_stderr": 0.06405126152203486 - }, - "custom|aimo_kaggle_medium_pot:v2|0": { - "qem": 0.175, - "qem_stderr": 0.060843430844447585 - }, - "custom|aimo_kaggle_medium_pot:_average|0": { - "qem": 0.18333333333333335, - "qem_stderr": 0.06191270773697668 - }, - "all": { - "qem": 0.18333333333333335, - "qem_stderr": 0.06191270773697668 - } - }, - "versions": { - "custom|aimo_kaggle_medium_pot:v0|0": 0, - "custom|aimo_kaggle_medium_pot:v1|0": 0, - "custom|aimo_kaggle_medium_pot:v2|0": 0 - }, - "config_tasks": { - "custom|aimo_kaggle_medium_pot:v0": { - "name": "aimo_kaggle_medium_pot:v0", - "prompt_function": "kaggle_medium_pot_prompt_fn_v0", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_medium_pot:v1": { - "name": "aimo_kaggle_medium_pot:v1", - "prompt_function": "kaggle_medium_pot_prompt_fn_v1", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_medium_pot:v2": { - "name": "aimo_kaggle_medium_pot:v2", - "prompt_function": "kaggle_medium_pot_prompt_fn_v2", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - } - }, - "summary_tasks": { - "custom|aimo_kaggle_medium_pot:v0|0": { - "hashes": { - "hash_examples": "2799c24461029dc3", - "hash_full_prompts": "de6572b914d5bba6", - "hash_input_tokens": "5af82ee284ccce29", - "hash_cont_tokens": "b61b85f4c47050bf" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 29, - "non_padded": 11, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_medium_pot:v1|0": { - "hashes": { - "hash_examples": "806b2e2056b41f84", - "hash_full_prompts": "427d5de6e4d90df2", - "hash_input_tokens": "be3dcf9f8a2350d0", - "hash_cont_tokens": "670b75edfe9beaa5" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 26, - "non_padded": 14, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_medium_pot:v2|0": { - "hashes": { - "hash_examples": "d8534375acc5d427", - "hash_full_prompts": "6bb90c129d6d6123", - "hash_input_tokens": "22eeaedbeb3f56f0", - "hash_cont_tokens": "a20be6920c0e9fef" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 31, - "non_padded": 9, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "623505a45a4910c2", - "hash_full_prompts": "3aca0fad61a42921", - "hash_input_tokens": "c7d2c8bb76f8ecb1", - "hash_cont_tokens": "d07792a8568f71fd" - }, - "truncated": 120, - "non_truncated": 0, - "padded": 86, - "non_padded": 34, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.25/aimo_kaggle_hard_pot/results_2024-05-27T23-10-01.696689.json b/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.25/aimo_kaggle_hard_pot/results_2024-05-27T23-10-01.696689.json deleted file mode 100644 index cc659922051c63cd758cc0fcc1268e7b837e0d33..0000000000000000000000000000000000000000 --- a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.25/aimo_kaggle_hard_pot/results_2024-05-27T23-10-01.696689.json +++ /dev/null @@ -1,193 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": 4, - "max_samples": null, - "job_id": "", - "start_time": 17131.327505868, - "end_time": 17873.596676263, - "total_evaluation_time_secondes": "742.2691703950004", - "model_name": "AI-MO/deepseek-math-7b-rl-sft", - "model_sha": "da5e99bc709858f093b72041c04224cdcc0c9b4f", - "model_dtype": "torch.bfloat16", - "model_size": "12.93 GB", - "config": null - }, - "results": { - "custom|aimo_kaggle_hard_pot:v0|0": { - "qem": 0.06, - "qem_stderr": 0.0339266916772512 - }, - "custom|aimo_kaggle_hard_pot:v1|0": { - "qem": 0.48, - "qem_stderr": 0.0713714056959817 - }, - "custom|aimo_kaggle_hard_pot:v2|0": { - "qem": 0.48, - "qem_stderr": 0.07137140569598172 - }, - "custom|aimo_kaggle_hard_pot:_average|0": { - "qem": 0.34, - "qem_stderr": 0.05888983435640487 - }, - "all": { - "qem": 0.34, - "qem_stderr": 0.05888983435640487 - } - }, - "versions": { - "custom|aimo_kaggle_hard_pot:v0|0": 0, - "custom|aimo_kaggle_hard_pot:v1|0": 0, - "custom|aimo_kaggle_hard_pot:v2|0": 0 - }, - "config_tasks": { - "custom|aimo_kaggle_hard_pot:v0": { - "name": "aimo_kaggle_hard_pot:v0", - "prompt_function": "kaggle_hard_pot_prompt_fn_v0", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_hard_pot:v1": { - "name": "aimo_kaggle_hard_pot:v1", - "prompt_function": "kaggle_hard_pot_prompt_fn_v1", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_hard_pot:v2": { - "name": "aimo_kaggle_hard_pot:v2", - "prompt_function": "kaggle_hard_pot_prompt_fn_v2", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - } - }, - "summary_tasks": { - "custom|aimo_kaggle_hard_pot:v0|0": { - "hashes": { - "hash_examples": "303213a38d9f7512", - "hash_full_prompts": "3c670fa0e80bc301", - "hash_input_tokens": "1bd4187ad6963415", - "hash_cont_tokens": "0186ae187c82cfda" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 37, - "non_padded": 13, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_hard_pot:v1|0": { - "hashes": { - "hash_examples": "e4234b97ad92862f", - "hash_full_prompts": "09e7759a96f64e59", - "hash_input_tokens": "dc7f68cf14be3d61", - "hash_cont_tokens": "02a942c5f2c8ac72" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 32, - "non_padded": 18, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_hard_pot:v2|0": { - "hashes": { - "hash_examples": "6396eb8833e13ba0", - "hash_full_prompts": "65f8b95ea99c7087", - "hash_input_tokens": "bddb67459e7601c5", - "hash_cont_tokens": "1873ddb5f2ebd3f3" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 30, - "non_padded": 20, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "648c9a107d279e1e", - "hash_full_prompts": "7f4c38eb08b3bb41", - "hash_input_tokens": "16f0fd76e0a2a165", - "hash_cont_tokens": "de704170b685bdce" - }, - "truncated": 150, - "non_truncated": 0, - "padded": 99, - "non_padded": 51, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.25/aimo_kaggle_medium_pot/results_2024-05-27T23-08-31.143898.json b/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.25/aimo_kaggle_medium_pot/results_2024-05-27T23-08-31.143898.json deleted file mode 100644 index 8be1906aa87f9f8230ea46c49d25d18e3bca45e6..0000000000000000000000000000000000000000 --- a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.25/aimo_kaggle_medium_pot/results_2024-05-27T23-08-31.143898.json +++ /dev/null @@ -1,193 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": 4, - "max_samples": null, - "job_id": "", - "start_time": 44321.869415699, - "end_time": 44731.010502157, - "total_evaluation_time_secondes": "409.1410864579957", - "model_name": "AI-MO/deepseek-math-7b-rl-sft", - "model_sha": "da5e99bc709858f093b72041c04224cdcc0c9b4f", - "model_dtype": "torch.bfloat16", - "model_size": "12.93 GB", - "config": null - }, - "results": { - "custom|aimo_kaggle_medium_pot:v0|0": { - "qem": 0.125, - "qem_stderr": 0.05295740910852021 - }, - "custom|aimo_kaggle_medium_pot:v1|0": { - "qem": 0.175, - "qem_stderr": 0.060843430844447585 - }, - "custom|aimo_kaggle_medium_pot:v2|0": { - "qem": 0.1, - "qem_stderr": 0.04803844614152613 - }, - "custom|aimo_kaggle_medium_pot:_average|0": { - "qem": 0.13333333333333333, - "qem_stderr": 0.05394642869816465 - }, - "all": { - "qem": 0.13333333333333333, - "qem_stderr": 0.05394642869816465 - } - }, - "versions": { - "custom|aimo_kaggle_medium_pot:v0|0": 0, - "custom|aimo_kaggle_medium_pot:v1|0": 0, - "custom|aimo_kaggle_medium_pot:v2|0": 0 - }, - "config_tasks": { - "custom|aimo_kaggle_medium_pot:v0": { - "name": "aimo_kaggle_medium_pot:v0", - "prompt_function": "kaggle_medium_pot_prompt_fn_v0", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_medium_pot:v1": { - "name": "aimo_kaggle_medium_pot:v1", - "prompt_function": "kaggle_medium_pot_prompt_fn_v1", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_medium_pot:v2": { - "name": "aimo_kaggle_medium_pot:v2", - "prompt_function": "kaggle_medium_pot_prompt_fn_v2", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - } - }, - "summary_tasks": { - "custom|aimo_kaggle_medium_pot:v0|0": { - "hashes": { - "hash_examples": "2799c24461029dc3", - "hash_full_prompts": "de6572b914d5bba6", - "hash_input_tokens": "5af82ee284ccce29", - "hash_cont_tokens": "8df0a52674e6e021" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 29, - "non_padded": 11, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_medium_pot:v1|0": { - "hashes": { - "hash_examples": "806b2e2056b41f84", - "hash_full_prompts": "427d5de6e4d90df2", - "hash_input_tokens": "be3dcf9f8a2350d0", - "hash_cont_tokens": "189c36a7bd608cd2" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 26, - "non_padded": 14, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_medium_pot:v2|0": { - "hashes": { - "hash_examples": "d8534375acc5d427", - "hash_full_prompts": "6bb90c129d6d6123", - "hash_input_tokens": "22eeaedbeb3f56f0", - "hash_cont_tokens": "adcc693c491edf37" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 31, - "non_padded": 9, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "623505a45a4910c2", - "hash_full_prompts": "3aca0fad61a42921", - "hash_input_tokens": "c7d2c8bb76f8ecb1", - "hash_cont_tokens": "1f56336cc22f00fe" - }, - "truncated": 120, - "non_truncated": 0, - "padded": 86, - "non_padded": 34, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.26/aimo_kaggle_hard_pot/results_2024-05-27T23-12-35.390619.json b/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.26/aimo_kaggle_hard_pot/results_2024-05-27T23-12-35.390619.json deleted file mode 100644 index 7d5220a60087129c79cd18eaeff2ba7f2668c1be..0000000000000000000000000000000000000000 --- a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.26/aimo_kaggle_hard_pot/results_2024-05-27T23-12-35.390619.json +++ /dev/null @@ -1,193 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": 4, - "max_samples": null, - "job_id": "", - "start_time": 357.745513553, - "end_time": 998.564658689, - "total_evaluation_time_secondes": "640.819145136", - "model_name": "AI-MO/deepseek-math-7b-rl-sft", - "model_sha": "b0d03c59db7dd54d68a3530a0be7a4657bde3e05", - "model_dtype": "torch.bfloat16", - "model_size": "12.93 GB", - "config": null - }, - "results": { - "custom|aimo_kaggle_hard_pot:v0|0": { - "qem": 0.04, - "qem_stderr": 0.027994168488950606 - }, - "custom|aimo_kaggle_hard_pot:v1|0": { - "qem": 0.06, - "qem_stderr": 0.0339266916772512 - }, - "custom|aimo_kaggle_hard_pot:v2|0": { - "qem": 0.36, - "qem_stderr": 0.06857142857142856 - }, - "custom|aimo_kaggle_hard_pot:_average|0": { - "qem": 0.15333333333333332, - "qem_stderr": 0.04349742957921012 - }, - "all": { - "qem": 0.15333333333333332, - "qem_stderr": 0.04349742957921012 - } - }, - "versions": { - "custom|aimo_kaggle_hard_pot:v0|0": 0, - "custom|aimo_kaggle_hard_pot:v1|0": 0, - "custom|aimo_kaggle_hard_pot:v2|0": 0 - }, - "config_tasks": { - "custom|aimo_kaggle_hard_pot:v0": { - "name": "aimo_kaggle_hard_pot:v0", - "prompt_function": "kaggle_hard_pot_prompt_fn_v0", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_hard_pot:v1": { - "name": "aimo_kaggle_hard_pot:v1", - "prompt_function": "kaggle_hard_pot_prompt_fn_v1", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_hard_pot:v2": { - "name": "aimo_kaggle_hard_pot:v2", - "prompt_function": "kaggle_hard_pot_prompt_fn_v2", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - } - }, - "summary_tasks": { - "custom|aimo_kaggle_hard_pot:v0|0": { - "hashes": { - "hash_examples": "303213a38d9f7512", - "hash_full_prompts": "3c670fa0e80bc301", - "hash_input_tokens": "1bd4187ad6963415", - "hash_cont_tokens": "887946e13bef0181" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 37, - "non_padded": 13, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_hard_pot:v1|0": { - "hashes": { - "hash_examples": "e4234b97ad92862f", - "hash_full_prompts": "09e7759a96f64e59", - "hash_input_tokens": "dc7f68cf14be3d61", - "hash_cont_tokens": "2db8a014eb1a235c" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 32, - "non_padded": 18, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_hard_pot:v2|0": { - "hashes": { - "hash_examples": "6396eb8833e13ba0", - "hash_full_prompts": "65f8b95ea99c7087", - "hash_input_tokens": "bddb67459e7601c5", - "hash_cont_tokens": "de976e1ecdaf18a1" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 30, - "non_padded": 20, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "648c9a107d279e1e", - "hash_full_prompts": "7f4c38eb08b3bb41", - "hash_input_tokens": "16f0fd76e0a2a165", - "hash_cont_tokens": "e6362b8300f1421c" - }, - "truncated": 150, - "non_truncated": 0, - "padded": 99, - "non_padded": 51, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.26/aimo_kaggle_medium_pot/results_2024-05-27T23-08-02.054042.json b/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.26/aimo_kaggle_medium_pot/results_2024-05-27T23-08-02.054042.json deleted file mode 100644 index f4e2ea5623ac17f0b8e395320674ba38268a33ac..0000000000000000000000000000000000000000 --- a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.26/aimo_kaggle_medium_pot/results_2024-05-27T23-08-02.054042.json +++ /dev/null @@ -1,193 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": 4, - "max_samples": null, - "job_id": "", - "start_time": 108280.51491864, - "end_time": 108660.985004205, - "total_evaluation_time_secondes": "380.4700855650008", - "model_name": "AI-MO/deepseek-math-7b-rl-sft", - "model_sha": "b0d03c59db7dd54d68a3530a0be7a4657bde3e05", - "model_dtype": "torch.bfloat16", - "model_size": "12.93 GB", - "config": null - }, - "results": { - "custom|aimo_kaggle_medium_pot:v0|0": { - "qem": 0.025, - "qem_stderr": 0.024999999999999998 - }, - "custom|aimo_kaggle_medium_pot:v1|0": { - "qem": 0.0, - "qem_stderr": 0.0 - }, - "custom|aimo_kaggle_medium_pot:v2|0": { - "qem": 0.075, - "qem_stderr": 0.04217636961434867 - }, - "custom|aimo_kaggle_medium_pot:_average|0": { - "qem": 0.03333333333333333, - "qem_stderr": 0.02239212320478289 - }, - "all": { - "qem": 0.03333333333333333, - "qem_stderr": 0.02239212320478289 - } - }, - "versions": { - "custom|aimo_kaggle_medium_pot:v0|0": 0, - "custom|aimo_kaggle_medium_pot:v1|0": 0, - "custom|aimo_kaggle_medium_pot:v2|0": 0 - }, - "config_tasks": { - "custom|aimo_kaggle_medium_pot:v0": { - "name": "aimo_kaggle_medium_pot:v0", - "prompt_function": "kaggle_medium_pot_prompt_fn_v0", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_medium_pot:v1": { - "name": "aimo_kaggle_medium_pot:v1", - "prompt_function": "kaggle_medium_pot_prompt_fn_v1", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_medium_pot:v2": { - "name": "aimo_kaggle_medium_pot:v2", - "prompt_function": "kaggle_medium_pot_prompt_fn_v2", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - } - }, - "summary_tasks": { - "custom|aimo_kaggle_medium_pot:v0|0": { - "hashes": { - "hash_examples": "2799c24461029dc3", - "hash_full_prompts": "de6572b914d5bba6", - "hash_input_tokens": "5af82ee284ccce29", - "hash_cont_tokens": "63cb77e04a5513e9" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 29, - "non_padded": 11, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_medium_pot:v1|0": { - "hashes": { - "hash_examples": "806b2e2056b41f84", - "hash_full_prompts": "427d5de6e4d90df2", - "hash_input_tokens": "be3dcf9f8a2350d0", - "hash_cont_tokens": "8a60a479dfe4052d" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 26, - "non_padded": 14, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_medium_pot:v2|0": { - "hashes": { - "hash_examples": "d8534375acc5d427", - "hash_full_prompts": "6bb90c129d6d6123", - "hash_input_tokens": "22eeaedbeb3f56f0", - "hash_cont_tokens": "d46223f78025db04" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 31, - "non_padded": 9, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "623505a45a4910c2", - "hash_full_prompts": "3aca0fad61a42921", - "hash_input_tokens": "c7d2c8bb76f8ecb1", - "hash_cont_tokens": "fd8544b754908048" - }, - "truncated": 120, - "non_truncated": 0, - "padded": 86, - "non_padded": 34, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.27/aimo_kaggle_hard_pot/results_2024-05-27T23-19-28.994875.json b/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.27/aimo_kaggle_hard_pot/results_2024-05-27T23-19-28.994875.json deleted file mode 100644 index e75ecf617f6ea49d0da8318b77b06593fd53e438..0000000000000000000000000000000000000000 --- a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.27/aimo_kaggle_hard_pot/results_2024-05-27T23-19-28.994875.json +++ /dev/null @@ -1,193 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": 4, - "max_samples": null, - "job_id": "", - "start_time": 14168.221169878, - "end_time": 14844.852405423, - "total_evaluation_time_secondes": "676.6312355450009", - "model_name": "AI-MO/deepseek-math-7b-rl-sft", - "model_sha": "ee065fd41a7443f27111d8250276086b0f7ee211", - "model_dtype": "torch.bfloat16", - "model_size": "12.93 GB", - "config": null - }, - "results": { - "custom|aimo_kaggle_hard_pot:v0|0": { - "qem": 0.04, - "qem_stderr": 0.02799416848895062 - }, - "custom|aimo_kaggle_hard_pot:v1|0": { - "qem": 0.22, - "qem_stderr": 0.05917804336345138 - }, - "custom|aimo_kaggle_hard_pot:v2|0": { - "qem": 0.48, - "qem_stderr": 0.0713714056959817 - }, - "custom|aimo_kaggle_hard_pot:_average|0": { - "qem": 0.24666666666666667, - "qem_stderr": 0.0528478725161279 - }, - "all": { - "qem": 0.24666666666666667, - "qem_stderr": 0.0528478725161279 - } - }, - "versions": { - "custom|aimo_kaggle_hard_pot:v0|0": 0, - "custom|aimo_kaggle_hard_pot:v1|0": 0, - "custom|aimo_kaggle_hard_pot:v2|0": 0 - }, - "config_tasks": { - "custom|aimo_kaggle_hard_pot:v0": { - "name": "aimo_kaggle_hard_pot:v0", - "prompt_function": "kaggle_hard_pot_prompt_fn_v0", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_hard_pot:v1": { - "name": "aimo_kaggle_hard_pot:v1", - "prompt_function": "kaggle_hard_pot_prompt_fn_v1", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_hard_pot:v2": { - "name": "aimo_kaggle_hard_pot:v2", - "prompt_function": "kaggle_hard_pot_prompt_fn_v2", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - } - }, - "summary_tasks": { - "custom|aimo_kaggle_hard_pot:v0|0": { - "hashes": { - "hash_examples": "303213a38d9f7512", - "hash_full_prompts": "3c670fa0e80bc301", - "hash_input_tokens": "1bd4187ad6963415", - "hash_cont_tokens": "a907b37175c77148" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 37, - "non_padded": 13, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_hard_pot:v1|0": { - "hashes": { - "hash_examples": "e4234b97ad92862f", - "hash_full_prompts": "09e7759a96f64e59", - "hash_input_tokens": "dc7f68cf14be3d61", - "hash_cont_tokens": "d90bb64d1c661009" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 32, - "non_padded": 18, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_hard_pot:v2|0": { - "hashes": { - "hash_examples": "6396eb8833e13ba0", - "hash_full_prompts": "65f8b95ea99c7087", - "hash_input_tokens": "bddb67459e7601c5", - "hash_cont_tokens": "5f85e17c73179ec5" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 30, - "non_padded": 20, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "648c9a107d279e1e", - "hash_full_prompts": "7f4c38eb08b3bb41", - "hash_input_tokens": "16f0fd76e0a2a165", - "hash_cont_tokens": "0ccacd45501c0f00" - }, - "truncated": 150, - "non_truncated": 0, - "padded": 99, - "non_padded": 51, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.27/aimo_kaggle_medium_pot/results_2024-05-27T23-15-00.970063.json b/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.27/aimo_kaggle_medium_pot/results_2024-05-27T23-15-00.970063.json deleted file mode 100644 index 763236db798f676ec430223c4b1b39afdaa34df4..0000000000000000000000000000000000000000 --- a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.27/aimo_kaggle_medium_pot/results_2024-05-27T23-15-00.970063.json +++ /dev/null @@ -1,193 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": 4, - "max_samples": null, - "job_id": "", - "start_time": 108724.255171356, - "end_time": 109079.901682823, - "total_evaluation_time_secondes": "355.6465114669991", - "model_name": "AI-MO/deepseek-math-7b-rl-sft", - "model_sha": "ee065fd41a7443f27111d8250276086b0f7ee211", - "model_dtype": "torch.bfloat16", - "model_size": "12.93 GB", - "config": null - }, - "results": { - "custom|aimo_kaggle_medium_pot:v0|0": { - "qem": 0.025, - "qem_stderr": 0.024999999999999994 - }, - "custom|aimo_kaggle_medium_pot:v1|0": { - "qem": 0.075, - "qem_stderr": 0.04217636961434867 - }, - "custom|aimo_kaggle_medium_pot:v2|0": { - "qem": 0.05, - "qem_stderr": 0.03489912202260563 - }, - "custom|aimo_kaggle_medium_pot:_average|0": { - "qem": 0.05000000000000001, - "qem_stderr": 0.034025163878984764 - }, - "all": { - "qem": 0.05000000000000001, - "qem_stderr": 0.034025163878984764 - } - }, - "versions": { - "custom|aimo_kaggle_medium_pot:v0|0": 0, - "custom|aimo_kaggle_medium_pot:v1|0": 0, - "custom|aimo_kaggle_medium_pot:v2|0": 0 - }, - "config_tasks": { - "custom|aimo_kaggle_medium_pot:v0": { - "name": "aimo_kaggle_medium_pot:v0", - "prompt_function": "kaggle_medium_pot_prompt_fn_v0", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_medium_pot:v1": { - "name": "aimo_kaggle_medium_pot:v1", - "prompt_function": "kaggle_medium_pot_prompt_fn_v1", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_medium_pot:v2": { - "name": "aimo_kaggle_medium_pot:v2", - "prompt_function": "kaggle_medium_pot_prompt_fn_v2", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - } - }, - "summary_tasks": { - "custom|aimo_kaggle_medium_pot:v0|0": { - "hashes": { - "hash_examples": "2799c24461029dc3", - "hash_full_prompts": "de6572b914d5bba6", - "hash_input_tokens": "5af82ee284ccce29", - "hash_cont_tokens": "c194a5aeb5cd15b0" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 29, - "non_padded": 11, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_medium_pot:v1|0": { - "hashes": { - "hash_examples": "806b2e2056b41f84", - "hash_full_prompts": "427d5de6e4d90df2", - "hash_input_tokens": "be3dcf9f8a2350d0", - "hash_cont_tokens": "9c99f6b8687ed6ff" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 26, - "non_padded": 14, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_medium_pot:v2|0": { - "hashes": { - "hash_examples": "d8534375acc5d427", - "hash_full_prompts": "6bb90c129d6d6123", - "hash_input_tokens": "22eeaedbeb3f56f0", - "hash_cont_tokens": "c6702d79c128a4b7" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 31, - "non_padded": 9, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "623505a45a4910c2", - "hash_full_prompts": "3aca0fad61a42921", - "hash_input_tokens": "c7d2c8bb76f8ecb1", - "hash_cont_tokens": "e404c1f4504c5de1" - }, - "truncated": 120, - "non_truncated": 0, - "padded": 86, - "non_padded": 34, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.28/aimo_kaggle_hard_pot/results_2024-05-27T23-14-57.845464.json b/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.28/aimo_kaggle_hard_pot/results_2024-05-27T23-14-57.845464.json deleted file mode 100644 index f5c11dd538cd0b38bdaef842e20039d1f24c55ce..0000000000000000000000000000000000000000 --- a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.28/aimo_kaggle_hard_pot/results_2024-05-27T23-14-57.845464.json +++ /dev/null @@ -1,193 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": 4, - "max_samples": null, - "job_id": "", - "start_time": 42602.218306077, - "end_time": 43308.861813697, - "total_evaluation_time_secondes": "706.6435076200069", - "model_name": "AI-MO/deepseek-math-7b-rl-sft", - "model_sha": "0766bc581672537e284670604dede2c4cdd5e736", - "model_dtype": "torch.bfloat16", - "model_size": "12.93 GB", - "config": null - }, - "results": { - "custom|aimo_kaggle_hard_pot:v0|0": { - "qem": 0.08, - "qem_stderr": 0.03875617133214439 - }, - "custom|aimo_kaggle_hard_pot:v1|0": { - "qem": 0.34, - "qem_stderr": 0.0676726816132972 - }, - "custom|aimo_kaggle_hard_pot:v2|0": { - "qem": 0.46, - "qem_stderr": 0.07119963311072637 - }, - "custom|aimo_kaggle_hard_pot:_average|0": { - "qem": 0.2933333333333334, - "qem_stderr": 0.05920949535205599 - }, - "all": { - "qem": 0.2933333333333334, - "qem_stderr": 0.05920949535205599 - } - }, - "versions": { - "custom|aimo_kaggle_hard_pot:v0|0": 0, - "custom|aimo_kaggle_hard_pot:v1|0": 0, - "custom|aimo_kaggle_hard_pot:v2|0": 0 - }, - "config_tasks": { - "custom|aimo_kaggle_hard_pot:v0": { - "name": "aimo_kaggle_hard_pot:v0", - "prompt_function": "kaggle_hard_pot_prompt_fn_v0", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_hard_pot:v1": { - "name": "aimo_kaggle_hard_pot:v1", - "prompt_function": "kaggle_hard_pot_prompt_fn_v1", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_hard_pot:v2": { - "name": "aimo_kaggle_hard_pot:v2", - "prompt_function": "kaggle_hard_pot_prompt_fn_v2", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - } - }, - "summary_tasks": { - "custom|aimo_kaggle_hard_pot:v0|0": { - "hashes": { - "hash_examples": "303213a38d9f7512", - "hash_full_prompts": "3c670fa0e80bc301", - "hash_input_tokens": "1bd4187ad6963415", - "hash_cont_tokens": "fa697ce9faaed6f5" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 37, - "non_padded": 13, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_hard_pot:v1|0": { - "hashes": { - "hash_examples": "e4234b97ad92862f", - "hash_full_prompts": "09e7759a96f64e59", - "hash_input_tokens": "dc7f68cf14be3d61", - "hash_cont_tokens": "cbf56cff582f92f7" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 32, - "non_padded": 18, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_hard_pot:v2|0": { - "hashes": { - "hash_examples": "6396eb8833e13ba0", - "hash_full_prompts": "65f8b95ea99c7087", - "hash_input_tokens": "bddb67459e7601c5", - "hash_cont_tokens": "d7681484f89c400c" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 30, - "non_padded": 20, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "648c9a107d279e1e", - "hash_full_prompts": "7f4c38eb08b3bb41", - "hash_input_tokens": "16f0fd76e0a2a165", - "hash_cont_tokens": "0f8e32bf1858b777" - }, - "truncated": 150, - "non_truncated": 0, - "padded": 99, - "non_padded": 51, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.28/aimo_kaggle_medium_pot/results_2024-05-27T23-12-23.523444.json b/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.28/aimo_kaggle_medium_pot/results_2024-05-27T23-12-23.523444.json deleted file mode 100644 index 008f90dd01dffc3c19d971c3f212ef455cdb5589..0000000000000000000000000000000000000000 --- a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.28/aimo_kaggle_medium_pot/results_2024-05-27T23-12-23.523444.json +++ /dev/null @@ -1,193 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": 4, - "max_samples": null, - "job_id": "", - "start_time": 19347.722007431, - "end_time": 19755.773906724, - "total_evaluation_time_secondes": "408.05189929300104", - "model_name": "AI-MO/deepseek-math-7b-rl-sft", - "model_sha": "0766bc581672537e284670604dede2c4cdd5e736", - "model_dtype": "torch.bfloat16", - "model_size": "12.93 GB", - "config": null - }, - "results": { - "custom|aimo_kaggle_medium_pot:v0|0": { - "qem": 0.175, - "qem_stderr": 0.060843430844447585 - }, - "custom|aimo_kaggle_medium_pot:v1|0": { - "qem": 0.1, - "qem_stderr": 0.04803844614152612 - }, - "custom|aimo_kaggle_medium_pot:v2|0": { - "qem": 0.25, - "qem_stderr": 0.06933752452815363 - }, - "custom|aimo_kaggle_medium_pot:_average|0": { - "qem": 0.17500000000000002, - "qem_stderr": 0.059406467171375786 - }, - "all": { - "qem": 0.17500000000000002, - "qem_stderr": 0.059406467171375786 - } - }, - "versions": { - "custom|aimo_kaggle_medium_pot:v0|0": 0, - "custom|aimo_kaggle_medium_pot:v1|0": 0, - "custom|aimo_kaggle_medium_pot:v2|0": 0 - }, - "config_tasks": { - "custom|aimo_kaggle_medium_pot:v0": { - "name": "aimo_kaggle_medium_pot:v0", - "prompt_function": "kaggle_medium_pot_prompt_fn_v0", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_medium_pot:v1": { - "name": "aimo_kaggle_medium_pot:v1", - "prompt_function": "kaggle_medium_pot_prompt_fn_v1", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_medium_pot:v2": { - "name": "aimo_kaggle_medium_pot:v2", - "prompt_function": "kaggle_medium_pot_prompt_fn_v2", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - } - }, - "summary_tasks": { - "custom|aimo_kaggle_medium_pot:v0|0": { - "hashes": { - "hash_examples": "2799c24461029dc3", - "hash_full_prompts": "de6572b914d5bba6", - "hash_input_tokens": "5af82ee284ccce29", - "hash_cont_tokens": "7bea08cd8a604137" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 29, - "non_padded": 11, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_medium_pot:v1|0": { - "hashes": { - "hash_examples": "806b2e2056b41f84", - "hash_full_prompts": "427d5de6e4d90df2", - "hash_input_tokens": "be3dcf9f8a2350d0", - "hash_cont_tokens": "f7ca149e8d0ae304" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 26, - "non_padded": 14, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_medium_pot:v2|0": { - "hashes": { - "hash_examples": "d8534375acc5d427", - "hash_full_prompts": "6bb90c129d6d6123", - "hash_input_tokens": "22eeaedbeb3f56f0", - "hash_cont_tokens": "32e2c81b132898ef" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 31, - "non_padded": 9, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "623505a45a4910c2", - "hash_full_prompts": "3aca0fad61a42921", - "hash_input_tokens": "c7d2c8bb76f8ecb1", - "hash_cont_tokens": "a1eb49d9309eda80" - }, - "truncated": 120, - "non_truncated": 0, - "padded": 86, - "non_padded": 34, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.29/aimo_kaggle_hard_pot/results_2024-05-27T23-31-51.594751.json b/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.29/aimo_kaggle_hard_pot/results_2024-05-27T23-31-51.594751.json deleted file mode 100644 index 2c8866fccb929da2a530eeaaadeb19b2d7996016..0000000000000000000000000000000000000000 --- a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.29/aimo_kaggle_hard_pot/results_2024-05-27T23-31-51.594751.json +++ /dev/null @@ -1,193 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": 4, - "max_samples": null, - "job_id": "", - "start_time": 14915.190182791, - "end_time": 15587.452266547, - "total_evaluation_time_secondes": "672.2620837559989", - "model_name": "AI-MO/deepseek-math-7b-rl-sft", - "model_sha": "f1214898840073676ea55543845d6a8b4b023540", - "model_dtype": "torch.bfloat16", - "model_size": "12.93 GB", - "config": null - }, - "results": { - "custom|aimo_kaggle_hard_pot:v0|0": { - "qem": 0.1, - "qem_stderr": 0.04285714285714283 - }, - "custom|aimo_kaggle_hard_pot:v1|0": { - "qem": 0.24, - "qem_stderr": 0.06101187572589322 - }, - "custom|aimo_kaggle_hard_pot:v2|0": { - "qem": 0.5, - "qem_stderr": 0.07142857142857142 - }, - "custom|aimo_kaggle_hard_pot:_average|0": { - "qem": 0.27999999999999997, - "qem_stderr": 0.05843253000386916 - }, - "all": { - "qem": 0.27999999999999997, - "qem_stderr": 0.05843253000386916 - } - }, - "versions": { - "custom|aimo_kaggle_hard_pot:v0|0": 0, - "custom|aimo_kaggle_hard_pot:v1|0": 0, - "custom|aimo_kaggle_hard_pot:v2|0": 0 - }, - "config_tasks": { - "custom|aimo_kaggle_hard_pot:v0": { - "name": "aimo_kaggle_hard_pot:v0", - "prompt_function": "kaggle_hard_pot_prompt_fn_v0", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_hard_pot:v1": { - "name": "aimo_kaggle_hard_pot:v1", - "prompt_function": "kaggle_hard_pot_prompt_fn_v1", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_hard_pot:v2": { - "name": "aimo_kaggle_hard_pot:v2", - "prompt_function": "kaggle_hard_pot_prompt_fn_v2", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - } - }, - "summary_tasks": { - "custom|aimo_kaggle_hard_pot:v0|0": { - "hashes": { - "hash_examples": "303213a38d9f7512", - "hash_full_prompts": "3c670fa0e80bc301", - "hash_input_tokens": "1bd4187ad6963415", - "hash_cont_tokens": "02a8a0633684a7fd" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 37, - "non_padded": 13, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_hard_pot:v1|0": { - "hashes": { - "hash_examples": "e4234b97ad92862f", - "hash_full_prompts": "09e7759a96f64e59", - "hash_input_tokens": "dc7f68cf14be3d61", - "hash_cont_tokens": "12aaba76f45e2e8b" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 32, - "non_padded": 18, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_hard_pot:v2|0": { - "hashes": { - "hash_examples": "6396eb8833e13ba0", - "hash_full_prompts": "65f8b95ea99c7087", - "hash_input_tokens": "bddb67459e7601c5", - "hash_cont_tokens": "f6daabeffefca86a" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 30, - "non_padded": 20, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "648c9a107d279e1e", - "hash_full_prompts": "7f4c38eb08b3bb41", - "hash_input_tokens": "16f0fd76e0a2a165", - "hash_cont_tokens": "7826fa0d8e510b0b" - }, - "truncated": 150, - "non_truncated": 0, - "padded": 99, - "non_padded": 51, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.29/aimo_kaggle_medium_pot/results_2024-05-27T23-28-41.744997.json b/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.29/aimo_kaggle_medium_pot/results_2024-05-27T23-28-41.744997.json deleted file mode 100644 index 99151b0e83ce39d1dbab78ddc40232f092571b9a..0000000000000000000000000000000000000000 --- a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.29/aimo_kaggle_medium_pot/results_2024-05-27T23-28-41.744997.json +++ /dev/null @@ -1,193 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": 4, - "max_samples": null, - "job_id": "", - "start_time": 16837.34058526, - "end_time": 17202.689626427, - "total_evaluation_time_secondes": "365.3490411669991", - "model_name": "AI-MO/deepseek-math-7b-rl-sft", - "model_sha": "f1214898840073676ea55543845d6a8b4b023540", - "model_dtype": "torch.bfloat16", - "model_size": "12.93 GB", - "config": null - }, - "results": { - "custom|aimo_kaggle_medium_pot:v0|0": { - "qem": 0.15, - "qem_stderr": 0.05717718748968655 - }, - "custom|aimo_kaggle_medium_pot:v1|0": { - "qem": 0.1, - "qem_stderr": 0.04803844614152612 - }, - "custom|aimo_kaggle_medium_pot:v2|0": { - "qem": 0.175, - "qem_stderr": 0.060843430844447585 - }, - "custom|aimo_kaggle_medium_pot:_average|0": { - "qem": 0.14166666666666666, - "qem_stderr": 0.055353021491886756 - }, - "all": { - "qem": 0.14166666666666666, - "qem_stderr": 0.055353021491886756 - } - }, - "versions": { - "custom|aimo_kaggle_medium_pot:v0|0": 0, - "custom|aimo_kaggle_medium_pot:v1|0": 0, - "custom|aimo_kaggle_medium_pot:v2|0": 0 - }, - "config_tasks": { - "custom|aimo_kaggle_medium_pot:v0": { - "name": "aimo_kaggle_medium_pot:v0", - "prompt_function": "kaggle_medium_pot_prompt_fn_v0", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_medium_pot:v1": { - "name": "aimo_kaggle_medium_pot:v1", - "prompt_function": "kaggle_medium_pot_prompt_fn_v1", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_medium_pot:v2": { - "name": "aimo_kaggle_medium_pot:v2", - "prompt_function": "kaggle_medium_pot_prompt_fn_v2", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - } - }, - "summary_tasks": { - "custom|aimo_kaggle_medium_pot:v0|0": { - "hashes": { - "hash_examples": "2799c24461029dc3", - "hash_full_prompts": "de6572b914d5bba6", - "hash_input_tokens": "5af82ee284ccce29", - "hash_cont_tokens": "84f2cb2204195e74" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 29, - "non_padded": 11, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_medium_pot:v1|0": { - "hashes": { - "hash_examples": "806b2e2056b41f84", - "hash_full_prompts": "427d5de6e4d90df2", - "hash_input_tokens": "be3dcf9f8a2350d0", - "hash_cont_tokens": "624c6a98d5ee47a6" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 26, - "non_padded": 14, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_medium_pot:v2|0": { - "hashes": { - "hash_examples": "d8534375acc5d427", - "hash_full_prompts": "6bb90c129d6d6123", - "hash_input_tokens": "22eeaedbeb3f56f0", - "hash_cont_tokens": "7ffd73322fdbb06c" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 31, - "non_padded": 9, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "623505a45a4910c2", - "hash_full_prompts": "3aca0fad61a42921", - "hash_input_tokens": "c7d2c8bb76f8ecb1", - "hash_cont_tokens": "dd1452adf37215d0" - }, - "truncated": 120, - "non_truncated": 0, - "padded": 86, - "non_padded": 34, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.30/aimo_kaggle_hard_pot/results_2024-05-27T23-30-31.887877.json b/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.30/aimo_kaggle_hard_pot/results_2024-05-27T23-30-31.887877.json deleted file mode 100644 index cbe6f1fdb1aefb4313fffec67c53c7ee2812199a..0000000000000000000000000000000000000000 --- a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.30/aimo_kaggle_hard_pot/results_2024-05-27T23-30-31.887877.json +++ /dev/null @@ -1,193 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": 4, - "max_samples": null, - "job_id": "", - "start_time": 2148719.20755836, - "end_time": 2149404.577634314, - "total_evaluation_time_secondes": "685.3700759541243", - "model_name": "AI-MO/deepseek-math-7b-rl-sft", - "model_sha": "0ebaa7d73082df6393d6be3d78965ddbf975ef1b", - "model_dtype": "torch.bfloat16", - "model_size": "12.93 GB", - "config": null - }, - "results": { - "custom|aimo_kaggle_hard_pot:v0|0": { - "qem": 0.02, - "qem_stderr": 0.02 - }, - "custom|aimo_kaggle_hard_pot:v1|0": { - "qem": 0.28, - "qem_stderr": 0.06414269805898186 - }, - "custom|aimo_kaggle_hard_pot:v2|0": { - "qem": 0.5, - "qem_stderr": 0.07142857142857142 - }, - "custom|aimo_kaggle_hard_pot:_average|0": { - "qem": 0.26666666666666666, - "qem_stderr": 0.05185708982918443 - }, - "all": { - "qem": 0.26666666666666666, - "qem_stderr": 0.05185708982918443 - } - }, - "versions": { - "custom|aimo_kaggle_hard_pot:v0|0": 0, - "custom|aimo_kaggle_hard_pot:v1|0": 0, - "custom|aimo_kaggle_hard_pot:v2|0": 0 - }, - "config_tasks": { - "custom|aimo_kaggle_hard_pot:v0": { - "name": "aimo_kaggle_hard_pot:v0", - "prompt_function": "kaggle_hard_pot_prompt_fn_v0", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_hard_pot:v1": { - "name": "aimo_kaggle_hard_pot:v1", - "prompt_function": "kaggle_hard_pot_prompt_fn_v1", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_hard_pot:v2": { - "name": "aimo_kaggle_hard_pot:v2", - "prompt_function": "kaggle_hard_pot_prompt_fn_v2", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - } - }, - "summary_tasks": { - "custom|aimo_kaggle_hard_pot:v0|0": { - "hashes": { - "hash_examples": "303213a38d9f7512", - "hash_full_prompts": "3c670fa0e80bc301", - "hash_input_tokens": "1bd4187ad6963415", - "hash_cont_tokens": "6f8681ae229ace40" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 37, - "non_padded": 13, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_hard_pot:v1|0": { - "hashes": { - "hash_examples": "e4234b97ad92862f", - "hash_full_prompts": "09e7759a96f64e59", - "hash_input_tokens": "dc7f68cf14be3d61", - "hash_cont_tokens": "109cf33a4720f79d" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 32, - "non_padded": 18, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_hard_pot:v2|0": { - "hashes": { - "hash_examples": "6396eb8833e13ba0", - "hash_full_prompts": "65f8b95ea99c7087", - "hash_input_tokens": "bddb67459e7601c5", - "hash_cont_tokens": "ea3c0f61a3ea5c9e" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 30, - "non_padded": 20, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "648c9a107d279e1e", - "hash_full_prompts": "7f4c38eb08b3bb41", - "hash_input_tokens": "16f0fd76e0a2a165", - "hash_cont_tokens": "1e4b14191b6ab0ca" - }, - "truncated": 150, - "non_truncated": 0, - "padded": 99, - "non_padded": 51, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.30/aimo_kaggle_medium_pot/results_2024-05-27T23-27-07.325452.json b/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.30/aimo_kaggle_medium_pot/results_2024-05-27T23-27-07.325452.json deleted file mode 100644 index 84260aea726a206f5daa3be5e8df1ead72cf71d6..0000000000000000000000000000000000000000 --- a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.30/aimo_kaggle_medium_pot/results_2024-05-27T23-27-07.325452.json +++ /dev/null @@ -1,193 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": 4, - "max_samples": null, - "job_id": "", - "start_time": 1479.79660182, - "end_time": 1870.499579863, - "total_evaluation_time_secondes": "390.70297804300003", - "model_name": "AI-MO/deepseek-math-7b-rl-sft", - "model_sha": "0ebaa7d73082df6393d6be3d78965ddbf975ef1b", - "model_dtype": "torch.bfloat16", - "model_size": "12.93 GB", - "config": null - }, - "results": { - "custom|aimo_kaggle_medium_pot:v0|0": { - "qem": 0.15, - "qem_stderr": 0.05717718748968655 - }, - "custom|aimo_kaggle_medium_pot:v1|0": { - "qem": 0.075, - "qem_stderr": 0.04217636961434867 - }, - "custom|aimo_kaggle_medium_pot:v2|0": { - "qem": 0.15, - "qem_stderr": 0.05717718748968655 - }, - "custom|aimo_kaggle_medium_pot:_average|0": { - "qem": 0.125, - "qem_stderr": 0.052176914864573924 - }, - "all": { - "qem": 0.125, - "qem_stderr": 0.052176914864573924 - } - }, - "versions": { - "custom|aimo_kaggle_medium_pot:v0|0": 0, - "custom|aimo_kaggle_medium_pot:v1|0": 0, - "custom|aimo_kaggle_medium_pot:v2|0": 0 - }, - "config_tasks": { - "custom|aimo_kaggle_medium_pot:v0": { - "name": "aimo_kaggle_medium_pot:v0", - "prompt_function": "kaggle_medium_pot_prompt_fn_v0", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_medium_pot:v1": { - "name": "aimo_kaggle_medium_pot:v1", - "prompt_function": "kaggle_medium_pot_prompt_fn_v1", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_medium_pot:v2": { - "name": "aimo_kaggle_medium_pot:v2", - "prompt_function": "kaggle_medium_pot_prompt_fn_v2", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - } - }, - "summary_tasks": { - "custom|aimo_kaggle_medium_pot:v0|0": { - "hashes": { - "hash_examples": "2799c24461029dc3", - "hash_full_prompts": "de6572b914d5bba6", - "hash_input_tokens": "5af82ee284ccce29", - "hash_cont_tokens": "9067c4adbe58e0af" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 29, - "non_padded": 11, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_medium_pot:v1|0": { - "hashes": { - "hash_examples": "806b2e2056b41f84", - "hash_full_prompts": "427d5de6e4d90df2", - "hash_input_tokens": "be3dcf9f8a2350d0", - "hash_cont_tokens": "60bb7a8467b01b4b" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 26, - "non_padded": 14, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_medium_pot:v2|0": { - "hashes": { - "hash_examples": "d8534375acc5d427", - "hash_full_prompts": "6bb90c129d6d6123", - "hash_input_tokens": "22eeaedbeb3f56f0", - "hash_cont_tokens": "e624d9cae8835960" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 31, - "non_padded": 9, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "623505a45a4910c2", - "hash_full_prompts": "3aca0fad61a42921", - "hash_input_tokens": "c7d2c8bb76f8ecb1", - "hash_cont_tokens": "a9f259ca44ffb300" - }, - "truncated": 120, - "non_truncated": 0, - "padded": 86, - "non_padded": 34, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.31/aimo_kaggle_hard_pot/results_2024-05-27T23-27-53.025134.json b/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.31/aimo_kaggle_hard_pot/results_2024-05-27T23-27-53.025134.json deleted file mode 100644 index 33989157806d711af1ea29edc811b9616a2a0945..0000000000000000000000000000000000000000 --- a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.31/aimo_kaggle_hard_pot/results_2024-05-27T23-27-53.025134.json +++ /dev/null @@ -1,193 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": 4, - "max_samples": null, - "job_id": "", - "start_time": 47046.619333823, - "end_time": 47747.879489815, - "total_evaluation_time_secondes": "701.2601559920004", - "model_name": "AI-MO/deepseek-math-7b-rl-sft", - "model_sha": "ff604d66837e06ca9e08144afce56bb8b9908af0", - "model_dtype": "torch.bfloat16", - "model_size": "12.93 GB", - "config": null - }, - "results": { - "custom|aimo_kaggle_hard_pot:v0|0": { - "qem": 0.04, - "qem_stderr": 0.02799416848895062 - }, - "custom|aimo_kaggle_hard_pot:v1|0": { - "qem": 0.16, - "qem_stderr": 0.05237229365663815 - }, - "custom|aimo_kaggle_hard_pot:v2|0": { - "qem": 0.44, - "qem_stderr": 0.07091242083423345 - }, - "custom|aimo_kaggle_hard_pot:_average|0": { - "qem": 0.21333333333333335, - "qem_stderr": 0.05042629432660741 - }, - "all": { - "qem": 0.21333333333333335, - "qem_stderr": 0.05042629432660741 - } - }, - "versions": { - "custom|aimo_kaggle_hard_pot:v0|0": 0, - "custom|aimo_kaggle_hard_pot:v1|0": 0, - "custom|aimo_kaggle_hard_pot:v2|0": 0 - }, - "config_tasks": { - "custom|aimo_kaggle_hard_pot:v0": { - "name": "aimo_kaggle_hard_pot:v0", - "prompt_function": "kaggle_hard_pot_prompt_fn_v0", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_hard_pot:v1": { - "name": "aimo_kaggle_hard_pot:v1", - "prompt_function": "kaggle_hard_pot_prompt_fn_v1", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_hard_pot:v2": { - "name": "aimo_kaggle_hard_pot:v2", - "prompt_function": "kaggle_hard_pot_prompt_fn_v2", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - } - }, - "summary_tasks": { - "custom|aimo_kaggle_hard_pot:v0|0": { - "hashes": { - "hash_examples": "303213a38d9f7512", - "hash_full_prompts": "3c670fa0e80bc301", - "hash_input_tokens": "1bd4187ad6963415", - "hash_cont_tokens": "5659a661082f075e" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 37, - "non_padded": 13, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_hard_pot:v1|0": { - "hashes": { - "hash_examples": "e4234b97ad92862f", - "hash_full_prompts": "09e7759a96f64e59", - "hash_input_tokens": "dc7f68cf14be3d61", - "hash_cont_tokens": "3e98ef35876d7a1d" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 32, - "non_padded": 18, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_hard_pot:v2|0": { - "hashes": { - "hash_examples": "6396eb8833e13ba0", - "hash_full_prompts": "65f8b95ea99c7087", - "hash_input_tokens": "bddb67459e7601c5", - "hash_cont_tokens": "76118b3f8db8d774" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 30, - "non_padded": 20, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "648c9a107d279e1e", - "hash_full_prompts": "7f4c38eb08b3bb41", - "hash_input_tokens": "16f0fd76e0a2a165", - "hash_cont_tokens": "ae61832c797d874f" - }, - "truncated": 150, - "non_truncated": 0, - "padded": 99, - "non_padded": 51, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.31/aimo_kaggle_medium_pot/results_2024-05-27T23-22-30.784350.json b/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.31/aimo_kaggle_medium_pot/results_2024-05-27T23-22-30.784350.json deleted file mode 100644 index d65a45343c6a0be4880a499a4888797cc79e7767..0000000000000000000000000000000000000000 --- a/eval_results/AI-MO/deepseek-math-7b-rl-sft/aimo_v02.31/aimo_kaggle_medium_pot/results_2024-05-27T23-22-30.784350.json +++ /dev/null @@ -1,193 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": 4, - "max_samples": null, - "job_id": "", - "start_time": 43382.704680433, - "end_time": 43761.800901726, - "total_evaluation_time_secondes": "379.0962212929953", - "model_name": "AI-MO/deepseek-math-7b-rl-sft", - "model_sha": "ff604d66837e06ca9e08144afce56bb8b9908af0", - "model_dtype": "torch.bfloat16", - "model_size": "12.93 GB", - "config": null - }, - "results": { - "custom|aimo_kaggle_medium_pot:v0|0": { - "qem": 0.0, - "qem_stderr": 0.0 - }, - "custom|aimo_kaggle_medium_pot:v1|0": { - "qem": 0.075, - "qem_stderr": 0.04217636961434869 - }, - "custom|aimo_kaggle_medium_pot:v2|0": { - "qem": 0.05, - "qem_stderr": 0.03489912202260563 - }, - "custom|aimo_kaggle_medium_pot:_average|0": { - "qem": 0.041666666666666664, - "qem_stderr": 0.02569183054565144 - }, - "all": { - "qem": 0.041666666666666664, - "qem_stderr": 0.02569183054565144 - } - }, - "versions": { - "custom|aimo_kaggle_medium_pot:v0|0": 0, - "custom|aimo_kaggle_medium_pot:v1|0": 0, - "custom|aimo_kaggle_medium_pot:v2|0": 0 - }, - "config_tasks": { - "custom|aimo_kaggle_medium_pot:v0": { - "name": "aimo_kaggle_medium_pot:v0", - "prompt_function": "kaggle_medium_pot_prompt_fn_v0", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_medium_pot:v1": { - "name": "aimo_kaggle_medium_pot:v1", - "prompt_function": "kaggle_medium_pot_prompt_fn_v1", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_medium_pot:v2": { - "name": "aimo_kaggle_medium_pot:v2", - "prompt_function": "kaggle_medium_pot_prompt_fn_v2", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - } - }, - "summary_tasks": { - "custom|aimo_kaggle_medium_pot:v0|0": { - "hashes": { - "hash_examples": "2799c24461029dc3", - "hash_full_prompts": "de6572b914d5bba6", - "hash_input_tokens": "5af82ee284ccce29", - "hash_cont_tokens": "735ff714ad41ce64" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 29, - "non_padded": 11, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_medium_pot:v1|0": { - "hashes": { - "hash_examples": "806b2e2056b41f84", - "hash_full_prompts": "427d5de6e4d90df2", - "hash_input_tokens": "be3dcf9f8a2350d0", - "hash_cont_tokens": "34569bc705ccfa93" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 26, - "non_padded": 14, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_medium_pot:v2|0": { - "hashes": { - "hash_examples": "d8534375acc5d427", - "hash_full_prompts": "6bb90c129d6d6123", - "hash_input_tokens": "22eeaedbeb3f56f0", - "hash_cont_tokens": "b97501805c03172f" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 31, - "non_padded": 9, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "623505a45a4910c2", - "hash_full_prompts": "3aca0fad61a42921", - "hash_input_tokens": "c7d2c8bb76f8ecb1", - "hash_cont_tokens": "f39c72bc38e52cd2" - }, - "truncated": 120, - "non_truncated": 0, - "padded": 86, - "non_padded": 34, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/AI-MO/deepseek-math-7b-sft/aimo_v20.20/aimo_kaggle_hard_pot/results_2024-05-25T11-37-18.262121.json b/eval_results/AI-MO/deepseek-math-7b-sft/aimo_v20.20/aimo_kaggle_hard_pot/results_2024-05-25T11-37-18.262121.json deleted file mode 100644 index 8053885eed13bbb45c86d4fd7c3002984cbe5eb8..0000000000000000000000000000000000000000 --- a/eval_results/AI-MO/deepseek-math-7b-sft/aimo_v20.20/aimo_kaggle_hard_pot/results_2024-05-25T11-37-18.262121.json +++ /dev/null @@ -1,89 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": 4, - "max_samples": null, - "job_id": "", - "start_time": 152290.881157689, - "end_time": 152722.068506006, - "total_evaluation_time_secondes": "431.18734831700567", - "model_name": "AI-MO/deepseek-math-7b-sft", - "model_sha": "9fa67f9d2f552fcf839660af12c5f7ddd0d7cf00", - "model_dtype": "torch.bfloat16", - "model_size": "12.93 GB", - "config": null - }, - "results": { - "custom|aimo_kaggle_hard_pot:v0|0": { - "qem": 0.04, - "qem_stderr": 0.027994168488950612 - }, - "all": { - "qem": 0.04, - "qem_stderr": 0.027994168488950612 - } - }, - "versions": { - "custom|aimo_kaggle_hard_pot:v0|0": 0 - }, - "config_tasks": { - "custom|aimo_kaggle_hard_pot:v0": { - "name": "aimo_kaggle_hard_pot:v0", - "prompt_function": "kaggle_hard_pot_prompt_fn", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null - } - }, - "summary_tasks": { - "custom|aimo_kaggle_hard_pot:v0|0": { - "hashes": { - "hash_examples": "303213a38d9f7512", - "hash_full_prompts": "bfd073e62f180246", - "hash_input_tokens": "64e5a16b7fd939f2", - "hash_cont_tokens": "d1e39db725a2bede" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 35, - "non_padded": 15, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "cae48890a7c47904", - "hash_full_prompts": "10d2787b122d820a", - "hash_input_tokens": "8968792b9607aaea", - "hash_cont_tokens": "0684970aa9d4d3cf" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 35, - "non_padded": 15, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/AI-MO/deepseek-math-7b-sft/aimo_v20.20/aimo_kaggle_hard_pot/results_2024-05-27T12-09-33.969518.json b/eval_results/AI-MO/deepseek-math-7b-sft/aimo_v20.20/aimo_kaggle_hard_pot/results_2024-05-27T12-09-33.969518.json deleted file mode 100644 index 109ac0a064326e978747b2d9672eafccfd34f872..0000000000000000000000000000000000000000 --- a/eval_results/AI-MO/deepseek-math-7b-sft/aimo_v20.20/aimo_kaggle_hard_pot/results_2024-05-27T12-09-33.969518.json +++ /dev/null @@ -1,91 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": 4, - "max_samples": null, - "job_id": "", - "start_time": 2104551.230804201, - "end_time": 2104949.915568033, - "total_evaluation_time_secondes": "398.6847638315521", - "model_name": "AI-MO/deepseek-math-7b-sft", - "model_sha": "9fa67f9d2f552fcf839660af12c5f7ddd0d7cf00", - "model_dtype": "torch.bfloat16", - "model_size": "12.93 GB", - "config": null - }, - "results": { - "custom|aimo_kaggle_hard_pot:v0|0": { - "qem": 0.04, - "qem_stderr": 0.027994168488950612 - }, - "all": { - "qem": 0.04, - "qem_stderr": 0.027994168488950612 - } - }, - "versions": { - "custom|aimo_kaggle_hard_pot:v0|0": 0 - }, - "config_tasks": { - "custom|aimo_kaggle_hard_pot:v0": { - "name": "aimo_kaggle_hard_pot:v0", - "prompt_function": "kaggle_hard_pot_prompt_fn_v0", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - } - }, - "summary_tasks": { - "custom|aimo_kaggle_hard_pot:v0|0": { - "hashes": { - "hash_examples": "303213a38d9f7512", - "hash_full_prompts": "bfd073e62f180246", - "hash_input_tokens": "64e5a16b7fd939f2", - "hash_cont_tokens": "d1e39db725a2bede" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 35, - "non_padded": 15, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "cae48890a7c47904", - "hash_full_prompts": "10d2787b122d820a", - "hash_input_tokens": "8968792b9607aaea", - "hash_cont_tokens": "0684970aa9d4d3cf" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 35, - "non_padded": 15, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/AI-MO/deepseek-math-7b-sft/aimo_v20.20/aimo_kaggle_hard_pot/results_2024-05-27T12-36-55.986020.json b/eval_results/AI-MO/deepseek-math-7b-sft/aimo_v20.20/aimo_kaggle_hard_pot/results_2024-05-27T12-36-55.986020.json deleted file mode 100644 index a920c49cfe8faf959390e6b882861bc0d536b229..0000000000000000000000000000000000000000 --- a/eval_results/AI-MO/deepseek-math-7b-sft/aimo_v20.20/aimo_kaggle_hard_pot/results_2024-05-27T12-36-55.986020.json +++ /dev/null @@ -1,91 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": 4, - "max_samples": null, - "job_id": "", - "start_time": 3547926.103414035, - "end_time": 3548332.741776327, - "total_evaluation_time_secondes": "406.6383622922003", - "model_name": "AI-MO/deepseek-math-7b-sft", - "model_sha": "9fa67f9d2f552fcf839660af12c5f7ddd0d7cf00", - "model_dtype": "torch.bfloat16", - "model_size": "12.93 GB", - "config": null - }, - "results": { - "custom|aimo_kaggle_hard_pot:v0|0": { - "qem": 0.04, - "qem_stderr": 0.027994168488950612 - }, - "all": { - "qem": 0.04, - "qem_stderr": 0.027994168488950612 - } - }, - "versions": { - "custom|aimo_kaggle_hard_pot:v0|0": 0 - }, - "config_tasks": { - "custom|aimo_kaggle_hard_pot:v0": { - "name": "aimo_kaggle_hard_pot:v0", - "prompt_function": "kaggle_hard_pot_prompt_fn_v0", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - } - }, - "summary_tasks": { - "custom|aimo_kaggle_hard_pot:v0|0": { - "hashes": { - "hash_examples": "303213a38d9f7512", - "hash_full_prompts": "bfd073e62f180246", - "hash_input_tokens": "64e5a16b7fd939f2", - "hash_cont_tokens": "d1e39db725a2bede" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 35, - "non_padded": 15, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "cae48890a7c47904", - "hash_full_prompts": "10d2787b122d820a", - "hash_input_tokens": "8968792b9607aaea", - "hash_cont_tokens": "0684970aa9d4d3cf" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 35, - "non_padded": 15, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/AI-MO/deepseek-math-7b-sft/aimo_v20.20/aimo_kaggle_hard_pot/results_2024-05-27T13-15-12.778072.json b/eval_results/AI-MO/deepseek-math-7b-sft/aimo_v20.20/aimo_kaggle_hard_pot/results_2024-05-27T13-15-12.778072.json deleted file mode 100644 index 3ba319e7763e76d5586fd89c7203aa3bceb5673c..0000000000000000000000000000000000000000 --- a/eval_results/AI-MO/deepseek-math-7b-sft/aimo_v20.20/aimo_kaggle_hard_pot/results_2024-05-27T13-15-12.778072.json +++ /dev/null @@ -1,193 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": 4, - "max_samples": null, - "job_id": "", - "start_time": 2513154.846120501, - "end_time": 2513892.786362526, - "total_evaluation_time_secondes": "737.9402420250699", - "model_name": "AI-MO/deepseek-math-7b-sft", - "model_sha": "9fa67f9d2f552fcf839660af12c5f7ddd0d7cf00", - "model_dtype": "torch.bfloat16", - "model_size": "12.93 GB", - "config": null - }, - "results": { - "custom|aimo_kaggle_hard_pot:v0|0": { - "qem": 0.06, - "qem_stderr": 0.033926691677251195 - }, - "custom|aimo_kaggle_hard_pot:v1|0": { - "qem": 0.5, - "qem_stderr": 0.07142857142857142 - }, - "custom|aimo_kaggle_hard_pot:v2|0": { - "qem": 0.48, - "qem_stderr": 0.0713714056959817 - }, - "custom|aimo_kaggle_hard_pot:_average|0": { - "qem": 0.3466666666666667, - "qem_stderr": 0.05890888960060144 - }, - "all": { - "qem": 0.3466666666666667, - "qem_stderr": 0.05890888960060144 - } - }, - "versions": { - "custom|aimo_kaggle_hard_pot:v0|0": 0, - "custom|aimo_kaggle_hard_pot:v1|0": 0, - "custom|aimo_kaggle_hard_pot:v2|0": 0 - }, - "config_tasks": { - "custom|aimo_kaggle_hard_pot:v0": { - "name": "aimo_kaggle_hard_pot:v0", - "prompt_function": "kaggle_hard_pot_prompt_fn_v0", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_hard_pot:v1": { - "name": "aimo_kaggle_hard_pot:v1", - "prompt_function": "kaggle_hard_pot_prompt_fn_v1", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_hard_pot:v2": { - "name": "aimo_kaggle_hard_pot:v2", - "prompt_function": "kaggle_hard_pot_prompt_fn_v2", - "hf_repo": "AI-MO/kaggle-validation-set-hard", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - } - }, - "summary_tasks": { - "custom|aimo_kaggle_hard_pot:v0|0": { - "hashes": { - "hash_examples": "303213a38d9f7512", - "hash_full_prompts": "bfd073e62f180246", - "hash_input_tokens": "a647c539b203fcea", - "hash_cont_tokens": "6caf60e1da3d5c18" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 37, - "non_padded": 13, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_hard_pot:v1|0": { - "hashes": { - "hash_examples": "e4234b97ad92862f", - "hash_full_prompts": "4747f0cd9a10355c", - "hash_input_tokens": "646c64e34f75a472", - "hash_cont_tokens": "e3cba0874e9917ff" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 32, - "non_padded": 18, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_hard_pot:v2|0": { - "hashes": { - "hash_examples": "6396eb8833e13ba0", - "hash_full_prompts": "0584a2707d3a5d56", - "hash_input_tokens": "e1aae06d528de511", - "hash_cont_tokens": "936407d9365e6d42" - }, - "truncated": 50, - "non_truncated": 0, - "padded": 30, - "non_padded": 20, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "648c9a107d279e1e", - "hash_full_prompts": "c13504bf8e62491b", - "hash_input_tokens": "753d23e4899a9f2d", - "hash_cont_tokens": "0598003096b8278a" - }, - "truncated": 150, - "non_truncated": 0, - "padded": 99, - "non_padded": 51, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/AI-MO/deepseek-math-7b-sft/aimo_v20.20/aimo_kaggle_medium_pot/results_2024-05-25T11-36-15.944044.json b/eval_results/AI-MO/deepseek-math-7b-sft/aimo_v20.20/aimo_kaggle_medium_pot/results_2024-05-25T11-36-15.944044.json deleted file mode 100644 index 14295769799c0ad18317f7f8cd03d0148010739c..0000000000000000000000000000000000000000 --- a/eval_results/AI-MO/deepseek-math-7b-sft/aimo_v20.20/aimo_kaggle_medium_pot/results_2024-05-25T11-36-15.944044.json +++ /dev/null @@ -1,89 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": 4, - "max_samples": null, - "job_id": "", - "start_time": 1384398.476599932, - "end_time": 1384759.309376844, - "total_evaluation_time_secondes": "360.8327769120224", - "model_name": "AI-MO/deepseek-math-7b-sft", - "model_sha": "9fa67f9d2f552fcf839660af12c5f7ddd0d7cf00", - "model_dtype": "torch.bfloat16", - "model_size": "12.93 GB", - "config": null - }, - "results": { - "custom|aimo_kaggle_medium_pot:v0|0": { - "qem": 0.15, - "qem_stderr": 0.05717718748968655 - }, - "all": { - "qem": 0.15, - "qem_stderr": 0.05717718748968655 - } - }, - "versions": { - "custom|aimo_kaggle_medium_pot:v0|0": 0 - }, - "config_tasks": { - "custom|aimo_kaggle_medium_pot:v0": { - "name": "aimo_kaggle_medium_pot:v0", - "prompt_function": "kaggle_medium_pot_prompt_fn", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null - } - }, - "summary_tasks": { - "custom|aimo_kaggle_medium_pot:v0|0": { - "hashes": { - "hash_examples": "2799c24461029dc3", - "hash_full_prompts": "2af864fbfc2e0a79", - "hash_input_tokens": "6d183a5316cf78dd", - "hash_cont_tokens": "16eacd06184a4f00" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 31, - "non_padded": 9, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "c72202b0a18ef5f9", - "hash_full_prompts": "28935f1035e9b20f", - "hash_input_tokens": "fc52828e51666a24", - "hash_cont_tokens": "baccad0da35ef263" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 31, - "non_padded": 9, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/AI-MO/deepseek-math-7b-sft/aimo_v20.20/aimo_kaggle_medium_pot/results_2024-05-27T12-07-53.902602.json b/eval_results/AI-MO/deepseek-math-7b-sft/aimo_v20.20/aimo_kaggle_medium_pot/results_2024-05-27T12-07-53.902602.json deleted file mode 100644 index c1965c189c136810a05cc9a57efcae5f9a0c2912..0000000000000000000000000000000000000000 --- a/eval_results/AI-MO/deepseek-math-7b-sft/aimo_v20.20/aimo_kaggle_medium_pot/results_2024-05-27T12-07-53.902602.json +++ /dev/null @@ -1,91 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": 4, - "max_samples": null, - "job_id": "", - "start_time": 18340.891879911, - "end_time": 18656.808616029, - "total_evaluation_time_secondes": "315.9167361180007", - "model_name": "AI-MO/deepseek-math-7b-sft", - "model_sha": "9fa67f9d2f552fcf839660af12c5f7ddd0d7cf00", - "model_dtype": "torch.bfloat16", - "model_size": "12.93 GB", - "config": null - }, - "results": { - "custom|aimo_kaggle_medium_pot:v0|0": { - "qem": 0.15, - "qem_stderr": 0.05717718748968655 - }, - "all": { - "qem": 0.15, - "qem_stderr": 0.05717718748968655 - } - }, - "versions": { - "custom|aimo_kaggle_medium_pot:v0|0": 0 - }, - "config_tasks": { - "custom|aimo_kaggle_medium_pot:v0": { - "name": "aimo_kaggle_medium_pot:v0", - "prompt_function": "kaggle_medium_pot_prompt_fn_v0", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - } - }, - "summary_tasks": { - "custom|aimo_kaggle_medium_pot:v0|0": { - "hashes": { - "hash_examples": "2799c24461029dc3", - "hash_full_prompts": "2af864fbfc2e0a79", - "hash_input_tokens": "6d183a5316cf78dd", - "hash_cont_tokens": "16eacd06184a4f00" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 31, - "non_padded": 9, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "c72202b0a18ef5f9", - "hash_full_prompts": "28935f1035e9b20f", - "hash_input_tokens": "fc52828e51666a24", - "hash_cont_tokens": "baccad0da35ef263" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 31, - "non_padded": 9, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/AI-MO/deepseek-math-7b-sft/aimo_v20.20/aimo_kaggle_medium_pot/results_2024-05-27T12-35-32.569923.json b/eval_results/AI-MO/deepseek-math-7b-sft/aimo_v20.20/aimo_kaggle_medium_pot/results_2024-05-27T12-35-32.569923.json deleted file mode 100644 index 05f1289556ddf1cbb488e3d04fb372599cc86f6e..0000000000000000000000000000000000000000 --- a/eval_results/AI-MO/deepseek-math-7b-sft/aimo_v20.20/aimo_kaggle_medium_pot/results_2024-05-27T12-35-32.569923.json +++ /dev/null @@ -1,91 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": 4, - "max_samples": null, - "job_id": "", - "start_time": 2109781.270822056, - "end_time": 2110105.26031961, - "total_evaluation_time_secondes": "323.9894975540228", - "model_name": "AI-MO/deepseek-math-7b-sft", - "model_sha": "9fa67f9d2f552fcf839660af12c5f7ddd0d7cf00", - "model_dtype": "torch.bfloat16", - "model_size": "12.93 GB", - "config": null - }, - "results": { - "custom|aimo_kaggle_medium_pot:v0|0": { - "qem": 0.15, - "qem_stderr": 0.05717718748968655 - }, - "all": { - "qem": 0.15, - "qem_stderr": 0.05717718748968655 - } - }, - "versions": { - "custom|aimo_kaggle_medium_pot:v0|0": 0 - }, - "config_tasks": { - "custom|aimo_kaggle_medium_pot:v0": { - "name": "aimo_kaggle_medium_pot:v0", - "prompt_function": "kaggle_medium_pot_prompt_fn_v0", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - } - }, - "summary_tasks": { - "custom|aimo_kaggle_medium_pot:v0|0": { - "hashes": { - "hash_examples": "2799c24461029dc3", - "hash_full_prompts": "2af864fbfc2e0a79", - "hash_input_tokens": "6d183a5316cf78dd", - "hash_cont_tokens": "16eacd06184a4f00" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 31, - "non_padded": 9, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "c72202b0a18ef5f9", - "hash_full_prompts": "28935f1035e9b20f", - "hash_input_tokens": "fc52828e51666a24", - "hash_cont_tokens": "baccad0da35ef263" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 31, - "non_padded": 9, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/AI-MO/deepseek-math-7b-sft/aimo_v20.20/aimo_kaggle_medium_pot/results_2024-05-27T13-09-00.410265.json b/eval_results/AI-MO/deepseek-math-7b-sft/aimo_v20.20/aimo_kaggle_medium_pot/results_2024-05-27T13-09-00.410265.json deleted file mode 100644 index e61b7dbc83311e2fb4443733189617fa2a523ef9..0000000000000000000000000000000000000000 --- a/eval_results/AI-MO/deepseek-math-7b-sft/aimo_v20.20/aimo_kaggle_medium_pot/results_2024-05-27T13-09-00.410265.json +++ /dev/null @@ -1,193 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": 4, - "max_samples": null, - "job_id": "", - "start_time": 3549890.324402638, - "end_time": 3550257.165589795, - "total_evaluation_time_secondes": "366.8411871572025", - "model_name": "AI-MO/deepseek-math-7b-sft", - "model_sha": "9fa67f9d2f552fcf839660af12c5f7ddd0d7cf00", - "model_dtype": "torch.bfloat16", - "model_size": "12.93 GB", - "config": null - }, - "results": { - "custom|aimo_kaggle_medium_pot:v0|0": { - "qem": 0.15, - "qem_stderr": 0.05717718748968655 - }, - "custom|aimo_kaggle_medium_pot:v1|0": { - "qem": 0.05, - "qem_stderr": 0.03489912202260563 - }, - "custom|aimo_kaggle_medium_pot:v2|0": { - "qem": 0.1, - "qem_stderr": 0.04803844614152612 - }, - "custom|aimo_kaggle_medium_pot:_average|0": { - "qem": 0.10000000000000002, - "qem_stderr": 0.04670491855127277 - }, - "all": { - "qem": 0.10000000000000002, - "qem_stderr": 0.04670491855127277 - } - }, - "versions": { - "custom|aimo_kaggle_medium_pot:v0|0": 0, - "custom|aimo_kaggle_medium_pot:v1|0": 0, - "custom|aimo_kaggle_medium_pot:v2|0": 0 - }, - "config_tasks": { - "custom|aimo_kaggle_medium_pot:v0": { - "name": "aimo_kaggle_medium_pot:v0", - "prompt_function": "kaggle_medium_pot_prompt_fn_v0", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_medium_pot:v1": { - "name": "aimo_kaggle_medium_pot:v1", - "prompt_function": "kaggle_medium_pot_prompt_fn_v1", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - }, - "custom|aimo_kaggle_medium_pot:v2": { - "name": "aimo_kaggle_medium_pot:v2", - "prompt_function": "kaggle_medium_pot_prompt_fn_v2", - "hf_repo": "AI-MO/kaggle-validation-set-medium", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "num_samples": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 40, - "effective_num_docs": 40, - "trust_dataset": null, - "must_remove_duplicate_docs": null, - "version": 0 - } - }, - "summary_tasks": { - "custom|aimo_kaggle_medium_pot:v0|0": { - "hashes": { - "hash_examples": "2799c24461029dc3", - "hash_full_prompts": "2af864fbfc2e0a79", - "hash_input_tokens": "324d7ae112f05205", - "hash_cont_tokens": "d218a668ae2a7c0e" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 29, - "non_padded": 11, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_medium_pot:v1|0": { - "hashes": { - "hash_examples": "806b2e2056b41f84", - "hash_full_prompts": "8123a0d96a6ceb9d", - "hash_input_tokens": "b1f2be384f5fe5f1", - "hash_cont_tokens": "93f647c730e1b030" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 26, - "non_padded": 14, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - }, - "custom|aimo_kaggle_medium_pot:v2|0": { - "hashes": { - "hash_examples": "d8534375acc5d427", - "hash_full_prompts": "71ba7c8172fec45a", - "hash_input_tokens": "3e80be021360103e", - "hash_cont_tokens": "dee04650e96d2ad9" - }, - "truncated": 40, - "non_truncated": 0, - "padded": 31, - "non_padded": 9, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "623505a45a4910c2", - "hash_full_prompts": "0ee7c8ef786b9aa3", - "hash_input_tokens": "e7650dee68c62549", - "hash_cont_tokens": "a39cffed37fbefef" - }, - "truncated": 120, - "non_truncated": 0, - "padded": 86, - "non_padded": 34, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file diff --git a/eval_results/AI-MO/deepseek-math-7b-sft/aimo_v20.20/aimo_kaggle_pot/results_2024-05-25T11-36-10.990397.json b/eval_results/AI-MO/deepseek-math-7b-sft/aimo_v20.20/aimo_kaggle_pot/results_2024-05-25T11-36-10.990397.json deleted file mode 100644 index 48b51a7b12b5e00a1ba1916a9e02cd3ae1ca6265..0000000000000000000000000000000000000000 --- a/eval_results/AI-MO/deepseek-math-7b-sft/aimo_v20.20/aimo_kaggle_pot/results_2024-05-25T11-36-10.990397.json +++ /dev/null @@ -1,89 +0,0 @@ -{ - "config_general": { - "lighteval_sha": "?", - "num_fewshot_seeds": 1, - "override_batch_size": 4, - "max_samples": null, - "job_id": "", - "start_time": 915838.823205157, - "end_time": 916198.029995119, - "total_evaluation_time_secondes": "359.20678996201605", - "model_name": "AI-MO/deepseek-math-7b-sft", - "model_sha": "9fa67f9d2f552fcf839660af12c5f7ddd0d7cf00", - "model_dtype": "torch.bfloat16", - "model_size": "12.93 GB", - "config": null - }, - "results": { - "custom|aimo_kaggle_pot:v0|0": { - "qem": 0.3, - "qem_stderr": 0.0654653670707977 - }, - "all": { - "qem": 0.3, - "qem_stderr": 0.0654653670707977 - } - }, - "versions": { - "custom|aimo_kaggle_pot:v0|0": 0 - }, - "config_tasks": { - "custom|aimo_kaggle_pot:v0": { - "name": "aimo_kaggle_pot:v0", - "prompt_function": "kaggle_pot_prompt_fn", - "hf_repo": "AI-MO/kaggle-validation-set", - "hf_subset": "v0", - "metric": [ - "quasi_exact_match_code_and_math" - ], - "hf_avail_splits": [ - "train" - ], - "evaluation_splits": [ - "train" - ], - "few_shots_split": null, - "few_shots_select": null, - "generation_size": 2048, - "stop_sequence": null, - "output_regex": null, - "frozen": false, - "suite": [ - "custom" - ], - "original_num_docs": 50, - "effective_num_docs": 50, - "trust_dataset": null, - "must_remove_duplicate_docs": null - } - }, - "summary_tasks": { - "custom|aimo_kaggle_pot:v0|0": { - "hashes": { - "hash_examples": "631fc2e76ca58b0a", - "hash_full_prompts": "b1f5e7e9d5861f25", - "hash_input_tokens": "332d9a951e2af786", - "hash_cont_tokens": "d31d87152ce3d62d" - }, - "truncated": 48, - "non_truncated": 2, - "padded": 37, - "non_padded": 13, - "effective_few_shots": 0.0, - "num_truncated_few_shots": 0 - } - }, - "summary_general": { - "hashes": { - "hash_examples": "58058a251072a7cd", - "hash_full_prompts": "2471c91834f9103b", - "hash_input_tokens": "5c6a4de72e252926", - "hash_cont_tokens": "4c90a320ce5a9862" - }, - "truncated": 48, - "non_truncated": 2, - "padded": 37, - "non_padded": 13, - "num_truncated_few_shots": 0 - } -} \ No newline at end of file