open-r1-eval-leaderboard
/
eval_results
/kaist-ai
/mistral-orpo-alpha
/main
/agieval
/results_2024-03-28T19-14-46.723246.json
{ | |
"config_general": { | |
"lighteval_sha": "?", | |
"num_fewshot_seeds": 1, | |
"override_batch_size": 1, | |
"max_samples": null, | |
"job_id": "", | |
"start_time": 1626308.847028699, | |
"end_time": 1626593.843652943, | |
"total_evaluation_time_secondes": "284.99662424391136", | |
"model_name": "kaist-ai/mistral-orpo-alpha", | |
"model_sha": "fe1724b77567a2a3579e74240a69a66622191735", | |
"model_dtype": "torch.bfloat16", | |
"model_size": "13.99 GB", | |
"config": null | |
}, | |
"results": { | |
"lighteval|agieval:aqua-rat|0": { | |
"acc": 0.23228346456692914, | |
"acc_stderr": 0.026549071327684932, | |
"acc_norm": 0.24015748031496062, | |
"acc_norm_stderr": 0.026856511194355212 | |
}, | |
"lighteval|agieval:gaokao-biology|0": { | |
"acc": 0.3238095238095238, | |
"acc_stderr": 0.03236727895404352, | |
"acc_norm": 0.32857142857142857, | |
"acc_norm_stderr": 0.03248939796876844 | |
}, | |
"lighteval|agieval:gaokao-chemistry|0": { | |
"acc": 0.2946859903381642, | |
"acc_stderr": 0.03176416108295297, | |
"acc_norm": 0.28019323671497587, | |
"acc_norm_stderr": 0.031289827964521094 | |
}, | |
"lighteval|agieval:gaokao-chinese|0": { | |
"acc": 0.2682926829268293, | |
"acc_stderr": 0.028306754023121848, | |
"acc_norm": 0.2601626016260163, | |
"acc_norm_stderr": 0.028028995361669366 | |
}, | |
"lighteval|agieval:gaokao-english|0": { | |
"acc": 0.6993464052287581, | |
"acc_stderr": 0.026256053835718968, | |
"acc_norm": 0.6437908496732027, | |
"acc_norm_stderr": 0.02742047766262924 | |
}, | |
"lighteval|agieval:gaokao-geography|0": { | |
"acc": 0.4723618090452261, | |
"acc_stderr": 0.035479125346565575, | |
"acc_norm": 0.45226130653266333, | |
"acc_norm_stderr": 0.03537112167025914 | |
}, | |
"lighteval|agieval:gaokao-history|0": { | |
"acc": 0.4085106382978723, | |
"acc_stderr": 0.03213418026701576, | |
"acc_norm": 0.37446808510638296, | |
"acc_norm_stderr": 0.03163910665367291 | |
}, | |
"lighteval|agieval:gaokao-mathqa|0": { | |
"acc": 0.2535612535612536, | |
"acc_stderr": 0.023254366364417835, | |
"acc_norm": 0.23646723646723647, | |
"acc_norm_stderr": 0.022712519049117575 | |
}, | |
"lighteval|agieval:gaokao-physics|0": { | |
"acc": 0.32, | |
"acc_stderr": 0.03306761764450865, | |
"acc_norm": 0.355, | |
"acc_norm_stderr": 0.033920910080708536 | |
}, | |
"lighteval|agieval:logiqa-en|0": { | |
"acc": 0.3456221198156682, | |
"acc_stderr": 0.018653416684754197, | |
"acc_norm": 0.3486943164362519, | |
"acc_norm_stderr": 0.018692104055797923 | |
}, | |
"lighteval|agieval:logiqa-zh|0": { | |
"acc": 0.3241167434715822, | |
"acc_stderr": 0.01835819163513243, | |
"acc_norm": 0.35023041474654376, | |
"acc_norm_stderr": 0.018711126732221216 | |
}, | |
"lighteval|agieval:lsat-ar|0": { | |
"acc": 0.1782608695652174, | |
"acc_stderr": 0.025291655246273914, | |
"acc_norm": 0.17391304347826086, | |
"acc_norm_stderr": 0.025047317386049713 | |
}, | |
"lighteval|agieval:lsat-lr|0": { | |
"acc": 0.403921568627451, | |
"acc_stderr": 0.021749102234028657, | |
"acc_norm": 0.34901960784313724, | |
"acc_norm_stderr": 0.021127590746109415 | |
}, | |
"lighteval|agieval:lsat-rc|0": { | |
"acc": 0.5092936802973977, | |
"acc_stderr": 0.030537084593525398, | |
"acc_norm": 0.3680297397769517, | |
"acc_norm_stderr": 0.029459297142360175 | |
}, | |
"lighteval|agieval:sat-en|0": { | |
"acc": 0.6504854368932039, | |
"acc_stderr": 0.03330232052876045, | |
"acc_norm": 0.5339805825242718, | |
"acc_norm_stderr": 0.03484077510348 | |
}, | |
"lighteval|agieval:sat-en-without-passage|0": { | |
"acc": 0.4174757281553398, | |
"acc_stderr": 0.03444258173919335, | |
"acc_norm": 0.3640776699029126, | |
"acc_norm_stderr": 0.03360641055142782 | |
}, | |
"lighteval|agieval:sat-math|0": { | |
"acc": 0.35, | |
"acc_stderr": 0.03223061875589932, | |
"acc_norm": 0.30454545454545456, | |
"acc_norm_stderr": 0.03109842385860254 | |
}, | |
"lighteval|agieval:_average|0": { | |
"acc": 0.3795310538000245, | |
"acc_stderr": 0.0284555047213881, | |
"acc_norm": 0.3507978267212148, | |
"acc_norm_stderr": 0.028371289010691192 | |
}, | |
"all": { | |
"acc": 0.3795310538000245, | |
"acc_stderr": 0.0284555047213881, | |
"acc_norm": 0.3507978267212148, | |
"acc_norm_stderr": 0.028371289010691192 | |
} | |
}, | |
"versions": { | |
"lighteval|agieval:aqua-rat|0": 0, | |
"lighteval|agieval:gaokao-biology|0": 0, | |
"lighteval|agieval:gaokao-chemistry|0": 0, | |
"lighteval|agieval:gaokao-chinese|0": 0, | |
"lighteval|agieval:gaokao-english|0": 0, | |
"lighteval|agieval:gaokao-geography|0": 0, | |
"lighteval|agieval:gaokao-history|0": 0, | |
"lighteval|agieval:gaokao-mathqa|0": 0, | |
"lighteval|agieval:gaokao-physics|0": 0, | |
"lighteval|agieval:logiqa-en|0": 0, | |
"lighteval|agieval:logiqa-zh|0": 0, | |
"lighteval|agieval:lsat-ar|0": 0, | |
"lighteval|agieval:lsat-lr|0": 0, | |
"lighteval|agieval:lsat-rc|0": 0, | |
"lighteval|agieval:sat-en|0": 0, | |
"lighteval|agieval:sat-en-without-passage|0": 0, | |
"lighteval|agieval:sat-math|0": 0 | |
}, | |
"config_tasks": { | |
"lighteval|agieval:aqua-rat": { | |
"name": "agieval:aqua-rat", | |
"prompt_function": "agieval", | |
"hf_repo": "dmayhem93/agieval-aqua-rat", | |
"hf_subset": "default", | |
"metric": [ | |
"loglikelihood_acc", | |
"loglikelihood_acc_norm_nospace" | |
], | |
"hf_avail_splits": [ | |
"test" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": null, | |
"few_shots_select": "random_sampling", | |
"generation_size": 1, | |
"stop_sequence": null, | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval" | |
], | |
"original_num_docs": 254, | |
"effective_num_docs": 254, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"lighteval|agieval:gaokao-biology": { | |
"name": "agieval:gaokao-biology", | |
"prompt_function": "agieval", | |
"hf_repo": "dmayhem93/agieval-gaokao-biology", | |
"hf_subset": "default", | |
"metric": [ | |
"loglikelihood_acc", | |
"loglikelihood_acc_norm_nospace" | |
], | |
"hf_avail_splits": [ | |
"test" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": null, | |
"few_shots_select": "random_sampling", | |
"generation_size": 1, | |
"stop_sequence": null, | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval" | |
], | |
"original_num_docs": 210, | |
"effective_num_docs": 210, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"lighteval|agieval:gaokao-chemistry": { | |
"name": "agieval:gaokao-chemistry", | |
"prompt_function": "agieval", | |
"hf_repo": "dmayhem93/agieval-gaokao-chemistry", | |
"hf_subset": "default", | |
"metric": [ | |
"loglikelihood_acc", | |
"loglikelihood_acc_norm_nospace" | |
], | |
"hf_avail_splits": [ | |
"test" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": null, | |
"few_shots_select": "random_sampling", | |
"generation_size": 1, | |
"stop_sequence": null, | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval" | |
], | |
"original_num_docs": 207, | |
"effective_num_docs": 207, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"lighteval|agieval:gaokao-chinese": { | |
"name": "agieval:gaokao-chinese", | |
"prompt_function": "agieval", | |
"hf_repo": "dmayhem93/agieval-gaokao-chinese", | |
"hf_subset": "default", | |
"metric": [ | |
"loglikelihood_acc", | |
"loglikelihood_acc_norm_nospace" | |
], | |
"hf_avail_splits": [ | |
"test" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": null, | |
"few_shots_select": "random_sampling", | |
"generation_size": 1, | |
"stop_sequence": null, | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval" | |
], | |
"original_num_docs": 246, | |
"effective_num_docs": 246, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"lighteval|agieval:gaokao-english": { | |
"name": "agieval:gaokao-english", | |
"prompt_function": "agieval", | |
"hf_repo": "dmayhem93/agieval-gaokao-english", | |
"hf_subset": "default", | |
"metric": [ | |
"loglikelihood_acc", | |
"loglikelihood_acc_norm_nospace" | |
], | |
"hf_avail_splits": [ | |
"test" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": null, | |
"few_shots_select": "random_sampling", | |
"generation_size": 1, | |
"stop_sequence": null, | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval" | |
], | |
"original_num_docs": 306, | |
"effective_num_docs": 306, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"lighteval|agieval:gaokao-geography": { | |
"name": "agieval:gaokao-geography", | |
"prompt_function": "agieval", | |
"hf_repo": "dmayhem93/agieval-gaokao-geography", | |
"hf_subset": "default", | |
"metric": [ | |
"loglikelihood_acc", | |
"loglikelihood_acc_norm_nospace" | |
], | |
"hf_avail_splits": [ | |
"test" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": null, | |
"few_shots_select": "random_sampling", | |
"generation_size": 1, | |
"stop_sequence": null, | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval" | |
], | |
"original_num_docs": 199, | |
"effective_num_docs": 199, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"lighteval|agieval:gaokao-history": { | |
"name": "agieval:gaokao-history", | |
"prompt_function": "agieval", | |
"hf_repo": "dmayhem93/agieval-gaokao-history", | |
"hf_subset": "default", | |
"metric": [ | |
"loglikelihood_acc", | |
"loglikelihood_acc_norm_nospace" | |
], | |
"hf_avail_splits": [ | |
"test" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": null, | |
"few_shots_select": "random_sampling", | |
"generation_size": 1, | |
"stop_sequence": null, | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval" | |
], | |
"original_num_docs": 235, | |
"effective_num_docs": 235, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"lighteval|agieval:gaokao-mathqa": { | |
"name": "agieval:gaokao-mathqa", | |
"prompt_function": "agieval", | |
"hf_repo": "dmayhem93/agieval-gaokao-mathqa", | |
"hf_subset": "default", | |
"metric": [ | |
"loglikelihood_acc", | |
"loglikelihood_acc_norm_nospace" | |
], | |
"hf_avail_splits": [ | |
"test" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": null, | |
"few_shots_select": "random_sampling", | |
"generation_size": 1, | |
"stop_sequence": null, | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval" | |
], | |
"original_num_docs": 351, | |
"effective_num_docs": 351, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"lighteval|agieval:gaokao-physics": { | |
"name": "agieval:gaokao-physics", | |
"prompt_function": "agieval", | |
"hf_repo": "dmayhem93/agieval-gaokao-physics", | |
"hf_subset": "default", | |
"metric": [ | |
"loglikelihood_acc", | |
"loglikelihood_acc_norm_nospace" | |
], | |
"hf_avail_splits": [ | |
"test" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": null, | |
"few_shots_select": "random_sampling", | |
"generation_size": 1, | |
"stop_sequence": null, | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval" | |
], | |
"original_num_docs": 200, | |
"effective_num_docs": 200, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"lighteval|agieval:logiqa-en": { | |
"name": "agieval:logiqa-en", | |
"prompt_function": "agieval", | |
"hf_repo": "dmayhem93/agieval-logiqa-en", | |
"hf_subset": "default", | |
"metric": [ | |
"loglikelihood_acc", | |
"loglikelihood_acc_norm_nospace" | |
], | |
"hf_avail_splits": [ | |
"test" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": null, | |
"few_shots_select": "random_sampling", | |
"generation_size": 1, | |
"stop_sequence": null, | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval" | |
], | |
"original_num_docs": 651, | |
"effective_num_docs": 651, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"lighteval|agieval:logiqa-zh": { | |
"name": "agieval:logiqa-zh", | |
"prompt_function": "agieval", | |
"hf_repo": "dmayhem93/agieval-logiqa-zh", | |
"hf_subset": "default", | |
"metric": [ | |
"loglikelihood_acc", | |
"loglikelihood_acc_norm_nospace" | |
], | |
"hf_avail_splits": [ | |
"test" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": null, | |
"few_shots_select": "random_sampling", | |
"generation_size": 1, | |
"stop_sequence": null, | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval" | |
], | |
"original_num_docs": 651, | |
"effective_num_docs": 651, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"lighteval|agieval:lsat-ar": { | |
"name": "agieval:lsat-ar", | |
"prompt_function": "agieval", | |
"hf_repo": "dmayhem93/agieval-lsat-ar", | |
"hf_subset": "default", | |
"metric": [ | |
"loglikelihood_acc", | |
"loglikelihood_acc_norm_nospace" | |
], | |
"hf_avail_splits": [ | |
"test" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": null, | |
"few_shots_select": "random_sampling", | |
"generation_size": 1, | |
"stop_sequence": null, | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval" | |
], | |
"original_num_docs": 230, | |
"effective_num_docs": 230, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"lighteval|agieval:lsat-lr": { | |
"name": "agieval:lsat-lr", | |
"prompt_function": "agieval", | |
"hf_repo": "dmayhem93/agieval-lsat-lr", | |
"hf_subset": "default", | |
"metric": [ | |
"loglikelihood_acc", | |
"loglikelihood_acc_norm_nospace" | |
], | |
"hf_avail_splits": [ | |
"test" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": null, | |
"few_shots_select": "random_sampling", | |
"generation_size": 1, | |
"stop_sequence": null, | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval" | |
], | |
"original_num_docs": 510, | |
"effective_num_docs": 510, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"lighteval|agieval:lsat-rc": { | |
"name": "agieval:lsat-rc", | |
"prompt_function": "agieval", | |
"hf_repo": "dmayhem93/agieval-lsat-rc", | |
"hf_subset": "default", | |
"metric": [ | |
"loglikelihood_acc", | |
"loglikelihood_acc_norm_nospace" | |
], | |
"hf_avail_splits": [ | |
"test" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": null, | |
"few_shots_select": "random_sampling", | |
"generation_size": 1, | |
"stop_sequence": null, | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval" | |
], | |
"original_num_docs": 269, | |
"effective_num_docs": 269, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"lighteval|agieval:sat-en": { | |
"name": "agieval:sat-en", | |
"prompt_function": "agieval", | |
"hf_repo": "dmayhem93/agieval-sat-en", | |
"hf_subset": "default", | |
"metric": [ | |
"loglikelihood_acc", | |
"loglikelihood_acc_norm_nospace" | |
], | |
"hf_avail_splits": [ | |
"test" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": null, | |
"few_shots_select": "random_sampling", | |
"generation_size": 1, | |
"stop_sequence": null, | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval" | |
], | |
"original_num_docs": 206, | |
"effective_num_docs": 206, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"lighteval|agieval:sat-en-without-passage": { | |
"name": "agieval:sat-en-without-passage", | |
"prompt_function": "agieval", | |
"hf_repo": "dmayhem93/agieval-sat-en-without-passage", | |
"hf_subset": "default", | |
"metric": [ | |
"loglikelihood_acc", | |
"loglikelihood_acc_norm_nospace" | |
], | |
"hf_avail_splits": [ | |
"test" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": null, | |
"few_shots_select": "random_sampling", | |
"generation_size": 1, | |
"stop_sequence": null, | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval" | |
], | |
"original_num_docs": 206, | |
"effective_num_docs": 206, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
}, | |
"lighteval|agieval:sat-math": { | |
"name": "agieval:sat-math", | |
"prompt_function": "agieval", | |
"hf_repo": "dmayhem93/agieval-sat-math", | |
"hf_subset": "default", | |
"metric": [ | |
"loglikelihood_acc", | |
"loglikelihood_acc_norm_nospace" | |
], | |
"hf_avail_splits": [ | |
"test" | |
], | |
"evaluation_splits": [ | |
"test" | |
], | |
"few_shots_split": null, | |
"few_shots_select": "random_sampling", | |
"generation_size": 1, | |
"stop_sequence": null, | |
"output_regex": null, | |
"frozen": false, | |
"suite": [ | |
"lighteval" | |
], | |
"original_num_docs": 220, | |
"effective_num_docs": 220, | |
"trust_dataset": true, | |
"must_remove_duplicate_docs": null | |
} | |
}, | |
"summary_tasks": { | |
"lighteval|agieval:aqua-rat|0": { | |
"hashes": { | |
"hash_examples": "f09607f69e5b7525", | |
"hash_full_prompts": "8b913655a6fea4ab", | |
"hash_input_tokens": "293a61e5163aa27e", | |
"hash_cont_tokens": "a12c4ac8996ba11d" | |
}, | |
"truncated": 0, | |
"non_truncated": 254, | |
"padded": 1270, | |
"non_padded": 0, | |
"effective_few_shots": 0.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|agieval:gaokao-biology|0": { | |
"hashes": { | |
"hash_examples": "f262eaf4a72db963", | |
"hash_full_prompts": "c7078ace868f7ee8", | |
"hash_input_tokens": "4abbc3b98bca27c3", | |
"hash_cont_tokens": "22b786cf7aa6d1a9" | |
}, | |
"truncated": 0, | |
"non_truncated": 210, | |
"padded": 840, | |
"non_padded": 0, | |
"effective_few_shots": 0.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|agieval:gaokao-chemistry|0": { | |
"hashes": { | |
"hash_examples": "47f2e649f58d9da5", | |
"hash_full_prompts": "bd066d6d8c807f39", | |
"hash_input_tokens": "3d9728247bf04ac3", | |
"hash_cont_tokens": "318562bcb4103fc4" | |
}, | |
"truncated": 0, | |
"non_truncated": 207, | |
"padded": 831, | |
"non_padded": 0, | |
"effective_few_shots": 0.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|agieval:gaokao-chinese|0": { | |
"hashes": { | |
"hash_examples": "1010b21fde4726ab", | |
"hash_full_prompts": "3f53e9dd34c43d52", | |
"hash_input_tokens": "f2a1d4c848527f86", | |
"hash_cont_tokens": "7b177add04591cdb" | |
}, | |
"truncated": 0, | |
"non_truncated": 246, | |
"padded": 982, | |
"non_padded": 2, | |
"effective_few_shots": 0.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|agieval:gaokao-english|0": { | |
"hashes": { | |
"hash_examples": "4864e492a350ae93", | |
"hash_full_prompts": "59104cb8623f69e5", | |
"hash_input_tokens": "ff82bcaabb6cde43", | |
"hash_cont_tokens": "c9ca0addab2a9327" | |
}, | |
"truncated": 0, | |
"non_truncated": 306, | |
"padded": 1224, | |
"non_padded": 0, | |
"effective_few_shots": 0.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|agieval:gaokao-geography|0": { | |
"hashes": { | |
"hash_examples": "ec3a021e37650e7d", | |
"hash_full_prompts": "d2456e0377df1973", | |
"hash_input_tokens": "b6593d42d60f9e65", | |
"hash_cont_tokens": "e1bc87e81807da78" | |
}, | |
"truncated": 0, | |
"non_truncated": 199, | |
"padded": 796, | |
"non_padded": 0, | |
"effective_few_shots": 0.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|agieval:gaokao-history|0": { | |
"hashes": { | |
"hash_examples": "b3fad1596f1ae1f9", | |
"hash_full_prompts": "faea8f291d9a0cd5", | |
"hash_input_tokens": "ffa388f05b4bfadf", | |
"hash_cont_tokens": "b3c6c60f59b08db4" | |
}, | |
"truncated": 0, | |
"non_truncated": 235, | |
"padded": 940, | |
"non_padded": 0, | |
"effective_few_shots": 0.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|agieval:gaokao-mathqa|0": { | |
"hashes": { | |
"hash_examples": "1d1088556861b0b0", | |
"hash_full_prompts": "de899bfeaaa61154", | |
"hash_input_tokens": "1e826e8ae60c9cf4", | |
"hash_cont_tokens": "5d69ebf8391bf298" | |
}, | |
"truncated": 0, | |
"non_truncated": 351, | |
"padded": 1404, | |
"non_padded": 0, | |
"effective_few_shots": 0.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|agieval:gaokao-physics|0": { | |
"hashes": { | |
"hash_examples": "eb05f035c7bfca2f", | |
"hash_full_prompts": "08008e0300283edc", | |
"hash_input_tokens": "0a4cddbeea3c31fa", | |
"hash_cont_tokens": "93b4c52fa838ace2" | |
}, | |
"truncated": 0, | |
"non_truncated": 200, | |
"padded": 800, | |
"non_padded": 0, | |
"effective_few_shots": 0.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|agieval:logiqa-en|0": { | |
"hashes": { | |
"hash_examples": "0a688a45f69c21e0", | |
"hash_full_prompts": "3405fd262d4b2d28", | |
"hash_input_tokens": "8672a68d080ba1fb", | |
"hash_cont_tokens": "2624c1243afac3f2" | |
}, | |
"truncated": 0, | |
"non_truncated": 651, | |
"padded": 2604, | |
"non_padded": 0, | |
"effective_few_shots": 0.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|agieval:logiqa-zh|0": { | |
"hashes": { | |
"hash_examples": "620d6888b6012ea5", | |
"hash_full_prompts": "ac19dc4eaa56f5e0", | |
"hash_input_tokens": "24c3bd34df395ee2", | |
"hash_cont_tokens": "725ca2b921b6f8fe" | |
}, | |
"truncated": 0, | |
"non_truncated": 651, | |
"padded": 2603, | |
"non_padded": 1, | |
"effective_few_shots": 0.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|agieval:lsat-ar|0": { | |
"hashes": { | |
"hash_examples": "627c8f5ccd5da209", | |
"hash_full_prompts": "9aed992c4bfa8dd7", | |
"hash_input_tokens": "042173fcbbb85776", | |
"hash_cont_tokens": "23c097e1d431f2b8" | |
}, | |
"truncated": 0, | |
"non_truncated": 230, | |
"padded": 1137, | |
"non_padded": 13, | |
"effective_few_shots": 0.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|agieval:lsat-lr|0": { | |
"hashes": { | |
"hash_examples": "794641c86de172f5", | |
"hash_full_prompts": "6a36e90325996129", | |
"hash_input_tokens": "58c268b8b0f3c5c4", | |
"hash_cont_tokens": "b555f4319746d815" | |
}, | |
"truncated": 0, | |
"non_truncated": 510, | |
"padded": 2532, | |
"non_padded": 18, | |
"effective_few_shots": 0.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|agieval:lsat-rc|0": { | |
"hashes": { | |
"hash_examples": "35981ed917ea01cf", | |
"hash_full_prompts": "15f0f342f9572c41", | |
"hash_input_tokens": "1eaa7331cf1a555e", | |
"hash_cont_tokens": "8c1c4fc8c9cabd97" | |
}, | |
"truncated": 0, | |
"non_truncated": 269, | |
"padded": 1345, | |
"non_padded": 0, | |
"effective_few_shots": 0.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|agieval:sat-en|0": { | |
"hashes": { | |
"hash_examples": "041c39c646536a1e", | |
"hash_full_prompts": "163217fd603b9352", | |
"hash_input_tokens": "e1e9417c28fa0db0", | |
"hash_cont_tokens": "4837f17aae6c95e0" | |
}, | |
"truncated": 0, | |
"non_truncated": 206, | |
"padded": 821, | |
"non_padded": 0, | |
"effective_few_shots": 0.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|agieval:sat-en-without-passage|0": { | |
"hashes": { | |
"hash_examples": "e4d9284367dff68f", | |
"hash_full_prompts": "bdd4c7065b87de8a", | |
"hash_input_tokens": "70bcab139ba874e5", | |
"hash_cont_tokens": "4837f17aae6c95e0" | |
}, | |
"truncated": 0, | |
"non_truncated": 206, | |
"padded": 817, | |
"non_padded": 4, | |
"effective_few_shots": 0.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"lighteval|agieval:sat-math|0": { | |
"hashes": { | |
"hash_examples": "01db7291603fc1a0", | |
"hash_full_prompts": "63ca65b2f0baebb5", | |
"hash_input_tokens": "e242b0a44e256fbb", | |
"hash_cont_tokens": "d959ef83452da9fe" | |
}, | |
"truncated": 0, | |
"non_truncated": 220, | |
"padded": 877, | |
"non_padded": 3, | |
"effective_few_shots": 0.0, | |
"num_truncated_few_shots": 0 | |
} | |
}, | |
"summary_general": { | |
"hashes": { | |
"hash_examples": "da3af66181f18ddf", | |
"hash_full_prompts": "f7c298d03686fa0e", | |
"hash_input_tokens": "84e02a40c0a714da", | |
"hash_cont_tokens": "b3bace8c3199f6d8" | |
}, | |
"truncated": 0, | |
"non_truncated": 5151, | |
"padded": 21823, | |
"non_padded": 41, | |
"num_truncated_few_shots": 0 | |
} | |
} |