{ "config_general": { "lighteval_sha": "?", "num_fewshot_seeds": 1, "override_batch_size": 4, "max_samples": null, "job_id": "", "start_time": 882532.794319516, "end_time": 891148.601742387, "total_evaluation_time_secondes": "8615.807422871003", "model_name": "AI-MO/mistral-7b-sft", "model_sha": "159047b1ab76bbb7c9369ee71bfef1d441fc029e", "model_dtype": "torch.bfloat16", "model_size": "13.99 GB", "config": null }, "results": { "lighteval|math:algebra|0": { "qem": 0.46166807076663857, "qem_stderr": 0.01447596901495004 }, "lighteval|math:counting_and_probability|0": { "qem": 0.18143459915611815, "qem_stderr": 0.017719692309092615 }, "lighteval|math:geometry|0": { "qem": 0.23382045929018788, "qem_stderr": 0.019359430691791527 }, "lighteval|math:intermediate_algebra|0": { "qem": 0.12624584717607973, "qem_stderr": 0.011058593855296428 }, "lighteval|math:number_theory|0": { "qem": 0.16111111111111112, "qem_stderr": 0.015835091780678594 }, "lighteval|math:prealgebra|0": { "qem": 0.46842709529276694, "qem_stderr": 0.01691775751043896 }, "lighteval|math:precalculus|0": { "qem": 0.1575091575091575, "qem_stderr": 0.015604046923319667 }, "lighteval|math:_average|0": { "qem": 0.25574519147172287, "qem_stderr": 0.01585294029793826 }, "all": { "qem": 0.25574519147172287, "qem_stderr": 0.01585294029793826 } }, "versions": { "lighteval|math:algebra|0": 0, "lighteval|math:counting_and_probability|0": 0, "lighteval|math:geometry|0": 0, "lighteval|math:intermediate_algebra|0": 0, "lighteval|math:number_theory|0": 0, "lighteval|math:prealgebra|0": 0, "lighteval|math:precalculus|0": 0 }, "config_tasks": { "lighteval|math:algebra": { "name": "math:algebra", "prompt_function": "math", "hf_repo": "lighteval/MATH", "hf_subset": "algebra", "metric": [ "quasi_exact_match_math" ], "hf_avail_splits": [ "train", "test", "validation" ], "evaluation_splits": [ "test" ], "few_shots_split": null, "few_shots_select": null, "generation_size": 2048, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "lighteval", "math" ], "original_num_docs": 1187, "effective_num_docs": 1187, "trust_dataset": true, "must_remove_duplicate_docs": null }, "lighteval|math:counting_and_probability": { "name": "math:counting_and_probability", "prompt_function": "math", "hf_repo": "lighteval/MATH", "hf_subset": "counting_and_probability", "metric": [ "quasi_exact_match_math" ], "hf_avail_splits": [ "train", "test", "validation" ], "evaluation_splits": [ "test" ], "few_shots_split": null, "few_shots_select": null, "generation_size": 2048, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "lighteval", "math" ], "original_num_docs": 474, "effective_num_docs": 474, "trust_dataset": true, "must_remove_duplicate_docs": null }, "lighteval|math:geometry": { "name": "math:geometry", "prompt_function": "math", "hf_repo": "lighteval/MATH", "hf_subset": "geometry", "metric": [ "quasi_exact_match_math" ], "hf_avail_splits": [ "train", "test", "validation" ], "evaluation_splits": [ "test" ], "few_shots_split": null, "few_shots_select": null, "generation_size": 2048, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "lighteval", "math" ], "original_num_docs": 479, "effective_num_docs": 479, "trust_dataset": true, "must_remove_duplicate_docs": null }, "lighteval|math:intermediate_algebra": { "name": "math:intermediate_algebra", "prompt_function": "math", "hf_repo": "lighteval/MATH", "hf_subset": "intermediate_algebra", "metric": [ "quasi_exact_match_math" ], "hf_avail_splits": [ "train", "test", "validation" ], "evaluation_splits": [ "test" ], "few_shots_split": null, "few_shots_select": null, "generation_size": 2048, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "lighteval", "math" ], "original_num_docs": 903, "effective_num_docs": 903, "trust_dataset": true, "must_remove_duplicate_docs": null }, "lighteval|math:number_theory": { "name": "math:number_theory", "prompt_function": "math", "hf_repo": "lighteval/MATH", "hf_subset": "number_theory", "metric": [ "quasi_exact_match_math" ], "hf_avail_splits": [ "train", "test", "validation" ], "evaluation_splits": [ "test" ], "few_shots_split": null, "few_shots_select": null, "generation_size": 2048, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "lighteval", "math" ], "original_num_docs": 540, "effective_num_docs": 540, "trust_dataset": true, "must_remove_duplicate_docs": null }, "lighteval|math:prealgebra": { "name": "math:prealgebra", "prompt_function": "math", "hf_repo": "lighteval/MATH", "hf_subset": "prealgebra", "metric": [ "quasi_exact_match_math" ], "hf_avail_splits": [ "train", "test", "validation" ], "evaluation_splits": [ "test" ], "few_shots_split": null, "few_shots_select": null, "generation_size": 2048, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "lighteval", "math" ], "original_num_docs": 871, "effective_num_docs": 871, "trust_dataset": true, "must_remove_duplicate_docs": null }, "lighteval|math:precalculus": { "name": "math:precalculus", "prompt_function": "math", "hf_repo": "lighteval/MATH", "hf_subset": "precalculus", "metric": [ "quasi_exact_match_math" ], "hf_avail_splits": [ "train", "test", "validation" ], "evaluation_splits": [ "test" ], "few_shots_split": null, "few_shots_select": null, "generation_size": 2048, "stop_sequence": [ "\n" ], "output_regex": null, "frozen": false, "suite": [ "lighteval", "math" ], "original_num_docs": 546, "effective_num_docs": 546, "trust_dataset": true, "must_remove_duplicate_docs": null } }, "summary_tasks": { "lighteval|math:algebra|0": { "hashes": { "hash_examples": "37a2fd2f076d2e49", "hash_full_prompts": "b594f95fc76837ae", "hash_input_tokens": "268f292e08b20496", "hash_cont_tokens": "5ff3b282e103a786" }, "truncated": 1187, "non_truncated": 0, "padded": 186, "non_padded": 1001, "effective_few_shots": 0.0, "num_truncated_few_shots": 0 }, "lighteval|math:counting_and_probability|0": { "hashes": { "hash_examples": "97b4892e28bc078b", "hash_full_prompts": "7248a30b48b9a71a", "hash_input_tokens": "a734e9fc478accef", "hash_cont_tokens": "34e1403ab557ae3f" }, "truncated": 474, "non_truncated": 0, "padded": 42, "non_padded": 432, "effective_few_shots": 0.0, "num_truncated_few_shots": 0 }, "lighteval|math:geometry|0": { "hashes": { "hash_examples": "9e9f0228b8b3d093", "hash_full_prompts": "b0ba9fa4265b2a7f", "hash_input_tokens": "fd28b84db8b56f2b", "hash_cont_tokens": "79a2cb59ecbb222f" }, "truncated": 479, "non_truncated": 0, "padded": 129, "non_padded": 350, "effective_few_shots": 0.0, "num_truncated_few_shots": 0 }, "lighteval|math:intermediate_algebra|0": { "hashes": { "hash_examples": "cfe73a8e28ae94de", "hash_full_prompts": "0519b385dc4d18c5", "hash_input_tokens": "ac07a8b5e8432e70", "hash_cont_tokens": "b36ca367a0db8943" }, "truncated": 900, "non_truncated": 3, "padded": 54, "non_padded": 849, "effective_few_shots": 0.0, "num_truncated_few_shots": 0 }, "lighteval|math:number_theory|0": { "hashes": { "hash_examples": "4ee5237cf144afac", "hash_full_prompts": "e34e2d9e3719c6b3", "hash_input_tokens": "05c6e6efc4dd2f8e", "hash_cont_tokens": "d07397c221f94650" }, "truncated": 540, "non_truncated": 0, "padded": 7, "non_padded": 533, "effective_few_shots": 0.0, "num_truncated_few_shots": 0 }, "lighteval|math:prealgebra|0": { "hashes": { "hash_examples": "3fb3afeb885f73d8", "hash_full_prompts": "dd429103e5accb7a", "hash_input_tokens": "a674b0bd864f5275", "hash_cont_tokens": "fa2fff6b43f88d13" }, "truncated": 871, "non_truncated": 0, "padded": 55, "non_padded": 816, "effective_few_shots": 0.0, "num_truncated_few_shots": 0 }, "lighteval|math:precalculus|0": { "hashes": { "hash_examples": "753e25ab9ec4b46c", "hash_full_prompts": "5437a265a758ad19", "hash_input_tokens": "b75d4bd55f3f5325", "hash_cont_tokens": "8e1e6abecde2e5f0" }, "truncated": 546, "non_truncated": 0, "padded": 24, "non_padded": 522, "effective_few_shots": 0.0, "num_truncated_few_shots": 0 } }, "summary_general": { "hashes": { "hash_examples": "b76099aa9092a203", "hash_full_prompts": "fc120bd09022ad3f", "hash_input_tokens": "c7fcbcc2ee4328b7", "hash_cont_tokens": "152aa86ad3998642" }, "truncated": 4997, "non_truncated": 3, "padded": 497, "non_padded": 4503, "num_truncated_few_shots": 0 } }