open-r1-eval-leaderboard
/
eval_results
/UCLA-AGI
/zephyr-7b-sft-full-SPIN-iter1
/main
/eval_gsm8k.json
{ | |
"results": { | |
"gsm8k": { | |
"exact_match,get-answer": 0.35784685367702807, | |
"exact_match_stderr,get-answer": 0.013204142536119947, | |
"alias": "gsm8k" | |
} | |
}, | |
"configs": { | |
"gsm8k": { | |
"task": "gsm8k", | |
"group": [ | |
"math_word_problems" | |
], | |
"dataset_path": "gsm8k", | |
"dataset_name": "main", | |
"training_split": "train", | |
"test_split": "test", | |
"fewshot_split": "train", | |
"doc_to_text": "Question: {{question}}\nAnswer:", | |
"doc_to_target": "{{answer}}", | |
"description": "", | |
"target_delimiter": " ", | |
"fewshot_delimiter": "\n\n", | |
"num_fewshot": 5, | |
"metric_list": [ | |
{ | |
"metric": "exact_match", | |
"aggregation": "mean", | |
"higher_is_better": true, | |
"ignore_case": true, | |
"ignore_punctuation": false, | |
"regexes_to_ignore": [ | |
",", | |
"\\$", | |
"(?s).*#### " | |
] | |
} | |
], | |
"output_type": "generate_until", | |
"generation_kwargs": { | |
"until": [ | |
"\n\n", | |
"Question:" | |
], | |
"do_sample": false, | |
"temperature": 0.0 | |
}, | |
"repeats": 1, | |
"filter_list": [ | |
{ | |
"name": "get-answer", | |
"filter": [ | |
{ | |
"function": "regex", | |
"regex_pattern": "#### (\\-?[0-9\\.\\,]+)" | |
}, | |
{ | |
"function": "take_first" | |
} | |
] | |
} | |
], | |
"should_decontaminate": false, | |
"metadata": { | |
"version": 2.0 | |
} | |
} | |
}, | |
"versions": { | |
"gsm8k": 2.0 | |
}, | |
"n-shot": { | |
"gsm8k": 5 | |
}, | |
"config": { | |
"model": "hf", | |
"model_args": "pretrained=UCLA-AGI/zephyr-7b-sft-full-SPIN-iter1,revision=main,dtype=bfloat16", | |
"batch_size": "auto", | |
"batch_sizes": [], | |
"device": null, | |
"use_cache": null, | |
"limit": null, | |
"bootstrap_iters": 100000, | |
"gen_kwargs": null | |
}, | |
"git_hash": "0acdfc3" | |
} |