Spaces:
Runtime error
Runtime error
"""This file checks two things: | |
1. Is the LLM's codegen complete for each benchmark task?
2. Warn about code that is not compilable (this could be caused by implementation issues).
""" | |
from termcolor import colored | |
from evalplus.data import load_solutions | |
from evalplus.sanitize import syntax_check | |
if __name__ == "__main__":
    # Script entry point. It performs two passes over the loaded samples:
    #   1. completeness: every task in the chosen dataset has the expected
    #      number of samples;
    #   2. compilation: every sample is non-empty and passes `syntax_check`.
    # Findings are printed to stdout in colour; nothing is written to disk here.
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--samples", type=str, required=True)
    parser.add_argument(
        "--dataset", required=True, type=str, choices=["humaneval", "mbpp"]
    )
    parser.add_argument("--nsample", type=int, default=1)
    parser.add_argument("--verbose", action="store_true")
    args = parser.parse_args()

    # List[Dict{"task_id", "solution"}]
    solutions = load_solutions(args.samples)

    # The dataset loaders are imported lazily so only the requested benchmark
    # is loaded. `choices=` above guarantees one of the two branches is taken.
    if args.dataset == "humaneval":
        from evalplus.data import get_human_eval_plus

        dataset = get_human_eval_plus()
        dataset_name = "HumanEval"
    elif args.dataset == "mbpp":
        from evalplus.data import get_mbpp_plus

        dataset = get_mbpp_plus()
        dataset_name = "Mbpp"

    # Group the samples by task id. A sample that only carries a "completion"
    # is turned into a full "solution" by prepending the task's prompt.
    id2solutions = {}
    for solution in solutions:
        task_id = solution["task_id"]
        if "solution" not in solution:
            # Explicit raise instead of `assert`: asserts are stripped under
            # `python -O`, which would hide this message behind a KeyError.
            if "completion" not in solution:
                raise ValueError("solution or completion must exist!")
            solution["solution"] = dataset[task_id]["prompt"] + solution["completion"]
        id2solutions.setdefault(task_id, []).append(solution)

    # Expected samples per task: the larger of --nsample and the largest group
    # actually seen. `default=0` keeps this from raising ValueError when no
    # samples were loaded at all; every task is then reported as missing below.
    most_samples = max((len(group) for group in id2solutions.values()), default=0)
    nsample = max(args.nsample, most_samples)

    print(colored("==============================", "blue"))
    print(colored(" ::: Checking completeness... ", "blue"))
    print(colored(" ::::: All tasks complete? ", "blue"))
    ndone = 0

    task_ids = dataset.keys()
    ntask = len(task_ids)
    for task_id in task_ids:
        if task_id not in id2solutions:
            print(colored(f" ⚠️ {task_id} is missing!", "red"))
            continue
        nfiles = len(id2solutions[task_id])
        if nfiles == nsample:
            ndone += 1
            continue
        print(
            colored(
                f" ⚠️ {task_id} only has {nfiles} samples! But {nsample} are expected.",
                "red",
            )
        )

    if ntask != ndone:
        ntbd = ntask - ndone
        print(colored(f" ::::: ⚠️ {ntbd}/{ntask} tasks incomplete!", "red"))
    else:
        print(colored(f" ::::: All {ntask} tasks complete!", "green"))

    print(colored("==============================", "blue"))
    print(colored(" ::: Checking compilation... ", "blue"))
    print(colored(" ::::: All code compilable? ", "blue"))
    ncode = 0
    nwrong = 0
    for task_id in task_ids:
        # Tasks with no samples were already reported as missing above.
        if task_id not in id2solutions:
            continue

        for solution in id2solutions[task_id]:
            ncode += 1
            code = solution["solution"]
            # NOTE(review): "_identifier" is presumably attached to each sample
            # by `load_solutions`; confirm against that loader.
            dbg_identifier = solution["_identifier"]
            if code.strip() == "":
                print(colored(f" ⚠️ {dbg_identifier} is empty!", "red"))
                nwrong += 1
            elif not syntax_check(code, args.verbose):
                print(colored(f" ⚠️ {dbg_identifier} is not compilable!", "red"))
                nwrong += 1

    if 0 != nwrong:
        print(colored(f" ::::: ⚠️ {nwrong}/{ncode} code are not compilable!", "red"))
    else:
        print(colored(f" ::::: All {ncode} code are compilable!", "green"))