Spaces:
Runtime error
Runtime error
File size: 4,221 Bytes
c87c295 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 |
"""Purpose of this file: Sanitize the code produced by LLMs for the following reasons.
1. Vicuna-generated code can be missing one space of indentation (three spaces instead of four). We restore the missing space to make Vicuna more capable.
2. {Our fault lol.} We found more EOF tokens after generation, so we truncate the messy code that follows them.
"""
import os
from tqdm import tqdm
from evalplus.data import (
get_human_eval_plus,
get_mbpp_plus,
load_solutions,
write_directory,
write_jsonl,
)
from evalplus.sanitize import sanitize
def remove_unindented_lines(code, protect_before, execeptions, trim_tails):
    """Remove column-0 lines from ``code`` and truncate it at a tail marker.

    Lines are visited in order and handled as follows:

    * The first line that starts with ``protect_before`` is always kept.
    * Blank (whitespace-only) lines are always kept.
    * Lines starting with any prefix in ``execeptions`` are always kept.
    * Any other line that has no leading whitespace is removed.
    * When a line that was not kept by the rules above starts with any
      prefix in ``trim_tails``, that line and every line after it are
      removed and scanning stops. Only trailing whitespace is stripped
      before this comparison, so a prefix must include the line's
      indentation to match an indented line.

    Note that unindented lines *before* the protected line are removed too
    unless they match ``execeptions``; only the protected line itself is
    exempt.

    Args:
        code: Source text to filter.
        protect_before: Prefix identifying the single line to protect.
        execeptions: Iterable of prefixes whose lines are never removed.
        trim_tails: Iterable of prefixes that trigger truncation.

    Returns:
        The surviving lines joined with newlines (no trailing newline).
    """
    lines = code.splitlines()
    # str.startswith accepts a tuple of prefixes; build each tuple once so
    # one-shot iterables are not exhausted after the first line.
    keep_prefixes = tuple(execeptions)
    tail_prefixes = tuple(trim_tails)
    # A set gives O(1) membership in the final filter and absorbs the
    # duplicate index produced when a column-0 line is also a tail marker.
    cut_idx = set()
    protected_seen = False
    for i, line in enumerate(lines):
        if not protected_seen and line.startswith(protect_before):
            protected_seen = True
            continue
        if not line.strip():
            continue
        if line.startswith(keep_prefixes):
            continue

        lspace = len(line) - len(line.lstrip())
        if lspace == 0:
            cut_idx.add(i)

        if line.rstrip().startswith(tail_prefixes):
            # cut off this line and everything behind it
            cut_idx.update(range(i, len(lines)))
            break

    return "\n".join(line for i, line in enumerate(lines) if i not in cut_idx)
def to_four_space_indents(old_code):
    """Pad every line indented by exactly three characters with one space.

    A line whose leading whitespace is exactly three characters long gets
    one extra space prepended; every other line is left as it is. Each
    output line is terminated with a newline, so a non-empty input always
    yields text that ends with a newline, and an empty input yields an
    empty string.
    """

    def _pad(line):
        # Width of the leading whitespace run of this line.
        indent_width = len(line) - len(line.lstrip())
        prefix = " " if indent_width == 3 else ""
        return prefix + line + "\n"

    return "".join(map(_pad, old_code.splitlines()))
if __name__ == "__main__":
    # Command-line entry point: load LLM-generated samples, run each one
    # through `sanitize`, and write the results next to the input (or over it
    # when --inplace is given).
    import argparse
    import pathlib

    parser = argparse.ArgumentParser()
    parser.add_argument("--samples", type=str, required=True)
    parser.add_argument("--eofs", nargs="+", type=str, default=[])
    parser.add_argument("--inplace", action="store_true")
    parser.add_argument(
        "--rm-prefix-lines", type=str, help="Remove lines starting with this"
    )
    parser.add_argument(
        "--dataset", required=True, type=str, choices=["humaneval", "mbpp"]
    )
    parser.add_argument(
        "--debug-task", type=str, help="Enter the task ID to only sanitize that task."
    )

    args = parser.parse_args()

    # `choices` above restricts --dataset to exactly these two values.
    if args.dataset == "humaneval":
        dataset = get_human_eval_plus()
    elif args.dataset == "mbpp":
        dataset = get_mbpp_plus()

    # task_id -> entry_point
    entry_point = {
        task_id: problem["entry_point"] for task_id, problem in dataset.items()
    }

    # Derive the output location: same parent, name marked with "-sanitized".
    is_folder = os.path.isdir(args.samples)
    target_path = pathlib.Path(args.samples)
    if not args.inplace:
        if is_folder:
            new_name = target_path.name + "-sanitized"
        else:
            # Insert the marker before the final extension. Unlike replacing
            # the literal ".jsonl", this always yields a name different from
            # the input, so the input is never overwritten without --inplace.
            new_name = target_path.stem + "-sanitized" + target_path.suffix
        target_path = target_path.parent / new_name
    target_path = str(target_path)

    nsan = 0  # number of samples whose code was changed by sanitization
    ntotal = 0  # number of samples processed (after the --debug-task filter)

    new_solutions = []

    for solution in tqdm(load_solutions(args.samples)):
        task_id = solution["task_id"]
        dbg_identifier = solution["_identifier"]
        if args.debug_task is not None and task_id != args.debug_task:
            continue

        ntotal += 1
        if "solution" in solution:
            old_code = solution["solution"]
        elif "completion" in solution:
            # A bare completion is only the continuation of the prompt, so
            # prepend the task prompt to obtain the full code.
            old_code = dataset[task_id]["prompt"] + "\n" + solution["completion"]
        else:
            # Explicit error instead of `assert`, which is stripped under -O.
            raise ValueError(
                f"Sample {dbg_identifier} has neither 'solution' nor 'completion'"
            )

        old_code = old_code.strip()

        new_code = sanitize(
            old_code=old_code,
            entry_point=entry_point[task_id],
            rm_prefix_lines=args.rm_prefix_lines,
            eofs=args.eofs,
        ).strip()

        # if changed, print the message
        if new_code != old_code:
            msg = "Sanitized: " + dbg_identifier
            if is_folder:
                msg += " -> " + dbg_identifier.replace(args.samples, target_path)
            print(msg)
            nsan += 1

        new_solutions.append({"task_id": task_id, "solution": new_code})

    # Write results in the same layout (directory vs. jsonl file) as the input.
    if is_folder:
        write_directory(target_path, new_solutions)
    else:
        write_jsonl(target_path, new_solutions)

    print(f"Sanitized {nsan} out of {ntotal} files.")
|