|
import argparse
import asyncio
import json
import os
from typing import List, Optional, Tuple

import httpx
|
|
|
# Command-line interface: the JSONL samples file to evaluate and an optional
# execution API endpoint.
_parser = argparse.ArgumentParser()
_parser.add_argument(
    "--filename",
    type=str,
    # Without a samples file there is nothing to evaluate; reject the call
    # up front instead of failing later when the path is opened.
    required=True,
    help="filename like data/codgen-...jsonl",
)
_parser.add_argument(
    "--remoteapi",
    type=str,
    help="remote execution API if not running local eval",
)
|
|
|
|
|
def load_jsonl(filename):
    """Read a JSON Lines file and return its records as a list.

    Args:
        filename: Path of the file to read; each non-blank line must hold
            one JSON document.

    Returns:
        A list with one decoded object per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Read as UTF-8 explicitly so decoding does not depend on the platform's
    # default locale encoding.
    with open(filename, "r", encoding="utf-8") as file:
        # Skip blank/whitespace-only lines (e.g. a trailing empty line), which
        # would otherwise make json.loads fail on an empty string.
        return [json.loads(line) for line in file if line.strip()]
|
|
|
def save_jsonl(filename, data):
    """Write an iterable of JSON-serializable records as a JSON Lines file.

    Args:
        filename: Destination path. Missing parent directories are created.
        data: Iterable of objects; each is serialized with ``json.dumps`` and
            written on its own line.

    Returns:
        The ``filename`` that was passed in.

    Raises:
        OSError: If the directory or file cannot be created or written.
        TypeError: If a record is not JSON serializable.
    """
    # Create the target directory first so a fresh checkout without e.g. a
    # "data/" folder does not fail with FileNotFoundError.
    parent = os.path.dirname(filename)
    if parent:
        os.makedirs(parent, exist_ok=True)

    with open(filename, "w", encoding="utf-8") as file:
        file.writelines(json.dumps(record) + "\n" for record in data)
    return filename
|
|
|
async def call_oe_eval_bcb_client(
    samples_data: List[dict],
    calibrate: bool = True,
    parallel: int = -1,
    min_time_limit: float = 1,
    max_as_limit: int = 30 * 1024,
    max_data_limit: int = 30 * 1024,
    max_stack_limit: int = 10,
    no_gt: bool = True,
    execute_api: Optional[str] = None,
) -> Tuple[Optional[List[dict]], Optional[float]]:
    """Evaluate samples through the OE-Eval BigCodeBench remote execution API.

    ``samples_data`` is POSTed as the JSON body; ``calibrate``, ``parallel``,
    ``min_time_limit``, ``max_as_limit``, ``max_data_limit``,
    ``max_stack_limit`` and ``no_gt`` are forwarded unchanged as query
    parameters.

    The response is expected to be a JSON object whose ``"eval"`` entry maps
    to lists of per-sample result dicts containing ``"solution"``,
    ``"status"`` and ``"details"``. Each result dict is rewritten in place:
    ``solution`` -> ``tested_completion``, ``status`` -> ``passed`` (True iff
    the status equals ``"pass"``) and ``details`` -> ``exec_result``.

    Args:
        samples_data: Samples to evaluate, sent as the request body.
        execute_api: Endpoint URL; defaults to
            ``http://localhost:9000/evaluate/`` when None.

    Returns:
        ``(check_results, pass_at_1)`` where ``check_results`` is the flat
        list of rewritten result dicts and ``pass_at_1`` is the fraction of
        them that passed. ``(None, None)`` when the API returned no results.

    Raises:
        httpx.HTTPStatusError: If the API answers with a 4xx/5xx status.
        KeyError: If the response lacks ``"eval"`` or a result dict lacks one
            of the expected keys.
    """
    if execute_api is None:
        execute_api = "http://localhost:9000/evaluate/"

    params = {
        "calibrate": calibrate,
        "parallel": parallel,
        "min_time_limit": min_time_limit,
        "max_as_limit": max_as_limit,
        "max_data_limit": max_data_limit,
        "max_stack_limit": max_stack_limit,
        "no_gt": no_gt,
    }
    # One request carries the whole batch, so allow a generous timeout.
    total_timeout = 900

    async with httpx.AsyncClient() as client:
        response = await client.post(
            execute_api, json=samples_data, params=params, timeout=total_timeout
        )
        # Surface HTTP failures directly instead of as a confusing KeyError or
        # JSON decode error further down.
        response.raise_for_status()
        results = response.json()

    print("Results received from remote API. Processing ...")
    check_results = []
    for doc in results["eval"].values():
        for rep in doc:
            rep["tested_completion"] = rep.pop("solution")
            rep["passed"] = rep.pop("status") == "pass"
            rep["exec_result"] = rep.pop("details")
            check_results.append(rep)

    if not check_results:
        return None, None

    pass_at_1 = sum(rep["passed"] for rep in check_results) / len(check_results)
    return check_results, pass_at_1
|
|
|
def evaluate(sample_file, execute_api: Optional[str] = None):
    """Evaluate the samples in a JSONL file and print the pass@1 score.

    Loads ``sample_file`` with ``load_jsonl``, submits the records through
    ``call_oe_eval_bcb_client`` (calibrate on, ``parallel=-1``, minimum time
    limit 30) and prints the resulting pass@1 value.

    Args:
        sample_file: Path of the JSONL file holding the samples.
        execute_api: Endpoint passed through to the client; None lets the
            client pick its default.

    Returns:
        The per-sample results produced by the client (its first return
        value).
    """
    samples = load_jsonl(sample_file)
    request = call_oe_eval_bcb_client(
        samples_data=samples,
        calibrate=True,
        parallel=-1,
        min_time_limit=30,
        execute_api=execute_api,
    )
    # Drive the async client to completion from this synchronous entry point.
    results, pass_at_1 = asyncio.run(request)
    print("pass@1:", pass_at_1)
    return results
|
|
|
def main():
    """CLI entry point: evaluate the given samples file and save the results.

    Parses ``--filename`` and ``--remoteapi`` from the command line, runs
    ``evaluate`` on them and writes the results to
    ``data/eval_results.jsonl``.
    """
    args = _parser.parse_args()
    results = evaluate(args.filename, args.remoteapi)
    # evaluate() returns None when the API reported no results; write an
    # empty file in that case rather than crashing while iterating None.
    if results is None:
        results = []
    save_jsonl("data/eval_results.jsonl", results)
|
|
|
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()