Spaces:

jjyang7
/

oe-eval-bcb-lite-evaluator

Sleeping

oe-eval-bcb-lite-evaluator / local_evaluator.py

jjyang77

add local_evaluator and some cleanup

bb636ca 7 months ago

3.05 kB

	import argparse
	import asyncio
	import json
	from typing import List, Optional, Tuple

	import httpx

	_parser = argparse.ArgumentParser()

	_parser.add_argument("--filename", type=str, help="filename like data/codgen-...jsonl")
	_parser.add_argument("--remoteapi", type=str, help="remote execution API if not running local eval")


	def load_jsonl(filename):
	    """Read a JSON Lines file into a list of decoded objects.

	    Args:
	        filename: Path of the ``.jsonl`` file to read.

	    Returns:
	        A list with one decoded JSON value per non-blank line, in file order.

	    Raises:
	        json.JSONDecodeError: If a non-blank line is not valid JSON.
	    """
	    # Decode as UTF-8 explicitly so the result does not depend on the
	    # platform's default locale encoding.
	    with open(filename, "r", encoding="utf-8") as file:
	        # Whitespace-only lines (e.g. a trailing empty line) are skipped;
	        # json.loads("") would otherwise raise. json.loads already tolerates
	        # surrounding whitespace, so no explicit strip of the payload is needed.
	        return [json.loads(line) for line in file if line.strip()]

	def save_jsonl(filename, data):
	    """Write records to ``filename`` in JSON Lines format.

	    Args:
	        filename: Destination path. An existing file is overwritten; missing
	            parent directories are created.
	        data: Iterable of objects that ``json.dumps`` can serialise; each one
	            becomes a single line of output.

	    Returns:
	        The ``filename`` argument, unchanged.
	    """
	    import os  # function-scope import keeps this block self-contained

	    # Create the target directory up front so a long evaluation run is not
	    # lost to a FileNotFoundError at the very last step.
	    parent = os.path.dirname(filename)
	    if parent:
	        os.makedirs(parent, exist_ok=True)

	    with open(filename, "w", encoding="utf-8") as file:
	        file.writelines(json.dumps(record) + "\n" for record in data)
	    return filename

	async def call_oe_eval_bcb_client(
	    samples_data: List[dict],
	    calibrate: bool = True,
	    parallel: int = -1,
	    min_time_limit: float = 1,
	    max_as_limit: int = 30 * 1024,
	    max_data_limit: int = 30 * 1024,
	    max_stack_limit: int = 10,
	    no_gt: bool = True,
	    execute_api: Optional[str] = None,
	) -> Tuple[Optional[List[dict]], Optional[float]]:
	    """Call the OE-Eval BigCodeBench code execution API and score the results.

	    ``samples_data`` is POSTed as the JSON body. ``calibrate``, ``parallel``,
	    ``min_time_limit``, ``max_as_limit``, ``max_data_limit``,
	    ``max_stack_limit`` and ``no_gt`` are forwarded unchanged as query
	    parameters.

	    Args:
	        samples_data: Sample records to send to the execution API.
	        execute_api: Endpoint URL. Defaults to
	            ``http://localhost:9000/evaluate/`` when ``None``.

	    Returns:
	        ``(check_results, pass_at_1)``. ``check_results`` is a flat list of
	        the per-sample result dicts with ``solution``/``status``/``details``
	        renamed to ``tested_completion``/``passed``/``exec_result``
	        (``passed`` is ``True`` iff ``status == "pass"``). ``pass_at_1`` is
	        the fraction of those entries that passed. Both are ``None`` when the
	        API returned no entries.

	    Raises:
	        httpx.HTTPStatusError: If the API answers with a 4xx/5xx status.
	        KeyError: If the response lacks the expected ``"eval"`` field or an
	            entry lacks ``solution``/``status``/``details``.
	    """
	    if execute_api is None:
	        execute_api = "http://localhost:9000/evaluate/"

	    params = {
	        "calibrate": calibrate,
	        "parallel": parallel,
	        "min_time_limit": min_time_limit,
	        "max_as_limit": max_as_limit,
	        "max_data_limit": max_data_limit,
	        "max_stack_limit": max_stack_limit,
	        "no_gt": no_gt,
	    }
	    # Even for the full BCB dataset, total execution time should not exceed
	    # 5-10 min unless many of the generated codes are particularly malformed
	    # or slow (per-instance exec timeout is 30 sec).
	    total_timeout = 900

	    async with httpx.AsyncClient() as client:
	        response = await client.post(
	            execute_api, json=samples_data, params=params, timeout=total_timeout
	        )
	        # Fail with the real HTTP error instead of an opaque JSON/KeyError
	        # further down when the server rejects the request.
	        response.raise_for_status()
	        results = response.json()

	    print("Results received from remote API. Processing ...")
	    check_results = []
	    for doc in results["eval"].values():
	        for rep in doc:
	            # Rename the API's fields in place to the names used downstream.
	            rep["tested_completion"] = rep.pop("solution")
	            rep["passed"] = rep.pop("status") == "pass"
	            rep["exec_result"] = rep.pop("details")
	            check_results.append(rep)

	    if not check_results:
	        return None, None

	    pass_at_1 = sum(rep["passed"] for rep in check_results) / len(check_results)
	    return check_results, pass_at_1

	def evaluate(sample_file, execute_api: Optional[str] = None):
	    """Evaluate the samples stored in a JSON Lines file and print pass@1.

	    Args:
	        sample_file: Path of the ``.jsonl`` file holding the samples.
	        execute_api: Execution API URL, passed through to
	            ``call_oe_eval_bcb_client`` (``None`` selects its default).

	    Returns:
	        The first element of the ``(results, pass_at_1)`` pair returned by
	        ``call_oe_eval_bcb_client``.
	    """
	    samples = load_jsonl(sample_file)
	    request = call_oe_eval_bcb_client(
	        samples_data=samples,
	        calibrate=True,
	        parallel=-1,
	        min_time_limit=30,
	        execute_api=execute_api,
	    )
	    # Drive the coroutine to completion on a fresh event loop.
	    check_results, score = asyncio.run(request)
	    print("pass@1:", score)
	    return check_results

	def main():
	    """Command-line entry point: evaluate ``--filename`` and save the results.

	    Parses the command line, runs ``evaluate`` on the given sample file
	    (optionally against ``--remoteapi``), and writes the results to
	    ``data/eval_results.jsonl``. Exits with a usage error when ``--filename``
	    is missing, and skips writing when the evaluation produced no results.
	    """
	    args = _parser.parse_args()
	    if args.filename is None:
	        # Report a proper usage error rather than failing later in open(None).
	        _parser.error("--filename is required")

	    results = evaluate(args.filename, args.remoteapi)
	    if results is None:
	        # evaluate() yields None when the API returned no entries; writing
	        # would only truncate the output file and then fail.
	        print("No evaluation results returned; nothing saved.")
	        return
	    save_jsonl("data/eval_results.jsonl", results)


	if __name__ == "__main__":
	    main()