import contextlib import faulthandler import functools import io import multiprocessing import os import platform import signal import tempfile from typing import Any # use cache to avoid loading the same file multiple times # which can leads to too many open files error @functools.lru_cache(maxsize=128) def load_file(filepath: str) -> str: with open(filepath, 'r') as f: content = f.read() return content """Check the correctness of a program by running a test suite. Modified from: https://github.com/openai/human-eval/blob/master/human_eval/execution.py """ def unsafe_execute( result: Any, solution_code: str, test_code: str, timeout: float = 10, ): with create_tempdir(): # These system calls are needed when cleaning up tempdir. import os import shutil rmtree = shutil.rmtree rmdir = os.rmdir chdir = os.chdir # Disable functionalities that can make destructive changes to the test. reliability_guard() # Construct the check program and run it. check_program = solution_code + '\n' + test_code try: exec_globals = {} with swallow_io(): with time_limit(timeout): # WARNING # This program exists to execute untrusted model-generated code. Although # it is highly unlikely that model-generated code will do something overtly # malicious in response to this test suite, model-generated code may act # destructively due to a lack of model capability or alignment. # Users are strongly encouraged to sandbox this evaluation suite so that it # does not perform destructive actions on their host or network. For more # information on how OpenAI sandboxes its code, see the accompanying paper. # Once you have read this disclaimer and taken appropriate precautions, # uncomment the following line and proceed at your own risk: exec(check_program, exec_globals) result.append('passed') except TimeoutException: result.append('timed out') except BaseException as e: result.append(f'failed: {e}') # Needed for cleaning up. shutil.rmtree = rmtree os.rmdir = rmdir os.chdir = chdir def check_correctness( solution_code: str, test_code: str, timeout: float = 10, completion_id: int | None = None, ) -> dict: """Evaluates the functional correctness of a completion by running the test suite provided in the problem. :param completion_id: an optional completion ID so we can match the results later even if execution finishes asynchronously. """ manager = multiprocessing.Manager() result = manager.list() p = multiprocessing.Process( target=unsafe_execute, args=(result, solution_code, test_code, timeout) ) p.start() p.join(timeout=timeout + 1) if p.is_alive(): p.kill() if not result: result.append('timed out') return dict( success=result[0] == 'passed', result=result[0], completion_id=completion_id, ) @contextlib.contextmanager def time_limit(seconds: float): def signal_handler(signum, frame): raise TimeoutException('Timed out!') signal.setitimer(signal.ITIMER_REAL, seconds) signal.signal(signal.SIGALRM, signal_handler) try: yield finally: signal.setitimer(signal.ITIMER_REAL, 0) @contextlib.contextmanager def swallow_io(): stream = WriteOnlyStringIO() with contextlib.redirect_stdout(stream): with contextlib.redirect_stderr(stream): with redirect_stdin(stream): yield @contextlib.contextmanager def create_tempdir(): # with tempfile.TemporaryDirectory() as dirname: # Manually do this to avoid too many open files error caused by TemporaryDirectory dirname = tempfile.mkdtemp() with chdir(dirname): yield dirname os.rmdir(dirname) class TimeoutException(Exception): pass class WriteOnlyStringIO(io.StringIO): """StringIO that throws an exception when it's read from""" def read(self, *args, **kwargs): raise IOError def readline(self, *args, **kwargs): raise IOError def readlines(self, *args, **kwargs): raise IOError def readable(self, *args, **kwargs): """Returns True if the IO object can be read.""" return False class redirect_stdin(contextlib._RedirectStream): # type: ignore _stream = 'stdin' @contextlib.contextmanager def chdir(root): if root == '.': yield return cwd = os.getcwd() os.chdir(root) try: yield except BaseException as exc: raise exc finally: os.chdir(cwd) def reliability_guard(maximum_memory_bytes: int | None = None): """This disables various destructive functions and prevents the generated code from interfering with the test (e.g. fork bomb, killing other processes, removing filesystem files, etc.) Warning: This function is NOT a security sandbox. Untrusted code, including, model- generated code, should not be blindly executed outside of one. See the Codex paper for more information about OpenAI's code sandbox, and proceed with caution. """ if maximum_memory_bytes is not None: import resource resource.setrlimit( resource.RLIMIT_AS, (maximum_memory_bytes, maximum_memory_bytes) ) resource.setrlimit( resource.RLIMIT_DATA, (maximum_memory_bytes, maximum_memory_bytes) ) if not platform.uname().system == 'Darwin': resource.setrlimit( resource.RLIMIT_STACK, (maximum_memory_bytes, maximum_memory_bytes) ) faulthandler.disable() import builtins builtins.exit = None builtins.quit = None import os os.environ['OMP_NUM_THREADS'] = '1' os.kill = None os.system = None os.putenv = None os.remove = None os.removedirs = None os.rmdir = None os.fchdir = None os.setuid = None os.fork = None os.forkpty = None os.killpg = None os.rename = None os.renames = None os.truncate = None os.replace = None os.unlink = None os.fchmod = None os.fchown = None os.chmod = None os.chown = None os.chroot = None os.fchdir = None os.lchflags = None os.lchmod = None os.lchown = None os.getcwd = None os.chdir = None import shutil shutil.rmtree = None shutil.move = None shutil.chown = None import subprocess subprocess.Popen = None # type: ignore __builtins__['help'] = None import sys sys.modules['ipdb'] = None sys.modules['joblib'] = None sys.modules['resource'] = None sys.modules['psutil'] = None sys.modules['tkinter'] = None