Spaces:
Build error
Build error
Jialin Song
committed on
Commit
·
9d03b23
1
Parent(s):
cbe9336
update apps_metric to provide outputs
Browse files
- testing_util.py +53 -16
- utils.py +1 -1
testing_util.py
CHANGED
|
@@ -54,7 +54,8 @@ def run_test(sample, test=None, debug=False):
|
|
| 54 |
otherwise it'll just return an input and output pair.
|
| 55 |
"""
|
| 56 |
# Disable functionalities that can make destructive changes to the test.
|
| 57 |
-
|
|
|
|
| 58 |
|
| 59 |
if debug:
|
| 60 |
print(f"start = {datetime.now().time()}")
|
|
@@ -99,7 +100,7 @@ def run_test(sample, test=None, debug=False):
|
|
| 99 |
if debug:
|
| 100 |
print(f"type 0 compilation error = {e}")
|
| 101 |
results.append(-2)
|
| 102 |
-
return results
|
| 103 |
signal.alarm(0)
|
| 104 |
|
| 105 |
elif which_type == CODE_TYPE.standard_input:
|
|
@@ -156,6 +157,7 @@ def run_test(sample, test=None, debug=False):
|
|
| 156 |
results.append(-2)
|
| 157 |
return results
|
| 158 |
|
|
|
|
| 159 |
for index, inputs in enumerate(in_outs["inputs"]):
|
| 160 |
# JSON forces dictionaries to have string keys; this undoes this (assuming a singleton list)
|
| 161 |
try:
|
|
@@ -200,6 +202,15 @@ def run_test(sample, test=None, debug=False):
|
|
| 200 |
|
| 201 |
# reset the alarm
|
| 202 |
signal.alarm(0)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 203 |
except Exception as e:
|
| 204 |
signal.alarm(0)
|
| 205 |
faulthandler.disable()
|
|
@@ -234,6 +245,10 @@ def run_test(sample, test=None, debug=False):
|
|
| 234 |
results.append(-1)
|
| 235 |
signal.alarm(0)
|
| 236 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 237 |
if not passed:
|
| 238 |
if debug:
|
| 239 |
nl = "\n"
|
|
@@ -246,7 +261,12 @@ def run_test(sample, test=None, debug=False):
|
|
| 246 |
if passed and debug:
|
| 247 |
print(f"==> output = {output}, test outputs = {in_outs['outputs'][index]}")
|
| 248 |
|
| 249 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 250 |
tmp_result = True
|
| 251 |
results.append(tmp_result)
|
| 252 |
continue
|
|
@@ -391,26 +411,42 @@ def run_test(sample, test=None, debug=False):
|
|
| 391 |
if not isinstance(inputs, list):
|
| 392 |
print(f"output = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs.replace(nl,' new-line ')}, {type(inputs)}, {output == [in_outs['outputs'][index]]}")
|
| 393 |
else:
|
| 394 |
-
print(f"output = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs}, {type(inputs)}, {output == [in_outs['outputs'][index]]}")
|
| 395 |
-
|
| 396 |
|
| 397 |
-
return results
|
| 398 |
|
| 399 |
|
| 400 |
def custom_compare_(output, ground_truth):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 401 |
|
| 402 |
if isinstance(output, list):
|
| 403 |
-
|
| 404 |
-
|
| 405 |
-
|
|
|
|
|
|
|
|
|
|
| 406 |
|
| 407 |
-
|
| 408 |
-
|
| 409 |
-
|
| 410 |
-
|
| 411 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 412 |
|
| 413 |
-
return False
|
| 414 |
|
| 415 |
def stripped_string_compare(s1, s2):
|
| 416 |
s1 = s1.lstrip().rstrip()
|
|
@@ -427,6 +463,7 @@ def call_method(method, inputs):
|
|
| 427 |
# sys.setrecursionlimit(10000)
|
| 428 |
|
| 429 |
# @patch('builtins.input', side_effect=inputs.split("\n"))
|
|
|
|
| 430 |
@patch('builtins.open', mock_open(read_data=inputs))
|
| 431 |
@patch('sys.stdin', StringIO(inputs))
|
| 432 |
@patch('sys.stdin.readline', lambda *args: next(inputs_line_iterator))
|
|
@@ -522,4 +559,4 @@ def reliability_guard(maximum_memory_bytes=None):
|
|
| 522 |
sys.modules["joblib"] = None
|
| 523 |
sys.modules["resource"] = None
|
| 524 |
sys.modules["psutil"] = None
|
| 525 |
-
sys.modules["tkinter"] = None
|
|
|
|
| 54 |
otherwise it'll just return an input and output pair.
|
| 55 |
"""
|
| 56 |
# Disable functionalities that can make destructive changes to the test.
|
| 57 |
+
# TODO: disable for now as it interferes with GPT-4 generation through gateway
|
| 58 |
+
# reliability_guard()
|
| 59 |
|
| 60 |
if debug:
|
| 61 |
print(f"start = {datetime.now().time()}")
|
|
|
|
| 100 |
if debug:
|
| 101 |
print(f"type 0 compilation error = {e}")
|
| 102 |
results.append(-2)
|
| 103 |
+
return results, {}
|
| 104 |
signal.alarm(0)
|
| 105 |
|
| 106 |
elif which_type == CODE_TYPE.standard_input:
|
|
|
|
| 157 |
results.append(-2)
|
| 158 |
return results
|
| 159 |
|
| 160 |
+
program_outputs = {}
|
| 161 |
for index, inputs in enumerate(in_outs["inputs"]):
|
| 162 |
# JSON forces dictionaries to have string keys; this undoes this (assuming a singleton list)
|
| 163 |
try:
|
|
|
|
| 202 |
|
| 203 |
# reset the alarm
|
| 204 |
signal.alarm(0)
|
| 205 |
+
|
| 206 |
+
program_outputs[index] = {
|
| 207 |
+
"pass": tmp_result,
|
| 208 |
+
"pass_pct": int(tmp_result),
|
| 209 |
+
"pass_res": [int(tmp_result)],
|
| 210 |
+
"output": output,
|
| 211 |
+
"input": inputs,
|
| 212 |
+
"ground_truth": in_outs["outputs"][index]
|
| 213 |
+
}
|
| 214 |
except Exception as e:
|
| 215 |
signal.alarm(0)
|
| 216 |
faulthandler.disable()
|
|
|
|
| 245 |
results.append(-1)
|
| 246 |
signal.alarm(0)
|
| 247 |
|
| 248 |
+
program_outputs[index] = {"output": output}
|
| 249 |
+
program_outputs[index]["ground_truth"] = in_outs['outputs'][index]
|
| 250 |
+
program_outputs[index]["input"] = in_outs['inputs'][index]
|
| 251 |
+
|
| 252 |
if not passed:
|
| 253 |
if debug:
|
| 254 |
nl = "\n"
|
|
|
|
| 261 |
if passed and debug:
|
| 262 |
print(f"==> output = {output}, test outputs = {in_outs['outputs'][index]}")
|
| 263 |
|
| 264 |
+
all_pass, pass_pct, pass_res = custom_compare_(output, in_outs['outputs'][index])
|
| 265 |
+
program_outputs[index]["pass"] = all_pass
|
| 266 |
+
program_outputs[index]["pass_pct"] = pass_pct
|
| 267 |
+
program_outputs[index]["pass_res"] = pass_res
|
| 268 |
+
|
| 269 |
+
if all_pass:
|
| 270 |
tmp_result = True
|
| 271 |
results.append(tmp_result)
|
| 272 |
continue
|
|
|
|
| 411 |
if not isinstance(inputs, list):
|
| 412 |
print(f"output = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs.replace(nl,' new-line ')}, {type(inputs)}, {output == [in_outs['outputs'][index]]}")
|
| 413 |
else:
|
| 414 |
+
print(f"output = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs}, {type(inputs)}, {output == [in_outs['outputs'][index]]}")
|
|
|
|
| 415 |
|
| 416 |
+
return results, program_outputs
|
| 417 |
|
| 418 |
|
| 419 |
def custom_compare_(output, ground_truth):
    """Compare a program's output with the expected output, line by line.

    Both sides are normalised the same way before pairing: surrounding
    whitespace of the whole text is removed, the text is split on newlines,
    and each line is compared after stripping its own surrounding whitespace.

    Args:
        output: Lines printed by the program, as a list of strings. Any
            other type is treated as a failed comparison.
        ground_truth: Expected output, either one newline-separated string
            or a list/tuple of line strings.

    Returns:
        A tuple ``(all_pass, pass_pct, pass_res)``:
            all_pass: True only if every expected line is matched and the
                program printed no extra lines.
            pass_pct: Fraction of line positions that matched (0.0 - 1.0).
            pass_res: One ``1``/``0`` flag per line position; a position
                present on only one side counts as ``0``.
    """
    if not isinstance(output, list):
        return False, 0.0, []

    # Accept the expected output as a sequence of lines as well as a string.
    if isinstance(ground_truth, (list, tuple)):
        ground_truth = "\n".join(ground_truth)

    expected_lines = ground_truth.strip().split("\n")
    # Normalise the actual output exactly like the expected one so that
    # leading/trailing blank lines do not shift the line-by-line pairing.
    actual_lines = "\n".join(output).strip().split("\n")

    pass_res = [
        int(actual.strip() == expected.strip())
        for actual, expected in zip(actual_lines, expected_lines)
    ]

    # zip() stops at the shorter side: lines that are missing from, or
    # surplus in, the program output must count as mismatches rather than
    # being silently ignored.
    total = max(len(actual_lines), len(expected_lines))
    pass_res.extend([0] * (total - len(pass_res)))

    # str.split always yields at least one element, so total >= 1.
    correct = sum(pass_res)
    return correct == total, correct / total, pass_res
|
| 450 |
|
| 451 |
def stripped_string_compare(s1, s2):
|
| 452 |
s1 = s1.lstrip().rstrip()
|
|
|
|
| 463 |
# sys.setrecursionlimit(10000)
|
| 464 |
|
| 465 |
# @patch('builtins.input', side_effect=inputs.split("\n"))
|
| 466 |
+
@patch('builtins.input', lambda *args: next(inputs_line_iterator))
|
| 467 |
@patch('builtins.open', mock_open(read_data=inputs))
|
| 468 |
@patch('sys.stdin', StringIO(inputs))
|
| 469 |
@patch('sys.stdin.readline', lambda *args: next(inputs_line_iterator))
|
|
|
|
| 559 |
sys.modules["joblib"] = None
|
| 560 |
sys.modules["resource"] = None
|
| 561 |
sys.modules["psutil"] = None
|
| 562 |
+
sys.modules["tkinter"] = None
|
utils.py
CHANGED
|
@@ -48,7 +48,7 @@ def evaluate_generations(generations: list, indices: list = [], level: str = "al
|
|
| 48 |
"""
|
| 49 |
|
| 50 |
# generations are code generations in the same order of the dataset
|
| 51 |
-
apps_eval = load_dataset(DATASET, split="
|
| 52 |
|
| 53 |
if indices is None:
|
| 54 |
indices = range(len(generations))
|
|
|
|
| 48 |
"""
|
| 49 |
|
| 50 |
# generations are code generations in the same order of the dataset
|
| 51 |
+
apps_eval = load_dataset(DATASET, level, split="train")
|
| 52 |
|
| 53 |
if indices is None:
|
| 54 |
indices = range(len(generations))
|