File size: 4,148 Bytes
55f4d70
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45b26c3
 
 
 
55f4d70
45b26c3
 
 
 
 
 
55f4d70
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3459dd9
8624024
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55f4d70
8624024
 
55f4d70
8624024
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
import os
import shlex
import subprocess
import sys
import threading

import gradio as gr

class Logger:
    """File-like object that mirrors everything written to it.

    Each message goes to the stream that was ``sys.stdout`` when the
    instance was created and to a log file.
    """

    def __init__(self, filename):
        """Capture the current ``sys.stdout`` and open *filename* for writing.

        The file is opened in ``"w"`` mode, so an existing file with the
        same name is truncated.
        """
        self.terminal = sys.stdout
        self.log = open(filename, "w")

    def write(self, message):
        """Send *message* to the captured stream, then to the log file."""
        for sink in (self.terminal, self.log):
            sink.write(message)
        # Flush the file after every write so the on-disk log stays current.
        self.log.flush()

    def flush(self):
        """Flush the captured stream, then the log file."""
        for sink in (self.terminal, self.log):
            sink.flush()

    def isatty(self):
        """Report that this stream is not a terminal."""
        return False

# Path of the log file: Logger writes to it and read_logs() reads it back.
log_file = "bigcodebench_output.log"
# Replace sys.stdout at import time so everything printed from here on goes
# through Logger, i.e. to the previous stdout and to the log file.
sys.stdout = Logger(log_file)

def generate_command(
    jsonl_file, split, subset, save_pass_rate, parallel,
    min_time_limit, max_as_limit, max_data_limit, max_stack_limit,
    check_gt_only, no_gt
):
    """Build the ``bigcodebench.evaluate`` command line from the form values.

    Args:
        jsonl_file: Uploaded samples file. Either a path (``str`` or
            ``os.PathLike``) or an object exposing the path as ``.name``.
            ``None`` means nothing has been uploaded yet.
        split: Value passed to ``--split``.
        subset: Value passed to ``--subset``.
        save_pass_rate: When truthy, append ``--save_pass_rate``.
        parallel: Value for ``--parallel`` (converted to an int); the flag
            is omitted when this is ``None`` or ``0``.
        min_time_limit: Value for ``--min-time-limit``, passed through
            ``str()`` unchanged; the flag is omitted when ``None``.
        max_as_limit: Value for ``--max-as-limit`` (converted to an int);
            the flag is omitted when ``None``.
        max_data_limit: Value for ``--max-data-limit`` (converted to an
            int); the flag is omitted when ``None``.
        max_stack_limit: Value for ``--max-stack-limit`` (converted to an
            int); the flag is omitted when ``None``.
        check_gt_only: When truthy, append ``--check-gt-only``.
        no_gt: When truthy, append ``--no-gt``.

    Returns:
        The command as a single space-separated string, or the message
        ``"Please upload a JSONL file"`` when *jsonl_file* is ``None``.
    """
    if jsonl_file is None:
        return "Please upload a JSONL file"

    # Accept a bare path as well as a file wrapper that carries it in .name.
    if isinstance(jsonl_file, (str, os.PathLike)):
        samples_path = os.fspath(jsonl_file)
    else:
        samples_path = jsonl_file.name
    # NOTE(review): only the base name is passed on, so the evaluator has to
    # resolve it relative to its own working directory - confirm.
    samples = os.path.basename(samples_path)

    command = [
        "bigcodebench.evaluate",
        "--split", split,
        "--subset", subset,
        "--samples", samples,
    ]

    if save_pass_rate:
        command.append("--save_pass_rate")

    if parallel is not None and parallel != 0:
        command.extend(["--parallel", str(int(parallel))])

    # A limit of None is skipped, the same way ``parallel`` is, instead of
    # being rendered as the text "None" or crashing inside int().
    if min_time_limit is not None:
        command.extend(["--min-time-limit", str(min_time_limit)])

    integer_limits = (
        ("--max-as-limit", max_as_limit),
        ("--max-data-limit", max_data_limit),
        ("--max-stack-limit", max_stack_limit),
    )
    for flag, value in integer_limits:
        if value is not None:
            command.extend([flag, str(int(value))])

    if check_gt_only:
        command.append("--check-gt-only")

    if no_gt:
        command.append("--no-gt")

    # Arguments are joined with single spaces and not quoted, so a samples
    # name containing whitespace does not survive as one argument.
    return " ".join(command)

def run_bigcodebench(command):
    """Run *command* as a subprocess and echo its combined output via ``print``.

    Args:
        command: Command line as a single string. It is tokenised with
            shell-style quoting rules, so an argument containing spaces can
            be written in quotes. No shell is used to run it.

    Returns:
        None. The command text, the child's stdout/stderr, and any failure
        (empty command, executable that cannot be started, non-zero exit
        status) are all reported through ``print``.
    """
    print(f"Executing command: {command}")

    # NOTE(review): the text is executed as given; if it can be edited by
    # untrusted users, consider restricting it to the expected executable.
    if not command:
        argv = []
    else:
        try:
            argv = shlex.split(command)
        except ValueError:
            # Unbalanced quotes: fall back to plain whitespace splitting.
            argv = command.split()

    if not argv:
        print("Error: No command to execute")
        return

    try:
        process = subprocess.Popen(
            argv,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            universal_newlines=True,
        )
    except OSError as exc:
        # Typically a missing or non-executable program; report it through
        # print instead of letting the exception escape the caller's thread.
        print(f"Error: Could not start command: {exc}")
        return

    # Leaving the context closes the pipe and waits for the child to exit.
    with process:
        for line in process.stdout:
            print(line, end='')

    if process.returncode != 0:
        print(f"Error: Command exited with status {process.returncode}")

def read_logs():
    """Return everything currently stored in the log file as one string."""
    with open(log_file, mode="r") as log_handle:
        contents = log_handle.read()
    return contents

def run():
    """Assemble the Gradio interface for BigCodeBench and launch it.

    The page holds the evaluation options, a command box that is
    regenerated whenever any option changes, a button that runs the
    command text on a background thread, and a log box that is filled from
    ``read_logs`` (registered on page load with ``every=1``).
    """
    with gr.Blocks() as demo:
        gr.Markdown("# BigCodeBench Evaluation App")

        # Row 1: samples upload and dataset selection.
        with gr.Row():
            samples_upload = gr.File(label="Upload JSONL file", file_types=[".jsonl"])
            split_choice = gr.Dropdown(
                choices=["complete", "instruct"], label="Split", value="complete"
            )
            subset_choice = gr.Dropdown(
                choices=["full", "hard"], label="Subset", value="full"
            )

        # Row 2: pass-rate switch, parallelism and the first two limits.
        with gr.Row():
            pass_rate_box = gr.Checkbox(label="Save Pass Rate")
            parallel_field = gr.Number(label="Parallel (optional)", precision=0)
            time_limit_field = gr.Number(label="Min Time Limit", value=1, precision=1)
            as_limit_field = gr.Number(
                label="Max AS Limit", value=128 * 1024, precision=0
            )

        # Row 3: remaining limits and the ground-truth switches.
        with gr.Row():
            data_limit_field = gr.Number(
                label="Max Data Limit", value=4 * 1024, precision=0
            )
            stack_limit_field = gr.Number(label="Max Stack Limit", value=5, precision=0)
            gt_only_box = gr.Checkbox(label="Check GT Only")
            no_gt_box = gr.Checkbox(label="No GT")

        command_box = gr.Textbox(label="Command", lines=2)
        run_button = gr.Button("Run Evaluation")
        log_box = gr.Textbox(label="Execution Logs", lines=10)

        def update_command(*args):
            return generate_command(*args)

        # Listed in the positional order generate_command expects.
        form_fields = [
            samples_upload, split_choice, subset_choice, pass_rate_box,
            parallel_field, time_limit_field, as_limit_field,
            data_limit_field, stack_limit_field, gt_only_box, no_gt_box,
        ]

        # Any change to any field rebuilds the command from all fields.
        for field in form_fields:
            field.change(update_command, inputs=form_fields, outputs=command_box)

        def on_submit(command):
            # Run on a daemon thread so the click handler returns at once.
            worker = threading.Thread(
                target=run_bigcodebench,
                args=(command,),
                daemon=True,
            )
            worker.start()
            return "Evaluation started. Please wait for the logs to update..."

        run_button.click(on_submit, inputs=[command_box], outputs=[log_box])

        demo.load(read_logs, None, log_box, every=1)

    queued_app = demo.queue()
    queued_app.launch()
    
# Build and launch the app only when this file is executed as a script.
if __name__ == "__main__":
    run()