File size: 4,550 Bytes
55f4d70
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
import os
import shlex
import subprocess
import sys
import threading
import time

import gradio as gr

class Logger:
    """Tee-like stream that mirrors every write to the console and a file.

    ``terminal`` is whatever ``sys.stdout`` was when the instance was
    created; ``log`` is the file opened (and truncated) at *filename*.
    """

    def __init__(self, filename):
        """Remember the current ``sys.stdout`` and open *filename* for writing."""
        self.terminal = sys.stdout
        self.log = open(filename, "w")

    def write(self, message):
        """Write *message* to the terminal, then to the log, then flush the log."""
        for sink in (self.terminal, self.log):
            sink.write(message)
        # Flushed on every write so readers of the file see output promptly.
        self.log.flush()

    def flush(self):
        """Flush the terminal stream first and the log file second."""
        for sink in (self.terminal, self.log):
            sink.flush()

    def isatty(self):
        """Always report that this stream is not an interactive terminal."""
        return False

# Path of the file that receives a copy of everything written to stdout.
log_file = "bigcodebench_output.log"
# Replace stdout with the tee so every later print() also lands in log_file.
sys.stdout = Logger(log_file)

# Executable name that every generated command line starts with.
default_command = "bigcodebench.evaluate"
# Module-wide flag: True while run_bigcodebench() is executing a command.
is_running = False

def generate_command(
    jsonl_file, split, subset, save_pass_rate, parallel,
    min_time_limit, max_as_limit, max_data_limit, max_stack_limit,
    check_gt_only, no_gt
):
    """Build the evaluation command line from the current form values.

    Args:
        jsonl_file: Uploaded samples file, either an object exposing a
            ``name`` path attribute or a plain path string; ``None`` when
            nothing has been uploaded. Only the base name is used.
        split: Value for ``--split``.
        subset: Value for ``--subset``.
        save_pass_rate: When truthy, add ``--save_pass_rate``.
        parallel: Worker count for ``--parallel``; omitted when ``None`` or 0.
        min_time_limit: Value for ``--min-time-limit`` (passed through as-is).
        max_as_limit: Value for ``--max-as-limit`` (truncated to an int).
        max_data_limit: Value for ``--max-data-limit`` (truncated to an int).
        max_stack_limit: Value for ``--max-stack-limit`` (truncated to an int).
        check_gt_only: When truthy, add ``--check-gt-only``.
        no_gt: When truthy, add ``--no-gt``.

    Returns:
        The command as a single string. Arguments are shell-quoted, so a
        value containing whitespace stays one argument when the string is
        parsed with ``shlex.split`` or pasted into a shell.

    Any value that is ``None`` (for example a numeric field the user has
    cleared) causes its flag to be left out instead of raising ``TypeError``.
    """
    command = [default_command]

    if jsonl_file is not None:
        # Accept both a file-like upload object (``.name``) and a bare path.
        if isinstance(jsonl_file, (str, os.PathLike)):
            upload_path = os.fspath(jsonl_file)
        else:
            upload_path = jsonl_file.name
        command.extend(["--samples", os.path.basename(upload_path)])

    for flag, choice in (("--split", split), ("--subset", subset)):
        if choice is not None:
            command.extend([flag, str(choice)])

    if save_pass_rate:
        command.append("--save_pass_rate")

    # Both None and 0 mean "not specified" for the optional worker count.
    if parallel:
        command.extend(["--parallel", str(int(parallel))])

    if min_time_limit is not None:
        command.extend(["--min-time-limit", str(min_time_limit)])

    integer_limits = (
        ("--max-as-limit", max_as_limit),
        ("--max-data-limit", max_data_limit),
        ("--max-stack-limit", max_stack_limit),
    )
    for flag, limit in integer_limits:
        if limit is not None:
            command.extend([flag, str(int(limit))])

    if check_gt_only:
        command.append("--check-gt-only")

    if no_gt:
        command.append("--no-gt")

    return shlex.join(command)

def run_bigcodebench(command):
    """Run *command* as a subprocess, echo its output, then clean up.

    The command string is split into arguments with shell-style quoting
    rules and executed without a shell. Its combined stdout/stderr is
    printed line by line as it arrives. A non-zero exit status is reported
    with an ``Error:`` line.

    Args:
        command: The full command line as a single string.

    The module-level ``is_running`` flag is set for the duration of the call
    and is always cleared again, even when the command cannot be parsed or
    started. In those cases an ``Error:`` line is printed and the function
    returns without running the cleanup step, since nothing was launched.
    """
    global is_running
    is_running = True
    try:
        print(f"Executing command: {command}")

        try:
            argv = shlex.split(command)
        except ValueError as exc:
            # Raised by shlex for e.g. an unterminated quote.
            print(f"Error: Could not parse command: {exc}")
            return

        if not argv:
            print("Error: Empty command")
            return

        try:
            process = subprocess.Popen(
                argv,
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                universal_newlines=True,
            )
        except OSError as exc:
            # Executable missing, not executable, etc.
            print(f"Error: Could not start command: {exc}")
            return

        # The context manager closes the pipe and waits for the process.
        with process:
            for line in process.stdout:
                print(line, end='')

        if process.returncode != 0:
            print(f"Error: Command exited with status {process.returncode}")

        # NOTE(review): this kills every process of the current user whose
        # name matches 'bigcodebench' and deletes everything under /tmp,
        # including files unrelated to this run. Confirm this is intended
        # for the deployment environment before reusing it elsewhere.
        cleanup_command = "pids=$(ps -u $(id -u) -o pid,comm | grep 'bigcodebench' | awk '{print $1}'); if [ -n \"$pids\" ]; then echo $pids | xargs -r kill; fi; rm -rf /tmp/*"
        subprocess.run(cleanup_command, shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    finally:
        # Never leave the app locked in the "already running" state.
        is_running = False

def read_logs():
    """Return the entire current contents of the log file as one string."""
    with open(log_file, "r") as log_handle:
        contents = log_handle.read()
    return contents

# Layout and event wiring of the evaluation UI.
with gr.Blocks() as demo:
    gr.Markdown("# BigCodeBench Evaluation App")

    # Row 1: samples upload and dataset selection.
    with gr.Row():
        jsonl_file = gr.File(label="Upload JSONL file", file_types=[".jsonl"])
        split = gr.Dropdown(choices=["complete", "instruct"], label="Split", value="complete")
        subset = gr.Dropdown(choices=["full", "hard"], label="Subset", value="full")

    # Row 2: pass-rate switch, parallelism and the first resource limits.
    with gr.Row():
        save_pass_rate = gr.Checkbox(label="Save Pass Rate")
        parallel = gr.Number(label="Parallel (optional)", precision=0)
        min_time_limit = gr.Number(label="Min Time Limit", value=1, precision=1)
        max_as_limit = gr.Number(label="Max AS Limit", value=128*1024, precision=0)

    # Row 3: remaining resource limits and ground-truth switches.
    with gr.Row():
        max_data_limit = gr.Number(label="Max Data Limit", value=4*1024, precision=0)
        max_stack_limit = gr.Number(label="Max Stack Limit", value=5, precision=0)
        check_gt_only = gr.Checkbox(label="Check GT Only")
        no_gt = gr.Checkbox(label="No GT")

    # Read-only preview of the command that "Run Evaluation" will execute.
    command_output = gr.Textbox(label="Command", lines=2, value=default_command, interactive=False)
    submit_btn = gr.Button("Run Evaluation")
    log_output = gr.Textbox(label="Execution Logs", lines=10)

    def update_command(*args):
        """Rebuild the command preview from the current input values."""
        return generate_command(*args)

    # Order matters: it must match generate_command's positional parameters.
    input_components = [
        jsonl_file, split, subset, save_pass_rate, parallel,
        min_time_limit, max_as_limit, max_data_limit, max_stack_limit,
        check_gt_only, no_gt
    ]

    # Any change to any input refreshes the command preview.
    for component in input_components:
        component.change(update_command, inputs=input_components, outputs=command_output)

    def on_submit(command):
        """Run *command* and stream status text into the log box.

        Implemented as a generator so the log box is updated in steps:
        first a "started" notice, then the log file contents once the
        command has finished. If a command is already running, only a
        notice is shown and nothing is executed.
        """
        if is_running:
            yield "A command is already running. Please wait for it to finish."
            return

        yield "Evaluation started. Please wait for the logs to update..."
        run_bigcodebench(command)
        yield read_logs()

    # Single output component: every yielded value replaces the log text.
    submit_btn.click(on_submit, inputs=[command_output], outputs=log_output)

# Start the app only when this file is executed as a script; queue() is
# called on the Blocks app before launch().
if __name__ == "__main__":
    demo.queue().launch()