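# Gradio front end for BigCodeBench: builds a `bigcodebench.evaluate` command line
# from the form inputs, runs it in a background thread, and mirrors its combined
# stdout/stderr into a log file that the UI polls.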
import gradio as gr
import subprocess
import sys
import os
import threading
import time

class Logger:
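    """Tee stdout: write every message to both the real terminal and a log file."""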
    def __init__(self, filename):
        self.terminal = sys.stdout
        self.log = open(filename, "w")

    def write(self, message):
        self.terminal.write(message)
        self.log.write(message)
        self.log.flush()

    def flush(self):
        self.terminal.flush()
        self.log.flush()

    def isatty(self):
        # Report "not a terminal" so libraries skip interactive/ANSI output.
        return False

# Mirror everything printed to stdout into a log file that the UI can poll.
log_file = "bigcodebench_output.log"
sys.stdout = Logger(log_file)

default_command = "bigcodebench.evaluate"
is_running = False

def generate_command(
    jsonl_file, split, subset, save_pass_rate, parallel,
    min_time_limit, max_as_limit, max_data_limit, max_stack_limit,
    check_gt_only, no_gt
):
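    """Assemble the bigcodebench.evaluate command line from the form inputs.

    For illustration, an uploaded samples.jsonl with the UI defaults yields roughly:
        bigcodebench.evaluate --samples samples.jsonl --split complete --subset full
        --min-time-limit 1 --max-as-limit 131072 --max-data-limit 4096 --max-stack-limit 5
    """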
    command = [default_command]
    
    if jsonl_file is not None:
        samples = os.path.basename(jsonl_file.name)
        command.extend(["--samples", samples])
    
    command.extend(["--split", split, "--subset", subset])
    
    if save_pass_rate:
        command.append("--save_pass_rate")
    
    if parallel is not None and parallel != 0:
        command.extend(["--parallel", str(int(parallel))])
    
    command.extend([
        "--min-time-limit", str(min_time_limit),
        "--max-as-limit", str(int(max_as_limit)),
        "--max-data-limit", str(int(max_data_limit)),
        "--max-stack-limit", str(int(max_stack_limit))
    ])
    
    if check_gt_only:
        command.append("--check-gt-only")
    
    if no_gt:
        command.append("--no-gt")
    
    return " ".join(command)

def run_bigcodebench(command):
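    """Run the evaluation command, streaming its output to stdout (and thus the log),
    then clean up any stray bigcodebench processes once it exits."""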
    global is_running
    is_running = True
    print(f"Executing command: {command}")
    
    process = subprocess.Popen(command.split(), stdout=subprocess.PIPE, stderr=subprocess.STDOUT, universal_newlines=True)
    
    for line in process.stdout:
        print(line, end='')
    
    process.wait()
    
    if process.returncode != 0:
        print(f"Error: Command exited with status {process.returncode}")
    
    # Kill any leftover bigcodebench processes owned by this user and wipe /tmp.
    cleanup_command = "pids=$(ps -u $(id -u) -o pid,comm | grep 'bigcodebench' | awk '{print $1}'); if [ -n \"$pids\" ]; then echo $pids | xargs -r kill; fi; rm -rf /tmp/*"
    subprocess.run(cleanup_command, shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    
    is_running = False

def read_logs():
    with open(log_file, "r") as f:
        return f.read()

with gr.Blocks() as demo:
    gr.Markdown("# BigCodeBench Evaluator")
    
    with gr.Row():
        jsonl_file = gr.File(label="Upload JSONL file", file_types=[".jsonl"])
        split = gr.Dropdown(choices=["complete", "instruct"], label="Split", value="complete")
        subset = gr.Dropdown(choices=["full", "hard"], label="Subset", value="full")
    
    with gr.Row():
        save_pass_rate = gr.Checkbox(label="Save Pass Rate")
        parallel = gr.Number(label="Parallel (optional)", precision=0)
        min_time_limit = gr.Number(label="Min Time Limit", value=1, precision=1)
        max_as_limit = gr.Number(label="Max AS Limit", value=128*1024, precision=0)
    
    with gr.Row():
        max_data_limit = gr.Number(label="Max Data Limit", value=4*1024, precision=0)
        max_stack_limit = gr.Number(label="Max Stack Limit", value=5, precision=0)
        check_gt_only = gr.Checkbox(label="Check GT Only")
        no_gt = gr.Checkbox(label="No GT")
    
    command_output = gr.Textbox(label="Command", lines=2, value=default_command, interactive=False)
    submit_btn = gr.Button("Run Evaluation")
    log_output = gr.Textbox(label="Execution Logs", lines=10)
    
    def update_command(*args):
        return generate_command(*args)
    
    input_components = [
        jsonl_file, split, subset, save_pass_rate, parallel,
        min_time_limit, max_as_limit, max_data_limit, max_stack_limit,
        check_gt_only, no_gt
    ]
    
    # Refresh the command preview whenever any input changes.
    for component in input_components:
        component.change(update_command, inputs=input_components, outputs=command_output)
    
    def on_submit(command):
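        """Start the evaluation in a background thread; refuse if one is already running."""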
        global is_running
        if is_running:
            return "A command is already running. Please wait for it to finish."
        threading.Thread(target=run_bigcodebench, args=(command,), daemon=True).start()
        return "Evaluation started. Please wait for the logs to update..."
    
    submit_btn.click(on_submit, inputs=[command_output], outputs=[log_output])
    
    # Poll the log file every second so the textbox tracks the running evaluation.
    demo.load(read_logs, None, log_output, every=1)

if __name__ == "__main__":
    # Listen on all interfaces so the app is reachable when run in a container/Space.
    demo.queue(max_size=300).launch(server_name="0.0.0.0", server_port=7860)