terryyz committed on
Commit
55f4d70
1 Parent(s): 939fbb4

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +139 -0
app.py ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import subprocess
3
+ import sys
4
+ import os
5
+ import threading
6
+ import time
7
+
8
class Logger:
    """Tee-style text stream that mirrors every write to stdout and a log file.

    The stream that is ``sys.stdout`` at construction time is kept as
    ``terminal``; ``filename`` is truncated and opened for writing as ``log``.
    """

    def __init__(self, filename):
        """Capture the current ``sys.stdout`` and open ``filename`` for writing.

        Args:
            filename: Path of the log file. Any existing content is discarded.
        """
        self.terminal = sys.stdout
        # Explicit encoding so non-ASCII output does not depend on the
        # process locale.
        self.log = open(filename, "w", encoding="utf-8")

    def write(self, message):
        """Write ``message`` to both targets and flush the log file."""
        self.terminal.write(message)
        self.log.write(message)
        # Flush on every write so readers of the file see output immediately.
        self.log.flush()

    def flush(self):
        """Flush both the terminal stream and the log file."""
        self.terminal.flush()
        self.log.flush()

    def isatty(self):
        """Always report a non-interactive stream."""
        return False

    def close(self):
        """Close the log file; the wrapped terminal stream is left open.

        Writing after ``close()`` raises ``ValueError`` from the closed file.
        """
        self.log.close()
24
+
25
# Executable name that every generated command line starts with.
default_command = "bigcodebench.evaluate"

# True while run_bigcodebench() is executing; checked before starting a run.
is_running = False

# From here on, everything printed by this process is also written to
# log_file through the Logger wrapper installed as sys.stdout.
log_file = "bigcodebench_output.log"
sys.stdout = Logger(log_file)
30
+
31
def generate_command(
    jsonl_file, split, subset, save_pass_rate, parallel,
    min_time_limit, max_as_limit, max_data_limit, max_stack_limit,
    check_gt_only, no_gt
):
    """Assemble the evaluation command line from the form values.

    Args:
        jsonl_file: Uploaded file object or ``None``; only the base name of
            its ``.name`` attribute is passed to ``--samples``.
        split: Value for ``--split``.
        subset: Value for ``--subset``.
        save_pass_rate: When truthy, add ``--save_pass_rate``.
        parallel: Worker count; omitted when ``None`` or zero, otherwise
            truncated to an integer.
        min_time_limit: Value for ``--min-time-limit`` (passed through ``str``).
        max_as_limit: Value for ``--max-as-limit`` (truncated to an integer).
        max_data_limit: Value for ``--max-data-limit`` (truncated to an integer).
        max_stack_limit: Value for ``--max-stack-limit`` (truncated to an integer).
        check_gt_only: When truthy, add ``--check-gt-only``.
        no_gt: When truthy, add ``--no-gt``.

    Returns:
        The command as a single space-separated string.
    """
    parts = [default_command]

    if jsonl_file is not None:
        parts += ["--samples", os.path.basename(jsonl_file.name)]

    parts += ["--split", split, "--subset", subset]

    if save_pass_rate:
        parts.append("--save_pass_rate")

    if not (parallel is None or parallel == 0):
        parts += ["--parallel", str(int(parallel))]

    # The time limit keeps its fractional part; the other limits are integers.
    parts += ["--min-time-limit", str(min_time_limit)]
    integer_limits = (
        ("--max-as-limit", max_as_limit),
        ("--max-data-limit", max_data_limit),
        ("--max-stack-limit", max_stack_limit),
    )
    for flag, limit in integer_limits:
        parts += [flag, str(int(limit))]

    if check_gt_only:
        parts.append("--check-gt-only")

    if no_gt:
        parts.append("--no-gt")

    return " ".join(parts)
64
+
65
def run_bigcodebench(command):
    """Run ``command`` as a subprocess, echo its output, then clean up.

    The module-level ``is_running`` flag is set for the duration of the call
    and is always reset, even when starting or reading the process fails, so
    a failed run cannot block later runs.

    Args:
        command: Command line as a single string. It is split on whitespace,
            so arguments that contain spaces are not supported.

    Raises:
        OSError: If the executable cannot be started (propagated from
            ``subprocess.Popen``).
    """
    global is_running
    is_running = True
    try:
        print(f"Executing command: {command}")

        # stderr is merged into stdout so both reach the log in order.
        # The context manager closes the pipe and reaps the child on exit.
        with subprocess.Popen(
            command.split(),
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            universal_newlines=True,
        ) as process:
            for line in process.stdout:
                print(line, end='')
            process.wait()

        if process.returncode != 0:
            print(f"Error: Command exited with status {process.returncode}")
    finally:
        try:
            # Kill leftover processes of this user whose name matches
            # 'bigcodebench', then wipe /tmp.
            # NOTE(review): `rm -rf /tmp/*` removes everything under /tmp that
            # this user may delete, not only files created by the evaluation.
            cleanup_command = "pids=$(ps -u $(id -u) -o pid,comm | grep 'bigcodebench' | awk '{print $1}'); if [ -n \"$pids\" ]; then echo $pids | xargs -r kill; fi; rm -rf /tmp/*"
            subprocess.run(cleanup_command, shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
        finally:
            is_running = False
84
+
85
def read_logs():
    """Return the entire contents of ``log_file`` as text.

    The file is decoded as UTF-8 and undecodable bytes are replaced with
    U+FFFD, so arbitrary subprocess output cannot raise
    ``UnicodeDecodeError`` here.

    Raises:
        OSError: If the log file cannot be opened.
    """
    with open(log_file, "r", encoding="utf-8", errors="replace") as f:
        return f.read()
88
+
89
# Gradio UI: a form whose values build the command line, a read-only preview
# of that command, and a button that runs it and shows the captured log.
with gr.Blocks() as demo:
    gr.Markdown("# BigCodeBench Evaluation App")

    # Sample file and benchmark selection.
    with gr.Row():
        jsonl_file = gr.File(label="Upload JSONL file", file_types=[".jsonl"])
        split = gr.Dropdown(choices=["complete", "instruct"], label="Split", value="complete")
        subset = gr.Dropdown(choices=["full", "hard"], label="Subset", value="full")

    # Run options and the first resource limits.
    with gr.Row():
        save_pass_rate = gr.Checkbox(label="Save Pass Rate")
        parallel = gr.Number(label="Parallel (optional)", precision=0)
        min_time_limit = gr.Number(label="Min Time Limit", value=1, precision=1)
        max_as_limit = gr.Number(label="Max AS Limit", value=128*1024, precision=0)

    # Remaining resource limits and ground-truth switches.
    with gr.Row():
        max_data_limit = gr.Number(label="Max Data Limit", value=4*1024, precision=0)
        max_stack_limit = gr.Number(label="Max Stack Limit", value=5, precision=0)
        check_gt_only = gr.Checkbox(label="Check GT Only")
        no_gt = gr.Checkbox(label="No GT")

    command_output = gr.Textbox(label="Command", lines=2, value=default_command, interactive=False)
    submit_btn = gr.Button("Run Evaluation")
    log_output = gr.Textbox(label="Execution Logs", lines=10)

    def update_command(*args):
        """Rebuild the command preview from the current form values."""
        return generate_command(*args)

    # Order must match the positional parameters of generate_command().
    input_components = [
        jsonl_file, split, subset, save_pass_rate, parallel,
        min_time_limit, max_as_limit, max_data_limit, max_stack_limit,
        check_gt_only, no_gt
    ]

    # Changing any input refreshes the command preview.
    for component in input_components:
        component.change(update_command, inputs=input_components, outputs=command_output)

    def on_submit(command):
        """Run ``command`` and stream status text into the log textbox.

        Written as a generator so each ``yield`` updates the single output:
        first a status message, then, once the run has finished, the full
        log. Every path yields exactly one value per update, matching the
        one output component wired below.

        Args:
            command: Command line shown in the preview textbox.
        """
        if is_running:
            yield "A command is already running. Please wait for it to finish."
            return

        yield "Evaluation started. Please wait for the logs to update..."
        # Blocks this handler until the subprocess and its cleanup are done.
        run_bigcodebench(command)
        yield read_logs()

    submit_btn.click(on_submit, inputs=[command_output], outputs=[log_output])
137
+
138
# Start the web app only when this file is executed as a script.
if __name__ == "__main__":
    queued_demo = demo.queue()
    queued_demo.launch()