jjyang77 committed
Commit
0f87dc1
1 Parent(s): f5be131

app first pass

Files changed (11):
  1. .dockerignore +11 -0
  2. .gitignore +9 -0
  3. Dockerfile +35 -0
  4. README.md +1 -1
  5. api/__init__.py +0 -0
  6. api/app.py +158 -0
  7. api/bigcodebench_data.py +46 -0
  8. api/code_execution.py +524 -0
  9. dev.sh +11 -0
  10. prod.sh +10 -0
  11. requirements.txt +0 -74
.dockerignore ADDED
@@ -0,0 +1,11 @@
+.dockerignore
+
+# Python cache files
+**.pyc
+**/__pycache__
+
+# Testing data
+/data
+
+# Environment file
+.env
.gitignore ADDED
@@ -0,0 +1,9 @@
+# Python cache files
+**.pyc
+**/__pycache__
+
+# Testing data
+/data
+
+# Environment file
+.env
Dockerfile ADDED
@@ -0,0 +1,35 @@
+# Better use newer Python as generated code can use new features
+FROM python:3.10-slim
+
+# Install git, g++, python3-tk and other system dependencies
+RUN apt-get update && apt-get install -y git g++ python3-tk zip unzip procps r-base
+
+# Upgrade to the latest pip
+RUN pip install --upgrade pip
+
+# Add a new user "bigcodebenchuser"
+#RUN adduser --disabled-password --gecos "" bigcodebenchuser
+
+#RUN rm -rf /bigcodebench
+
+# Acquire benchmark code to local
+# ADD "https://api.github.com/repos/bigcode-project/bigcodebench/commits?per_page=1" latest_commit
+# RUN git clone https://github.com/bigcode-project/bigcodebench.git /bigcodebench
+
+#RUN cd /bigcodebench
+
+# Pre-install the dataset
+#RUN python3 -c "from bigcodebench.data import get_bigcodebench; get_bigcodebench()"
+
+RUN pip install fastapi gunicorn uvicorn[standard] httpx #pydantic==2.*
+
+RUN pip install -I --timeout 2000 -r https://github.com/bigcode-project/bigcodebench-annotation/releases/download/v0.1.0/requirements.txt
+
+COPY . .
+
+WORKDIR /
+
+# Start the FastAPI app on port 7860, the default port expected by Spaces
+# CMD ["uvicorn", "api.app:app", "--host", "0.0.0.0", "--port", "7860"]
+# ENTRYPOINT [ "./dev.sh" ]
+ENTRYPOINT [ "./prod.sh" ]
README.md CHANGED
@@ -1,5 +1,5 @@
 ---
-title: Bcb Evaluator Testing
+title: OE Eval Bcb Evaluator Testing
 emoji: 🐢
 colorFrom: green
 colorTo: pink
api/__init__.py ADDED
File without changes
api/app.py ADDED
@@ -0,0 +1,158 @@
+import logging
+import os
+from collections import Counter, defaultdict
+import multiprocessing
+from datetime import datetime
+from concurrent.futures import ProcessPoolExecutor, as_completed
+from typing import Dict, List, Tuple
+import gc
+
+from fastapi import FastAPI
+from fastapi.responses import RedirectResponse
+
+from api.bigcodebench_data import load_solutions
+from api.code_execution import untrusted_check
+
+Result = Tuple[str, List[bool]]
+
+def create_app() -> FastAPI:
+
+    level = os.environ.get("LOG_LEVEL", default=logging.INFO)
+    logging.basicConfig(level=level)
+    logger = logging.getLogger(__name__)
+
+    app = FastAPI()
+
+    @app.get("/")
+    def root():
+        return RedirectResponse("/docs")
+
+    @app.get("/health", status_code=204)
+    def health():
+        return
+
+    @app.post("/evaluate/")
+    async def evaluate(
+        samples: str,
+        parallel: int = -1,
+        min_time_limit: float = 1,
+        max_as_limit: int = 30 * 1024,
+        max_data_limit: int = 30 * 1024,
+        max_stack_limit: int = 10,
+        no_gt: bool = True,
+    ) -> dict:
+        """
+        Evaluate the correctness of the solutions in the given samples file.
+        """
+        if parallel < 1:
+            n_workers = max(1, multiprocessing.cpu_count() // 2)
+        else:
+            n_workers = parallel
+
+        if not no_gt:
+            expected_time = get_groundtruth()
+        else:
+            expected_time = {}
+
+        results = {
+            "date": datetime.now().strftime("%Y-%m-%d %H:%M"),
+            "eval": {},
+        }
+
+        with ProcessPoolExecutor(max_workers=n_workers) as executor:
+            futures = []
+            completion_id = Counter()
+            n_samples = 0
+            eval_results = defaultdict(list)  # task_id -> list of results
+            remainings = set()
+
+            for sample in load_solutions(samples):
+                task_id = sample["task_id"]
+
+                solution = sample["solution"]
+
+                if "sanitized-calibrated" in samples:
+                    solution = sample["code_prompt"] + "\n    pass\n" + solution
+                remainings.add(sample["_identifier"])
+                args = (
+                    completion_id[task_id],
+                    sample["res_id"],
+                    task_id,
+                    solution,
+                    sample["test"],
+                    sample["entry_point"],
+                    max_as_limit,
+                    max_data_limit,
+                    max_stack_limit,
+                    sample["_identifier"],
+                    min_time_limit,
+                    expected_time.get(task_id) or 20,
+                )
+                futures.append(executor.submit(check_correctness, *args))
+                completion_id[task_id] += 1
+                n_samples += 1
+
+            assert n_samples == len(remainings), "Missing problems in unfinished"
+            # assert len(completion_id) == len(problems), "Missing problems in samples"
+
+            for future in as_completed(futures):
+                result = future.result()
+                remainings.remove(result["_identifier"])
+                eval_results[result["task_id"]].append(result)
+                del future, result
+                gc.collect()
+
+        # sort the results for each problem by completion_id
+        for task_id, task_results in eval_results.items():
+            task_results.sort(key=lambda x: x["completion_id"])
+            results["eval"][task_id] = []
+            for res in task_results:
+                stat, details = res["base"]
+                results["eval"][task_id].append(
+                    {
+                        "res_id": res["res_id"],
+                        "task_id": task_id,
+                        "solution": res["solution"],
+                        "status": stat,
+                        "details": details,
+                    }
+                )
+        return results
+
+    return app
+
+def check_correctness(
+    completion_id: int,
+    res_id: int,
+    task_id: str,
+    solution: str,
+    test: str,
+    entry_point: str,
+    max_as_limit: float,
+    max_data_limit: float,
+    max_stack_limit: float,
+    identifier=None,
+    min_time_limit: float = 0.1,
+    gt_time_limit: float = 2.0,
+) -> Dict[str, Result]:
+    ret = {
+        "completion_id": completion_id,
+        "res_id": res_id,
+        "task_id": task_id,
+        "_identifier": identifier,
+        "solution": solution,
+    }
+    ret["base"] = untrusted_check(
+        solution,
+        test,
+        entry_point,
+        max_as_limit,
+        max_data_limit,
+        max_stack_limit,
+        min_time_limit,
+        gt_time_limit,
+    )
+    return ret

def get_groundtruth():
    raise NotImplementedError("Groundtruth execution is not implemented yet.")
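For reference, a minimal sketch of calling the `/evaluate/` endpoint from a client. The host, port, and file path here are assumptions; note that `samples` is read as a server-side path via a query parameter, not as an upload:

    # Hypothetical client call; host, port, and samples path are illustrative.
    import httpx

    resp = httpx.post(
        "http://localhost:7860/evaluate/",
        params={"samples": "data/samples.jsonl", "no_gt": True},
        timeout=None,  # evaluation may take minutes; disable the client timeout
    )
    resp.raise_for_status()
    results = resp.json()
    for task_id, entries in results["eval"].items():
        print(task_id, [e["status"] for e in entries])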
api/bigcodebench_data.py ADDED
@@ -0,0 +1,46 @@
+import os
+import json
+import gzip
+from typing import Dict, Iterable
+
+def stream_jsonl(filename: str) -> Iterable[Dict]:
+    """
+    Parses each jsonl line and yields it as a dictionary.
+    """
+    if filename.endswith(".gz"):
+        with open(filename, "rb") as gzfp:
+            with gzip.open(gzfp, "rt") as fp:
+                for line in fp:
+                    if any(not x.isspace() for x in line):
+                        yield json.loads(line)
+    else:
+        with open(filename, "r") as fp:
+            for line in fp:
+                if any(not x.isspace() for x in line):
+                    yield json.loads(line)
+
+
+def load_solutions(sample_path: os.PathLike) -> Iterable[Dict]:
+    """We accept two formats of inputs.
+    + `sample.jsonl`, which is the format from BigCodeBench, i.e., {task_id, completion or solution}.
+    + A folder that contains sub-folders named after the task_id. Each sub-folder
+      contains samples named `[?].py`, where `?` is the solution id starting from 0.
+      Unlike `sample.jsonl`, these solutions must be complete (with the prompt prefix).
+    """
+    # if it is a file
+    if os.path.isfile(sample_path):
+        for i, sample in enumerate(stream_jsonl(sample_path)):
+            assert "task_id" in sample, "No task_id found in sample!"
+            assert "res_id" in sample, "No res_id found in sample!"
+            assert "test" in sample, "No test found in sample!"
+            assert "solution" in sample, "No solution found in sample!"
+            assert isinstance(
+                sample["solution"], str
+            ), "Solution must be a string! If you have multiple solutions, please repeat the task_id."
+
+            sample["_identifier"] = (
+                sample["task_id"] + f" (line {i + 1} in {sample_path})"
+            )
+            yield sample
+    else:
+        raise NotImplementedError("Only jsonl solution output file is supported for now.")
api/code_execution.py ADDED
@@ -0,0 +1,524 @@
+# The MIT License
+#
+# Copyright (c) OpenAI (https://openai.com)
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+
+import contextlib
+import faulthandler
+import tempfile
+import platform
+import itertools
+import io
+import os
+import sys
+import time
+import types
+import unittest
+import subprocess
+import signal
+import multiprocessing
+from multiprocessing import Value, Manager
+from typing import List, Tuple, Union
+
+import numpy as np
+
+TIMEOUT_LIMIT = 30.0  # BCB default is 240.0
+
+
+@contextlib.contextmanager
+def swallow_subprocess_output():
+    """Context manager to swallow stdout and stderr for subprocesses."""
+    original_popen = subprocess.Popen
+    original_run = subprocess.run
+
+    def _popen_patch(*args, **kwargs):
+        if 'capture_output' in kwargs and kwargs['capture_output']:
+            # Avoid setting stdout or stderr if capture_output is True
+            kwargs.pop('stdout', None)
+            kwargs.pop('stderr', None)
+        else:
+            kwargs.setdefault('stdout', subprocess.PIPE)
+            kwargs.setdefault('stderr', subprocess.PIPE)
+        return original_popen(*args, **kwargs)
+
+    def _run_patch(*args, **kwargs):
+        if 'capture_output' in kwargs and kwargs['capture_output']:
+            # Avoid setting stdout or stderr if capture_output is True
+            kwargs.pop('stdout', None)
+            kwargs.pop('stderr', None)
+        else:
+            kwargs.setdefault('stdout', subprocess.PIPE)
+            kwargs.setdefault('stderr', subprocess.PIPE)
+        return original_run(*args, **kwargs)
+
+    subprocess.Popen = _popen_patch
+    subprocess.run = _run_patch
+    try:
+        yield
+    finally:
+        subprocess.Popen = original_popen
+        subprocess.run = original_run
+
+
+@contextlib.contextmanager
+def swallow_io():
+    stream = WriteOnlyStringIO()
+    with contextlib.redirect_stdout(stream):
+        with contextlib.redirect_stderr(stream):
+            with redirect_stdin(stream):
+                with swallow_subprocess_output():
+                    yield
+
+
+@contextlib.contextmanager
+def time_limit(seconds: float):
+    def signal_handler(signum, frame):
+        raise TimeoutException("Timed out!")
+
+    signal.setitimer(signal.ITIMER_REAL, seconds)
+    signal.signal(signal.SIGALRM, signal_handler)
+    try:
+        yield
+    finally:
+        signal.setitimer(signal.ITIMER_REAL, 0)
+
+
+@contextlib.contextmanager
+def create_tempdir():
+    with tempfile.TemporaryDirectory() as dirname:
+        with chdir(dirname):
+            yield dirname
+
+
+@contextlib.contextmanager
+def chdir(root):
+    if root == ".":
+        yield
+        return
+    cwd = os.getcwd()
+    os.chdir(root)
+    try:
+        yield
+    except BaseException as exc:
+        raise exc
+    finally:
+        os.chdir(cwd)
+
+
+@contextlib.contextmanager
+def safe_environment():
+    # Save original functions
+    original_kill = os.kill
+    original_killpg = os.killpg
+    original_system = os.system
+    original_subprocess_call = subprocess.call
+    original_subprocess_check_output = subprocess.check_output
+    original_subprocess_run = subprocess.run
+    original_subprocess_popen = subprocess.Popen
+    original_os_popen = os.popen
+    original_os_execv = os.execv
+    original_os_execvp = os.execvp
+    original_os_execvpe = os.execvpe
+
+    current_pid = os.getpid()
+    current_pgid = os.getpgid(current_pid)
+    manager = multiprocessing.Manager()
+    child_pids = manager.list()
+
+    def safe_kill(pid, sig):
+        try:
+            pgid = os.getpgid(pid)
+            if pid == current_pid or pid in child_pids:
+                original_kill(pid, sig)
+            else:
+                print(f"Prevented attempt to kill PID {pid} with signal {sig}")
+        except ProcessLookupError:
+            pass
+
+    def safe_killpg(pgid, sig):
+        if pgid == current_pgid or pgid in {os.getpgid(pid) for pid in child_pids}:
+            original_killpg(pgid, sig)
+        else:
+            print(f"Prevented attempt to kill PGID {pgid} with signal {sig}")
+
+    def safe_system(command):
+        print(f"Intercepted system command: {command}")
+        if 'kill' in command or 'killall' in command:
+            return 0  # Simulate successful execution without doing anything
+        return original_system(command)
+
+    def safe_subprocess_call(command, *args, **kwargs):
+        print(f"Intercepted subprocess call: {command}")
+        if 'kill' in command or 'killall' in command:
+            return 0  # Simulate successful execution without doing anything
+        return original_subprocess_call(command, *args, **kwargs)
+
+    def safe_subprocess_check_output(command, *args, **kwargs):
+        print(f"Intercepted command: {command}")
+        if 'ps' in command:
+            return b""  # Simulate no processes found
+        return original_subprocess_check_output(command, *args, **kwargs)
+
+    def safe_subprocess_run(*args, **kwargs):
+        print(f"Intercepted subprocess run command: {args}")
+        if 'kill' in args[0] or 'killall' in args[0]:
+            return subprocess.CompletedProcess(args, 0, b'', b'')  # Simulate successful execution
+        return original_subprocess_run(*args, **kwargs)
+
+    class SafePopen(subprocess.Popen):
+        def __init__(self, *args, **kwargs):
+            print(f"Intercepted Popen command: {args}")
+            kwargs['preexec_fn'] = os.setsid  # Start the process in a new session
+            super().__init__(*args, **kwargs)
+            child_pids.append(self.pid)
+
+        def communicate(self, *args, **kwargs):
+            try:
+                return super().communicate(*args, **kwargs)
+            except subprocess.TimeoutExpired:
+                print("Timeout expired, intercepted and returning None")
+                return None, None
+
+        def kill(self):
+            print(f"Intercepted kill call for PID {self.pid}")
+            safe_kill(self.pid, signal.SIGTERM)
+
+        def terminate(self):
+            print(f"Intercepted terminate call for PID {self.pid}")
+            safe_kill(self.pid, signal.SIGTERM)
+
+    def safe_os_popen(command):
+        print(f"Intercepted os.popen command: {command}")
+        if 'kill' in command or 'killall' in command:
+            return os.popen('echo Intercepted')
+        return original_os_popen(command)
+
+    def safe_exec(*args, **kwargs):
+        print(f"Intercepted exec command: {args}")
+
+    # Override the risky functions with the safe versions
+    os.kill = safe_kill
+    os.killpg = safe_killpg
+    os.system = safe_system
+    subprocess.call = safe_subprocess_call
+    subprocess.check_output = safe_subprocess_check_output
+    subprocess.run = safe_subprocess_run
+    subprocess.Popen = SafePopen
+    os.popen = safe_os_popen
+    os.execv = safe_exec
+    os.execvp = safe_exec
+    os.execvpe = safe_exec
+
+    try:
+        yield
+    finally:
+        for pid in child_pids:
+            try:
+                os.kill(pid, signal.SIGTERM)
+                for _ in range(10):
+                    time.sleep(0.1)
+                    try:
+                        os.kill(pid, 0)
+                    except ProcessLookupError:
+                        break
+                else:
+                    os.kill(pid, signal.SIGKILL)
+            except ProcessLookupError:
+                pass
+            except Exception as e:
+                print(f"Error handling process {pid}: {e}")
+
+        os.kill = original_kill
+        os.killpg = original_killpg
+        os.system = original_system
+        subprocess.call = original_subprocess_call
+        subprocess.check_output = original_subprocess_check_output
+        subprocess.run = original_subprocess_run
+        subprocess.Popen = original_subprocess_popen
+        os.popen = original_os_popen
+        os.execv = original_os_execv
+        os.execvp = original_os_execvp
+        os.execvpe = original_os_execvpe
+
+
+class TimeoutException(Exception):
+    pass
+
+
+class WriteOnlyStringIO(io.StringIO):
+    """StringIO that throws an exception when it's read from."""
+
+    def read(self, *args, **kwargs):
+        raise IOError
+
+    def readline(self, *args, **kwargs):
+        raise IOError
+
+    def readlines(self, *args, **kwargs):
+        raise IOError
+
+    def readable(self, *args, **kwargs):
+        """Returns True if the IO object can be read."""
+        return False
+
+
+class redirect_stdin(contextlib._RedirectStream):  # type: ignore
+    _stream = "stdin"
+
+
+def reliability_guard(max_as_limit, max_data_limit, max_stack_limit):
+    """
+    This disables various destructive functions and prevents the generated code
+    from interfering with the test (e.g. fork bomb, killing other processes,
+    removing filesystem files, etc.)
+
+    WARNING
+    This function is NOT a security sandbox. Untrusted code, including
+    model-generated code, should not be blindly executed outside of one. See
+    the Codex paper for more information about OpenAI's code sandbox, and
+    proceed with caution.
+    """
+
+    import os
+    import time
+    from datetime import datetime
+
+    os.environ['TZ'] = 'UTC'
+    time.tzset()
+
+    os.environ["OMP_NUM_THREADS"] = "1"
+    os.environ['TF_CPP_MIN_LOG_LEVEL'] = "3"
+    os.environ['TF_ENABLE_ONEDNN_OPTS'] = "0"
+
+    if max_as_limit and max_data_limit and max_stack_limit:
+        import resource
+
+        # Limits are given in MiB; convert to bytes.
+        max_as_limit = max_as_limit * 1024 * 1024
+        max_data_limit = max_data_limit * 1024 * 1024
+        max_stack_limit = max_stack_limit * 1024 * 1024
+
+        resource.setrlimit(
+            resource.RLIMIT_AS, (max_as_limit, max_as_limit)
+        )
+        resource.setrlimit(
+            resource.RLIMIT_DATA, (max_data_limit, max_data_limit)
+        )
+        if not platform.uname().system == "Darwin":
+            resource.setrlimit(
+                resource.RLIMIT_STACK, (max_stack_limit, max_stack_limit)
+            )
+
+    faulthandler.disable()
+
+    import builtins
+
+    builtins.exit = None
+    builtins.quit = None
+
+    import matplotlib.pyplot as plt
+    plt.close('all')
+
+
+# unbiased estimator from https://github.com/openai/human-eval
+def estimate_pass_at_k(
+    num_samples: Union[int, List[int], np.ndarray],
+    num_correct: Union[List[int], np.ndarray],
+    k: int,
+) -> np.ndarray:
+    """
+    Estimates pass@k of each problem and returns them in an array.
+    """
+
+    def estimator(n: int, c: int, k: int) -> float:
+        """
+        Calculates 1 - comb(n - c, k) / comb(n, k).
+        """
+        if n - c < k:
+            return 1.0
+        return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))
+
+    if isinstance(num_samples, int):
+        num_samples_it = itertools.repeat(num_samples, len(num_correct))
+    else:
+        assert len(num_samples) == len(num_correct)
+        num_samples_it = iter(num_samples)
+
+    return np.array(
+        [estimator(int(n), int(c), k) for n, c in zip(num_samples_it, num_correct)]
+    )
+
+
+PASS = "pass"
+FAIL = "fail"
+TIMEOUT = "timeout"
+
+_SUCCESS = 0
+_FAILED = 1
+_TIMEOUT = 2
+_UNKNOWN = 3
+
+_mapping = {_SUCCESS: PASS, _FAILED: FAIL, _TIMEOUT: TIMEOUT, _UNKNOWN: None}
+
+
+def is_floats(x) -> bool:
+    # check if it is float; List[float]; Tuple[float]
+    if isinstance(x, float):
+        return True
+    if isinstance(x, (list, tuple)):
+        return all(isinstance(i, float) for i in x)
+    if isinstance(x, np.ndarray):
+        return x.dtype == np.float64 or x.dtype == np.float32
+    return False
+
+
+def unsafe_execute(
+    entry_point: str,
+    code: str,
+    test_code: str,
+    timeout: float,
+    max_as_limit: float,
+    max_data_limit: float,
+    max_stack_limit: float,
+    stat,  # Value
+    details,  # Array
+):
+    with safe_environment(), create_tempdir():
+        # These system calls are needed when cleaning up tempdir.
+        import os
+        import shutil
+        import builtins
+
+        rmtree = shutil.rmtree
+        rmdir = os.rmdir
+        chdir = os.chdir
+        # Disable functionalities that can make destructive changes to the test.
+        reliability_guard(max_as_limit, max_data_limit, max_stack_limit)
+        module_name = "__test__"
+        new_module = types.ModuleType(module_name)
+        # Set necessary attributes for the module
+        new_module.__dict__.update({
+            '__builtins__': builtins,
+            '__file__': f"{module_name}.py",
+            '__package__': None,
+            '__doc__': None,
+            'sys': sys,
+            'os': os,
+            'environ': os.environ,
+        })
+
+        try:
+            full_code = code + "\n" + test_code
+
+            with swallow_io():
+                exec(compile(full_code, f"{module_name}.py", 'exec'), new_module.__dict__)
+                sys.modules[module_name] = new_module
+                TestCases = getattr(new_module, 'TestCases')
+                loader = unittest.TestLoader()
+                suite = loader.loadTestsFromTestCase(TestCases)
+                test_result = unittest.TestResult()
+                start_time = time.time()
+                with time_limit(timeout):
+                    suite.run(test_result)
+
+            issues = test_result.failures + test_result.errors
+            for test, trace in issues:
+                details[test.id().split(".")[-1]] = trace
+            stat.value = _SUCCESS
+        except BaseException as e:
+            details["ALL"] = str(e)
+            stat.value = _FAILED
+        # Needed for cleaning up.
+        shutil.rmtree = rmtree
+        os.rmdir = rmdir
+        os.chdir = chdir
+
+
+def untrusted_check(
+    code: str,
+    test_code: str,
+    entry_point: str,
+    max_as_limit: float,
+    max_data_limit: float,
+    max_stack_limit: float,
+    min_time_limit: float = 10,
+    gt_time_limit: float = 60
+) -> Tuple[str, np.ndarray]:
+    min_time_limit = max(min_time_limit, gt_time_limit)
+    # os.getenv returns a string when the variable is set, so cast before comparing.
+    timeout = max(float(os.getenv("BIGCODEBENCH_TIMEOUT_PER_TASK", TIMEOUT_LIMIT)), min_time_limit) + 1
+    # shared memory objects
+    stat = Value("i", _UNKNOWN)
+    manager = Manager()
+    details = manager.dict()
+
+    p = multiprocessing.Process(
+        target=unsafe_execute,
+        args=(
+            entry_point,
+            code,
+            test_code,
+            timeout,
+            max_as_limit,
+            max_data_limit,
+            max_stack_limit,
+            stat,
+            details,
+        ),
+    )
+    p.start()
+    p.join(timeout=timeout + 1)
+    if p.is_alive():
+        p.terminate()
+        time.sleep(0.1)
+    if p.is_alive():
+        p.kill()
+        time.sleep(0.1)
+
+    stat = _mapping[stat.value]
+    # convert details to a dict
+    details = dict(details)
+
+    if not stat:
+        stat = TIMEOUT
+    if stat == PASS:
+        if details:
+            stat = FAIL
+
+    return stat, details
+
+
+def evaluate_files(
+    files: List[str],
+    inputs: List,
+    entry_point: str,
+    min_time_limit: float = 0.1,
+    gt_time_limit_factor: float = 2.0,
+) -> List[Tuple[str, List[bool]]]:
+    # NOTE: this helper is not called by api/app.py.
+    ret = []
+    # sort files by the id in name (i.e., "../n.py")
+    files = sorted(files, key=lambda x: int(x.split("/")[-1].split(".")[0]))
+    for file in files:
+        code = open(file, "r").read()
+        # Pass the resource limits explicitly; these mirror the defaults in api/app.py.
+        stat, det = untrusted_check(
+            code,
+            inputs,
+            entry_point,
+            max_as_limit=30 * 1024,
+            max_data_limit=30 * 1024,
+            max_stack_limit=10,
+            min_time_limit=min_time_limit,
+        )
+        ret.append((stat, list(det.values())))  # det is a plain dict here
+    return ret
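To see the executor in isolation, a sketch of a direct `untrusted_check` call with a trivial solution/test pair. This assumes a POSIX host (the module relies on `signal.setitimer` and `resource`) with numpy and matplotlib importable, since `reliability_guard` touches both; the solution and test strings are made up:

    # Trivial solution/test pair; limits mirror the API defaults in api/app.py.
    from api.code_execution import untrusted_check

    code = "def add(a, b):\n    return a + b\n"
    test_code = (
        "import unittest\n"
        "class TestCases(unittest.TestCase):\n"
        "    def test_add(self):\n"
        "        self.assertEqual(add(1, 2), 3)\n"
    )
    stat, details = untrusted_check(
        code,
        test_code,
        entry_point="add",
        max_as_limit=30 * 1024,    # MiB; converted to bytes inside reliability_guard
        max_data_limit=30 * 1024,
        max_stack_limit=10,
        min_time_limit=1,
        gt_time_limit=20,
    )
    print(stat, details)  # expect "pass" and {} when the test passes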
dev.sh ADDED
@@ -0,0 +1,11 @@
+#!/bin/bash
+exec \
+    gunicorn \
+    -k uvicorn.workers.UvicornWorker \
+    --workers 1 \
+    --timeout 0 \
+    --bind 0.0.0.0:7860 \
+    --enable-stdio-inheritance \
+    --access-logfile - \
+    --reload \
+    'api.app:create_app()'
prod.sh ADDED
@@ -0,0 +1,10 @@
+#!/bin/bash
+exec \
+    gunicorn \
+    -k uvicorn.workers.UvicornWorker \
+    --workers 2 \
+    --timeout 0 \
+    --bind 0.0.0.0:7860 \
+    --enable-stdio-inheritance \
+    --access-logfile - \
+    'api.app:create_app()'
requirements.txt DELETED
@@ -1,74 +0,0 @@
1
- beautifulsoup4==4.8.2
2
- blake3==0.4.1
3
- chardet==5.2.0
4
- cryptography==38.0.0
5
- datetime==5.5
6
- Django==4.2.7
7
- dnspython==2.6.1
8
- docxtpl==0.11.5
9
- Faker==20.1.0
10
- flask_login==0.6.3
11
- flask_restful==0.3.10
12
- flask_wtf==1.2.1
13
- Flask-Mail==0.9.1
14
- flask==3.0.3
15
- folium==0.16.0
16
- gensim==4.3.2
17
- geopandas==0.13.2
18
- geopy==2.4.1
19
- holidays==0.29
20
- keras==2.11.0
21
- Levenshtein==0.25.0
22
- librosa==0.10.1
23
- lxml==4.9.3
24
- matplotlib==3.7.0
25
- mechanize==0.4.9
26
- natsort==7.1.1
27
- networkx==2.6.3
28
- nltk==3.8
29
- numba==0.55.0
30
- numpy==1.21.2
31
- opencv-python-headless==4.9.0.80
32
- openpyxl==3.1.2
33
- pandas==2.0.3
34
- Pillow==10.3.0
35
- prettytable==3.10.0
36
- psutil==5.9.5
37
- pycryptodome==3.14.1
38
- pyfakefs==5.4.1
39
- pyquery==1.4.3
40
- pytesseract==0.3.10
41
- pytest==8.2.0
42
- python_http_client==3.3.7
43
- python-dateutil==2.9.0
44
- python-docx==1.1.0
45
- python-Levenshtein-wheels
46
- pytz==2023.3.post1
47
- PyYAML==6.0.1
48
- requests_mock==1.11.0
49
- requests==2.31.0
50
- Requests==2.31.0
51
- rsa==4.9
52
- scikit-image==0.18.0
53
- scikit-learn==1.3.1
54
- scipy==1.7.2
55
- seaborn==0.13.2
56
- selenium==4.15
57
- sendgrid==6.11.0
58
- shapely==2.0.4
59
- soundfile==0.12.1
60
- statsmodels==0.14.0
61
- statsmodels==0.14.0
62
- sympy==1.12
63
- tensorflow==2.11.1
64
- textblob==0.18.0
65
- texttable==1.7.0
66
- Werkzeug==3.0.1
67
- wikipedia==1.4.0
68
- wordcloud==1.9.3
69
- wordninja==2.0.0
70
- WTForms==3.1.2
71
- xlrd==2.0.1
72
- xlrd==2.0.1
73
- xlwt==1.3.0
74
- xmltodict==0.13.0