Muennighoff committed on
Commit
0e7922f
1 Parent(s): ea58aa2
Files changed (2) hide show
  1. code_eval.py +2 -2
  2. execute.py +43 -2
code_eval.py CHANGED
@@ -152,7 +152,7 @@ class CodeEval(evaluate.Metric):
152
  license=_LICENSE,
153
  )
154
 
155
- def _compute(self, predictions, references, k=[1, 10, 100], num_workers=4, timeout=3.0):
156
  """Returns the scores"""
157
 
158
  if os.getenv("HF_ALLOW_CODE_EVAL", 0) != "1":
@@ -170,7 +170,7 @@ class CodeEval(evaluate.Metric):
170
  for task_id, (candidates, test_case) in enumerate(zip(predictions, references)):
171
  for candidate in candidates:
172
  test_program = candidate + "\n" + test_case
173
- args = (test_program, timeout, task_id, completion_id[task_id])
174
  future = executor.submit(check_correctness, *args)
175
  futures.append(future)
176
  completion_id[task_id] += 1
 
152
  license=_LICENSE,
153
  )
154
 
155
+ def _compute(self, predictions, references, k=[1, 10, 100], num_workers=4, timeout=3.0, language="python"):
156
  """Returns the scores"""
157
 
158
  if os.getenv("HF_ALLOW_CODE_EVAL", 0) != "1":
 
170
  for task_id, (candidates, test_case) in enumerate(zip(predictions, references)):
171
  for candidate in candidates:
172
  test_program = candidate + "\n" + test_case
173
+ args = (test_program, timeout, task_id, completion_id[task_id], language)
174
  future = executor.submit(check_correctness, *args)
175
  futures.append(future)
176
  completion_id[task_id] += 1
execute.py CHANGED
@@ -24,8 +24,12 @@ import platform
24
  import signal
25
  import tempfile
26
 
 
 
 
 
27
 
28
- def check_correctness(check_program, timeout, task_id, completion_id):
29
  """
30
  Evaluates the functional correctness of a completion by running the test
31
  suite provided in the problem.
@@ -36,7 +40,8 @@ def check_correctness(check_program, timeout, task_id, completion_id):
36
  manager = multiprocessing.Manager()
37
  result = manager.list()
38
 
39
- p = multiprocessing.Process(target=unsafe_execute, args=(check_program, result, timeout))
 
40
  p.start()
41
  p.join(timeout=timeout + 1)
42
  if p.is_alive():
@@ -85,6 +90,42 @@ def unsafe_execute(check_program, result, timeout):
85
  os.rmdir = rmdir
86
  os.chdir = chdir
87
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
88
 
89
  @contextlib.contextmanager
90
  def time_limit(seconds):
 
24
  import signal
25
  import tempfile
26
 
27
# The executor functions are defined further down in this module, so they
# cannot be referenced by name while this table is being built (that would
# raise NameError at import time). These thin module-level wrappers resolve
# the real executor only when called. They are plain top-level functions
# rather than lambdas, which cannot be pickled if they are used as a
# multiprocessing.Process target under the "spawn" start method.
def _run_python(check_program, result, timeout):
    """Forward to `unsafe_execute`, looked up at call time."""
    return unsafe_execute(check_program, result, timeout)


def _run_javascript(check_program, result, timeout):
    """Forward to `unsafe_execute_js`, looked up at call time."""
    return unsafe_execute_js(check_program, result, timeout)


# Maps a language name to the callable that executes a test program.
# Every callable takes (check_program, result, timeout).
LANGUAGE_TO_FUNC = {
    "python": _run_python,
    "javascript": _run_javascript,
}
31
 
32
+ def check_correctness(check_program, timeout, task_id, completion_id, language):
33
  """
34
  Evaluates the functional correctness of a completion by running the test
35
  suite provided in the problem.
 
40
  manager = multiprocessing.Manager()
41
  result = manager.list()
42
 
43
+ p = multiprocessing.Process(target=LANGUAGE_TO_FUNC[language], args=(check_program, result, timeout))
44
+
45
  p.start()
46
  p.join(timeout=timeout + 1)
47
  if p.is_alive():
 
90
  os.rmdir = rmdir
91
  os.chdir = chdir
92
 
93
def unsafe_execute_js(check_program, result, timeout):
    """
    Runs a JavaScript test program with Node.js and records the outcome.

    The program is written to ``test.js`` in the directory set up by
    ``create_tempdir()`` and executed with ``node test.js``. Exactly one
    status string is appended to ``result``:

    * ``"passed"`` if node wrote nothing to stderr or stdout,
    * ``"failed: <output>"`` if it wrote to stderr (checked first) or stdout,
    * ``"timed out"`` if the time limit was exceeded.

    Args:
        check_program: JavaScript source to execute.
        result: list-like object that receives the status string.
        timeout: time limit in seconds, applied both through ``time_limit``
            and through ``subprocess.run``.
    """
    # Imported locally so the function does not rely on a module-level
    # `subprocess` import, which is not visible among the file's imports.
    import os
    import shutil
    import subprocess

    with create_tempdir():

        # Use a context manager so the file is flushed and closed before
        # node reads it.
        with open("test.js", "w") as js_file:
            js_file.write(check_program)

        # These system calls are needed when cleaning up tempdir.
        rmtree = shutil.rmtree
        rmdir = os.rmdir
        chdir = os.chdir

        # Run program.
        try:
            with time_limit(timeout):
                exec_result = subprocess.run(["node", "test.js"], timeout=timeout, capture_output=True)
                # Any output counts as a failure; stderr takes precedence.
                output = exec_result.stderr.decode() or exec_result.stdout.decode()
                if output:
                    result.append(f"failed: {output}")
                else:
                    result.append("passed")

        # subprocess.run signals its own timeout with TimeoutExpired, which
        # can fire before the time_limit alarm because both use the same value.
        except (TimeoutException, subprocess.TimeoutExpired):
            result.append("timed out")

        # Needed for cleaning up.
        shutil.rmtree = rmtree
        os.rmdir = rmdir
        os.chdir = chdir
128
+
129
 
130
  @contextlib.contextmanager
131
  def time_limit(seconds):