Terry Zhuo committed
Commit
7eeb535
1 Parent(s): 3204d18
Files changed (1)
  1. app.py +89 -92
app.py CHANGED
@@ -134,117 +134,114 @@ def evaluate(
     gt_pass_rate = np.mean([1 if v is not None else 0 for k, v in expected_time.items() if k in problems])
     failed_tasks = [k for k, v in expected_time.items() if v is None and k in problems]

-    if check_gt_only:
-        if gt_pass_rate > 0.99:
-            cprint(f"Groundtruth pass rate: {gt_pass_rate:.3f}", "green")
-        else:
-            cprint(f"Groundtruth pass rate: {gt_pass_rate:.3f}\nPlease be cautious!", "red")
-        if len(failed_tasks) > 0:
-            cprint(f"Failed tasks: {failed_tasks}", "red")
-        return {"gt_pass_rate":float(gt_pass_rate), "failed_tasks": failed_tasks}
+    pass_at_k = dict()

-    results = {
-        "date": datetime.now().strftime("%Y-%m-%d %H:%M"),
-        "eval": {},
-    }
+    if not check_gt_only:
+
+        results = {
+            "date": datetime.now().strftime("%Y-%m-%d %H:%M"),
+            "eval": {},
+        }

-    with ProcessPoolExecutor(max_workers=n_workers) as executor:
-        futures = []
-        completion_id = Counter()
-        n_samples = 0
-        eval_results = defaultdict(list) # task_id ->
-        remainings = set()
+        with ProcessPoolExecutor(max_workers=n_workers) as executor:
+            futures = []
+            completion_id = Counter()
+            n_samples = 0
+            eval_results = defaultdict(list) # task_id ->
+            remainings = set()

-        print("Reading samples...")
-        for sample in tqdm(load_solutions(samples)):
-            task_id = sample["task_id"]
-
-            if task_id not in problems:
-                warn(
-                    f"Task {task_id} is found in the samples but not found in the dataset"
-                )
-                continue
-            solution = (
-                sample["solution"]
-                if "solution" in sample
-                else problems[task_id]["complete_prompt"] + sample["completion"]
-            )
-            if "sanitized-calibrated" in samples:
-                solution = problems[task_id]["code_prompt"] + "\n    pass\n" + solution
-            remainings.add(sample["_identifier"])
-            args = (
-                completion_id[task_id],
-                problems[task_id],
-                solution,
-                max_as_limit,
-                max_data_limit,
-                max_stack_limit,
-                sample["_identifier"],
-                min_time_limit,
-                expected_time[task_id] if expected_time[task_id] else 20
-            )
-            futures.append(executor.submit(check_correctness, *args))
-            completion_id[task_id] += 1
-            n_samples += 1
+            print("Reading samples...")
+            for sample in tqdm(load_solutions(samples)):
+                task_id = sample["task_id"]
+
+                if task_id not in problems:
+                    warn(
+                        f"Task {task_id} is found in the samples but not found in the dataset"
+                    )
+                    continue
+                solution = (
+                    sample["solution"]
+                    if "solution" in sample
+                    else problems[task_id]["complete_prompt"] + sample["completion"]
+                )
+                if "sanitized-calibrated" in samples:
+                    solution = problems[task_id]["code_prompt"] + "\n    pass\n" + solution
+                remainings.add(sample["_identifier"])
+                args = (
+                    completion_id[task_id],
+                    problems[task_id],
+                    solution,
+                    max_as_limit,
+                    max_data_limit,
+                    max_stack_limit,
+                    sample["_identifier"],
+                    min_time_limit,
+                    expected_time[task_id] if expected_time[task_id] else 20
+                )
+                futures.append(executor.submit(check_correctness, *args))
+                completion_id[task_id] += 1
+                n_samples += 1

-        assert n_samples == len(remainings), "Missing problems in unfinished"
-        assert len(completion_id) == len(problems), "Missing problems in samples"
+            assert n_samples == len(remainings), "Missing problems in unfinished"
+            assert len(completion_id) == len(problems), "Missing problems in samples"

-        def stucking_checker():
-            not_done = futures
-            while len(not_done) > 0:
-                done, not_done = wait(not_done, timeout=240, return_when=FIRST_COMPLETED)
+            # def stucking_checker():
+            #     not_done = futures
+            #     while len(not_done) > 0:
+            #         done, not_done = wait(not_done, timeout=240, return_when=FIRST_COMPLETED)

-                if len(done) == 0:
-                    warn("No samples have finished testing in the last 240s")
-                    warn(f"{len(remainings)} samples to be tested: {remainings}")
+            #         if len(done) == 0:
+            #             warn("No samples have finished testing in the last 240s")
+            #             warn(f"{len(remainings)} samples to be tested: {remainings}")

-        threading.Thread(target=stucking_checker).start()
+            # threading.Thread(target=stucking_checker).start()

-        for future in tqdm(as_completed(futures), total=n_samples):
-            result = future.result()
-            remainings.remove(result["_identifier"])
-            eval_results[result["task_id"]].append(result)
+            for future in tqdm(as_completed(futures), total=n_samples):
+                result = future.result()
+                remainings.remove(result["_identifier"])
+                eval_results[result["task_id"]].append(result)


-    # sort the results for each problem by completion_id
-    for task_id, task_results in eval_results.items():
-        task_results.sort(key=lambda x: x["completion_id"])
-        results["eval"][task_id] = []
-        for res in task_results:
-            stat, details = res["base"]
-            results["eval"][task_id].append(
-                {
-                    "task_id": task_id,
-                    "solution": res["solution"],
-                    "status": stat,
-                    "details": details,
-                }
-            )
+        # sort the results for each problem by completion_id
+        for task_id, task_results in eval_results.items():
+            task_results.sort(key=lambda x: x["completion_id"])
+            results["eval"][task_id] = []
+            for res in task_results:
+                stat, details = res["base"]
+                results["eval"][task_id].append(
+                    {
+                        "task_id": task_id,
+                        "solution": res["solution"],
+                        "status": stat,
+                        "details": details,
+                    }
+                )

-    # Calculate pass@k.
-    total = np.array([len(r) for k, r in results["eval"].items() if k in problems])
-    base_correct = []
+        # Calculate pass@k.
+        total = np.array([len(r) for k, r in results["eval"].items() if k in problems])
+        base_correct = []

-    for key, res in results["eval"].items():
-        if key not in problems:
-            continue
-        bc = sum([r["status"] == PASS for r in res])
-        base_correct.append(bc)
+        for key, res in results["eval"].items():
+            if key not in problems:
+                continue
+            bc = sum([r["status"] == PASS for r in res])
+            base_correct.append(bc)

-    base_correct = np.array(base_correct)
+        base_correct = np.array(base_correct)

-    pass_at_k = {
-        f"pass@{k}": estimate_pass_at_k(total, base_correct, k).mean()
-        for k in pass_k
-        if total.min() >= k
-    }
+        pass_at_k.update({
+            f"pass@{k}": estimate_pass_at_k(total, base_correct, k).mean()
+            for k in pass_k
+            if total.min() >= k
+        })
+
     pass_at_k["model"] = os.path.basename(samples).split("--bigcodebench-")[0]
     pass_at_k["split"] = split
     pass_at_k["subset"] = subset
     pass_at_k["calibrated"] = "sanitized-calibrated" in samples
     pass_at_k["gt_pass_rate"] = gt_pass_rate
     pass_at_k["failed_tasks"] = failed_tasks
+
     return results, pass_at_k


@@ -252,8 +249,8 @@ def run_gradio():
     interface = gr.Interface(
         fn=evaluate,
         inputs=[
-            gr.Dropdown(["complete", "instruct"], label="Split"),
-            gr.Dropdown(["full", "hard"], label="Subset"),
+            gr.Dropdown(["complete", "instruct"], label="BigCodeBench Split"),
+            gr.Dropdown(["full", "hard"], label="BigCodeBench Subset"),
             gr.File(label="Samples Path (.jsonl)"),
             gr.Textbox(label="Pass k Values (comma-separated)", value="1,5,10"),
             gr.Slider(1, multiprocessing.cpu_count(), step=1, label="Parallel Workers"),
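
For readers tracing the behavioural change in the first hunk, the sketch below is a minimal model of the new control flow in evaluate(): pass_at_k now starts as an empty dict, pass@k numbers are merged in only when a full evaluation actually runs (check_gt_only is false), and both paths share the same metadata fields instead of taking the old early return. Apart from the names taken from the diff (pass_at_k, check_gt_only, gt_pass_rate, failed_tasks), everything here is a hypothetical stand-in, not code from app.py.

def evaluate_flow_sketch(check_gt_only, gt_pass_rate, failed_tasks, run_full_evaluation):
    # Toy model of the refactored flow; run_full_evaluation is a hypothetical
    # callable standing in for the ProcessPoolExecutor block and the pass@k math.
    pass_at_k = dict()  # always defined now, instead of an early return

    if not check_gt_only:
        # In app.py this is the executor loop followed by pass_at_k.update({...}).
        pass_at_k.update(run_full_evaluation())

    # Shared metadata, mirroring the unchanged context lines after the hunk.
    pass_at_k["gt_pass_rate"] = gt_pass_rate
    pass_at_k["failed_tasks"] = failed_tasks
    return pass_at_k

# Both paths now yield the same dict shape; the gt-only call simply lacks pass@k entries.
print(evaluate_flow_sketch(True, 0.98, ["task_3"], lambda: {"pass@1": 0.5}))
print(evaluate_flow_sketch(False, 0.98, ["task_3"], lambda: {"pass@1": 0.5}))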