jjyang77 committed
Commit bb636ca
1 Parent(s): 25db7e9

add local_evaluator and some cleanup

Files changed (4):
  1. Dockerfile +2 -0
  2. README.md +8 -2
  3. local_evaluator.py +94 -0
  4. prod.sh +1 -1
Dockerfile CHANGED
@@ -20,6 +20,8 @@ RUN adduser --disabled-password --gecos "" bigcodebenchuser

  RUN pip install -I --timeout 2000 -r https://github.com/bigcode-project/bigcodebench-annotation/releases/download/v0.1.0/requirements.txt

+ RUN python -m nltk.downloader punkt
+
  COPY . .

  WORKDIR /
README.md CHANGED
@@ -1,5 +1,5 @@
  ---
- title: OE Eval Bcb Evaluator Testing
+ title: OE Eval Bcb Evaluator lite
  emoji: 🐢
  colorFrom: green
  colorTo: pink
@@ -7,4 +7,10 @@ sdk: docker
  pinned: false
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # For local testing
+ Build the docker image for the BCB eval env.
+ There is a `scikit-image` wheel that takes a long time to build ...
+
+ Run the container while mounting a data volume with your generated code solutions, and mapping a port to 7860.
+
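A minimal sketch of the build-and-run steps the new README describes; the image tag and host data directory are illustrative assumptions, not values taken from this commit:

```sh
# Build the BCB eval image (the scikit-image wheel can take a while to compile).
# "bcb-evaluator" is an assumed image tag, not one used in this repo.
docker build -t bcb-evaluator .

# Run the container: mount a host directory of generated code solutions (path assumed)
# and map a host port to the app's port 7860 (prod.sh binds gunicorn to 0.0.0.0:7860).
docker run --rm -v "$PWD/data:/data" -p 7860:7860 bcb-evaluator
```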
local_evaluator.py ADDED
@@ -0,0 +1,94 @@
+ import json
+ import asyncio
+ import argparse
+ import httpx
+ from typing import List, Optional, Tuple
+
+ _parser = argparse.ArgumentParser()
+
+ _parser.add_argument("--filename", type=str, help="filename like data/codgen-...jsonl")
+ _parser.add_argument("--remoteapi", type=str, help="remote execution API if not running local eval")
+
+
+ def load_jsonl(filename):
+     with open(filename, "r") as file:
+         return [json.loads(line.strip()) for line in file]
+
+ def save_jsonl(filename, data):
+     with open(filename, "w") as file:
+         for d in data:
+             file.write(json.dumps(d))
+             file.write("\n")
+     return filename
+
+ async def call_oe_eval_bcb_client(
+     samples_data: List[dict],
+     calibrate: bool = True,
+     parallel: int = -1,
+     min_time_limit: float = 1,
+     max_as_limit: int = 30 * 1024,
+     max_data_limit: int = 30 * 1024,
+     max_stack_limit: int = 10,
+     no_gt: bool = True,
+     execute_api: Optional[str] = None,
+ ) -> Tuple[Optional[List[dict]], Optional[float]]:
+     """
+     OE-Eval BigCodeBench remote code execution API
+     """
+     if execute_api is None:
+         execute_api = "http://localhost:9000/evaluate/"
+
+     async with httpx.AsyncClient() as client:
+         params = {
+             "calibrate": calibrate,
+             "parallel": parallel,
+             "min_time_limit": min_time_limit,
+             "max_as_limit": max_as_limit,
+             "max_data_limit": max_data_limit,
+             "max_stack_limit": max_stack_limit,
+             "no_gt": no_gt,
+         }
+         # Even for the full BCB dataset, total execution time should not exceed 5-10 min unless many
+         # instances of generated code are particularly malformed or slow (per-instance exec timeout is 30 sec).
+         total_timeout = 900
+         response = await client.post(
+             execute_api, json=samples_data, params=params, timeout=total_timeout
+         )
+         results = response.json()
+
+     print("Results received from remote API. Processing ...")
+     check_results = []
+     for doc in results["eval"].values():
+         for rep in doc:
+             rep["tested_completion"] = rep.pop("solution")
+             rep["passed"] = rep.pop("status") == "pass"
+             rep["exec_result"] = rep.pop("details")
+             check_results.append(rep)
+     if check_results:
+         pass_at_1 = sum(rep["passed"] for rep in check_results) / len(check_results)
+         return check_results, pass_at_1
+     else:
+         return None, None
+
+ def evaluate(sample_file, execute_api: Optional[str] = None):
+     batched_code_test = load_jsonl(sample_file)
+     results, pass_at_1 = asyncio.run(
+         call_oe_eval_bcb_client(
+             samples_data=batched_code_test,
+             calibrate=True,
+             parallel=-1,
+             min_time_limit=30,
+             execute_api=execute_api,
+         )
+     )
+     print("pass@1:", pass_at_1)
+     return results
+
+ def main():
+     args = _parser.parse_args()
+     args_dict = vars(args)
+     results = evaluate(args_dict["filename"], args_dict["remoteapi"])
+     save_jsonl("data/eval_results.jsonl", results)
+
+ if __name__ == "__main__":
+     main()
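A quick sketch of how the new script might be invoked against a running evaluator container. The JSONL path is a placeholder, and the URL assumes the Space exposes the same /evaluate/ route (the script's default) on the mapped host port 7860:

```sh
# Evaluate generated solutions through the remote execution API.
# The JSONL path below is a placeholder; --remoteapi falls back to
# http://localhost:9000/evaluate/ when omitted.
python local_evaluator.py \
    --filename data/codegen-example.jsonl \
    --remoteapi http://localhost:7860/evaluate/
```

Per the code above, pass@1 is printed to stdout and per-sample results are written to data/eval_results.jsonl.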
prod.sh CHANGED
@@ -2,7 +2,7 @@
  exec \
  gunicorn \
  -k uvicorn.workers.UvicornWorker \
- --workers 2 \
+ --workers 8 \
  --timeout 0 \
  --bind 0.0.0.0:7860 \
  --enable-stdio-inheritance \