Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
jjyang77
committed on
Commit
•
bb636ca
1
Parent(s):
25db7e9
add local_evaluator and some cleanup
Browse files- Dockerfile +2 -0
- README.md +8 -2
- local_evaluator.py +94 -0
- prod.sh +1 -1
Dockerfile
CHANGED
@@ -20,6 +20,8 @@ RUN adduser --disabled-password --gecos "" bigcodebenchuser
|
|
20 |
|
21 |
RUN pip install -I --timeout 2000 -r https://github.com/bigcode-project/bigcodebench-annotation/releases/download/v0.1.0/requirements.txt
|
22 |
|
|
|
|
|
23 |
COPY . .
|
24 |
|
25 |
WORKDIR /
|
|
|
20 |
|
21 |
RUN pip install -I --timeout 2000 -r https://github.com/bigcode-project/bigcodebench-annotation/releases/download/v0.1.0/requirements.txt
|
22 |
|
23 |
+
RUN python -m nltk.downloader punkt
|
24 |
+
|
25 |
COPY . .
|
26 |
|
27 |
WORKDIR /
|
README.md
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
---
|
2 |
-
title: OE Eval Bcb Evaluator
|
3 |
emoji: 🐢
|
4 |
colorFrom: green
|
5 |
colorTo: pink
|
@@ -7,4 +7,10 @@ sdk: docker
|
|
7 |
pinned: false
|
8 |
---
|
9 |
|
10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
---
|
2 |
+
title: OE Eval Bcb Evaluator lite
|
3 |
emoji: 🐢
|
4 |
colorFrom: green
|
5 |
colorTo: pink
|
|
|
7 |
pinned: false
|
8 |
---
|
9 |
|
10 |
+
# For local testing
|
11 |
+
Build the docker image for the BCB eval env.
|
12 |
+
There is a `scikit-image` wheel that takes a long time to build ...
|
13 |
+
|
14 |
+
Run the container while mounting a data volume with your generated code solutions, and mapping a port to 7860.
|
15 |
+
|
16 |
+
|
local_evaluator.py
ADDED
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import argparse
import asyncio
import json
from typing import List, Optional, Tuple

import httpx
|
6 |
+
|
7 |
+
# Module-level CLI definition; parsed in main() when run as a script.
_parser = argparse.ArgumentParser()

# Path to the generated-code samples to evaluate (JSON-Lines format).
_parser.add_argument("--filename", type=str, help="filename like data/codgen-...jsonl")
# Optional endpoint override; falls back to the local container URL when omitted.
_parser.add_argument("--remoteapi", type=str, help="remote execution API if not running local eval")
|
11 |
+
|
12 |
+
|
13 |
+
def load_jsonl(filename):
    """Read a JSON-Lines file and return its records as a list of objects."""
    records = []
    with open(filename, "r") as fh:
        for raw_line in fh:
            records.append(json.loads(raw_line.strip()))
    return records
|
16 |
+
|
17 |
+
def save_jsonl(filename, data):
    """Serialize *data* to *filename* in JSON-Lines format; return the path."""
    serialized = [json.dumps(record) for record in data]
    with open(filename, "w") as fh:
        # One record per line, newline-terminated -- same bytes as writing
        # each record and "\n" separately.
        fh.write("".join(line + "\n" for line in serialized))
    return filename
|
23 |
+
|
24 |
+
async def call_oe_eval_bcb_client(
    samples_data: List[dict],
    calibrate: bool = True,
    parallel: int = -1,
    min_time_limit: float = 1,
    max_as_limit: int = 30 * 1024,
    max_data_limit: int = 30 * 1024,
    max_stack_limit: int = 10,
    no_gt: bool = True,
    execute_api: Optional[str] = None,
) -> Tuple[Optional[List[dict]], Optional[float]]:
    """
    OE-Eval BigCodeBench remote code execution API.

    Posts *samples_data* to the evaluation endpoint and flattens the
    per-document replicate results into a single list.

    Args:
        samples_data: generated code samples to execute remotely.
        calibrate: forwarded to the eval API as a query parameter.
        parallel: execution parallelism requested from the API (-1 = API default).
        min_time_limit: minimum per-instance time limit (seconds), forwarded.
        max_as_limit: address-space limit forwarded to the API.
        max_data_limit: data-segment limit forwarded to the API.
        max_stack_limit: stack limit forwarded to the API.
        no_gt: skip ground-truth evaluation on the API side, forwarded.
        execute_api: endpoint URL; defaults to the local container endpoint.

    Returns:
        (check_results, pass_at_1): a flat list of per-replicate result dicts
        and the fraction that passed, or (None, None) when the API returned
        no eval records.

    Raises:
        httpx.HTTPStatusError: if the API responds with a 4xx/5xx status.
    """
    if execute_api is None:
        execute_api = "http://localhost:9000/evaluate/"

    async with httpx.AsyncClient() as client:
        params = {
            "calibrate": calibrate,
            "parallel": parallel,
            "min_time_limit": min_time_limit,
            "max_as_limit": max_as_limit,
            "max_data_limit": max_data_limit,
            "max_stack_limit": max_stack_limit,
            "no_gt": no_gt,
        }
        # Even for the Full BCB dataset, total execution time should not exceed
        # 5-10 min unless many instances of generated codes are particularly
        # mal-formed or slow. (per instance exec timeout is 30 sec)
        total_timeout = 900
        response = await client.post(
            execute_api, json=samples_data, params=params, timeout=total_timeout
        )
        # Fail loudly on HTTP errors instead of choking on a non-JSON body below.
        response.raise_for_status()
        results = response.json()

    print("Results received from remote API. Processing ...")
    check_results = []
    for doc in results["eval"].values():
        for rep in doc:
            # Rename API fields to the schema expected downstream.
            rep["tested_completion"] = rep.pop("solution")
            rep["passed"] = rep.pop("status") == "pass"
            rep["exec_result"] = rep.pop("details")
            check_results.append(rep)
    if not check_results:
        return None, None
    pass_at_1 = sum(rep["passed"] for rep in check_results) / len(check_results)
    return check_results, pass_at_1
|
72 |
+
|
73 |
+
def evaluate(sample_file, execute_api: Optional[str] = None):
    """Evaluate a JSONL file of generated code via the remote BCB API.

    Loads the samples, runs the (async) remote evaluation to completion,
    prints the pass@1 score, and returns the per-replicate result records.
    """
    samples = load_jsonl(sample_file)
    eval_coro = call_oe_eval_bcb_client(
        samples_data=samples,
        calibrate=True,
        parallel=-1,
        min_time_limit=30,
        execute_api=execute_api,
    )
    results, pass_at_1 = asyncio.run(eval_coro)
    print("pass@1:", pass_at_1)
    return results
|
86 |
+
|
87 |
+
def main():
    """CLI entry point: evaluate the given sample file and save the results."""
    args = _parser.parse_args()
    results = evaluate(args.filename, args.remoteapi)
    save_jsonl("data/eval_results.jsonl", results)

if __name__ == "__main__":
    main()
|
prod.sh
CHANGED
@@ -2,7 +2,7 @@
|
|
2 |
exec \
|
3 |
gunicorn \
|
4 |
-k uvicorn.workers.UvicornWorker \
|
5 |
-
--workers
|
6 |
--timeout 0 \
|
7 |
--bind 0.0.0.0:7860 \
|
8 |
--enable-stdio-inheritance \
|
|
|
2 |
exec \
|
3 |
gunicorn \
|
4 |
-k uvicorn.workers.UvicornWorker \
|
5 |
+
--workers 8 \
|
6 |
--timeout 0 \
|
7 |
--bind 0.0.0.0:7860 \
|
8 |
--enable-stdio-inheritance \
|