Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
jjyang77
committed on
Commit
•
bb636ca
1
Parent(s):
25db7e9
add local_evaluator and some cleanup
Browse files- Dockerfile +2 -0
- README.md +8 -2
- local_evaluator.py +94 -0
- prod.sh +1 -1
Dockerfile
CHANGED
@@ -20,6 +20,8 @@ RUN adduser --disabled-password --gecos "" bigcodebenchuser
|
|
20 |
|
21 |
RUN pip install -I --timeout 2000 -r https://github.com/bigcode-project/bigcodebench-annotation/releases/download/v0.1.0/requirements.txt
|
22 |
|
|
|
|
|
23 |
COPY . .
|
24 |
|
25 |
WORKDIR /
|
|
|
20 |
|
21 |
RUN pip install -I --timeout 2000 -r https://github.com/bigcode-project/bigcodebench-annotation/releases/download/v0.1.0/requirements.txt
|
22 |
|
23 |
+
RUN python -m nltk.downloader punkt
|
24 |
+
|
25 |
COPY . .
|
26 |
|
27 |
WORKDIR /
|
README.md
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
---
|
2 |
-
title: OE Eval Bcb Evaluator
|
3 |
emoji: 🐢
|
4 |
colorFrom: green
|
5 |
colorTo: pink
|
@@ -7,4 +7,10 @@ sdk: docker
|
|
7 |
pinned: false
|
8 |
---
|
9 |
|
10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
---
|
2 |
+
title: OE Eval Bcb Evaluator lite
|
3 |
emoji: 🐢
|
4 |
colorFrom: green
|
5 |
colorTo: pink
|
|
|
7 |
pinned: false
|
8 |
---
|
9 |
|
10 |
+
# For local testing
|
11 |
+
Build the docker image for the BCB eval env.
|
12 |
+
There is a `scikit-image` wheel that takes a long time to build ...
|
13 |
+
|
14 |
+
Run the container while mounting a data volume with your generated code solutions, and mapping a port to 7860.
|
15 |
+
|
16 |
+
|
local_evaluator.py
ADDED
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import argparse
import asyncio
import json
from typing import List, Optional, Tuple

import httpx
|
6 |
+
|
7 |
+
# Module-level CLI definition; parsed in main() when run as a script.
_parser = argparse.ArgumentParser()

# Path to the generated-code samples to evaluate (JSON-Lines format).
_parser.add_argument("--filename", type=str, help="filename like data/codgen-...jsonl")
# Optional endpoint override; falls back to the local container URL when omitted.
_parser.add_argument("--remoteapi", type=str, help="remote execution API if not running local eval")
|
11 |
+
|
12 |
+
|
13 |
+
def load_jsonl(filename):
    """Read a JSON-Lines file and return its records as a list of objects."""
    records = []
    with open(filename, "r") as fh:
        for raw_line in fh:
            records.append(json.loads(raw_line.strip()))
    return records
|
16 |
+
|
17 |
+
def save_jsonl(filename, data):
    """Serialize *data* to *filename* in JSON-Lines format; return the path."""
    serialized = [json.dumps(record) for record in data]
    with open(filename, "w") as fh:
        # One record per line, newline-terminated -- same bytes as writing
        # each record and "\n" separately.
        fh.write("".join(line + "\n" for line in serialized))
    return filename
|
23 |
+
|
24 |
+
async def call_oe_eval_bcb_client(
    samples_data: List[dict],
    calibrate: bool = True,
    parallel: int = -1,
    min_time_limit: float = 1,
    max_as_limit: int = 30 * 1024,
    max_data_limit: int = 30 * 1024,
    max_stack_limit: int = 10,
    no_gt: bool = True,
    execute_api: Optional[str] = None,
) -> Tuple[Optional[List[dict]], Optional[float]]:
    """
    OE-Eval BigCodeBench remote code execution API.

    Posts *samples_data* to the evaluation endpoint and flattens the
    per-document replicate results into a single list.

    Args:
        samples_data: generated code samples to execute remotely.
        calibrate: forwarded to the eval API as a query parameter.
        parallel: execution parallelism requested from the API (-1 = API default).
        min_time_limit: minimum per-instance time limit (seconds), forwarded.
        max_as_limit: address-space limit forwarded to the API.
        max_data_limit: data-segment limit forwarded to the API.
        max_stack_limit: stack limit forwarded to the API.
        no_gt: skip ground-truth evaluation on the API side, forwarded.
        execute_api: endpoint URL; defaults to the local container endpoint.

    Returns:
        (check_results, pass_at_1): a flat list of per-replicate result dicts
        and the fraction that passed, or (None, None) when the API returned
        no eval records.

    Raises:
        httpx.HTTPStatusError: if the API responds with a 4xx/5xx status.
    """
    if execute_api is None:
        execute_api = "http://localhost:9000/evaluate/"

    async with httpx.AsyncClient() as client:
        params = {
            "calibrate": calibrate,
            "parallel": parallel,
            "min_time_limit": min_time_limit,
            "max_as_limit": max_as_limit,
            "max_data_limit": max_data_limit,
            "max_stack_limit": max_stack_limit,
            "no_gt": no_gt,
        }
        # Even for the Full BCB dataset, total execution time should not exceed
        # 5-10 min unless many instances of generated codes are particularly
        # mal-formed or slow. (per instance exec timeout is 30 sec)
        total_timeout = 900
        response = await client.post(
            execute_api, json=samples_data, params=params, timeout=total_timeout
        )
        # Fail loudly on HTTP errors instead of choking on a non-JSON body below.
        response.raise_for_status()
        results = response.json()

    print("Results received from remote API. Processing ...")
    check_results = []
    for doc in results["eval"].values():
        for rep in doc:
            # Rename API fields to the schema expected downstream.
            rep["tested_completion"] = rep.pop("solution")
            rep["passed"] = rep.pop("status") == "pass"
            rep["exec_result"] = rep.pop("details")
            check_results.append(rep)
    if not check_results:
        return None, None
    pass_at_1 = sum(rep["passed"] for rep in check_results) / len(check_results)
    return check_results, pass_at_1
|
72 |
+
|
73 |
+
def evaluate(sample_file, execute_api: Optional[str] = None):
    """Evaluate a JSONL file of generated code via the remote BCB API.

    Loads the samples, runs the (async) remote evaluation to completion,
    prints the pass@1 score, and returns the per-replicate result records.
    """
    samples = load_jsonl(sample_file)
    eval_coro = call_oe_eval_bcb_client(
        samples_data=samples,
        calibrate=True,
        parallel=-1,
        min_time_limit=30,
        execute_api=execute_api,
    )
    results, pass_at_1 = asyncio.run(eval_coro)
    print("pass@1:", pass_at_1)
    return results
|
86 |
+
|
87 |
+
def main():
    """CLI entry point: evaluate the given sample file and save the results."""
    args = _parser.parse_args()
    results = evaluate(args.filename, args.remoteapi)
    save_jsonl("data/eval_results.jsonl", results)

if __name__ == "__main__":
    main()
|
prod.sh
CHANGED
@@ -2,7 +2,7 @@
|
|
2 |
exec \
|
3 |
gunicorn \
|
4 |
-k uvicorn.workers.UvicornWorker \
|
5 |
-
--workers
|
6 |
--timeout 0 \
|
7 |
--bind 0.0.0.0:7860 \
|
8 |
--enable-stdio-inheritance \
|
|
|
2 |
exec \
|
3 |
gunicorn \
|
4 |
-k uvicorn.workers.UvicornWorker \
|
5 |
+
--workers 8 \
|
6 |
--timeout 0 \
|
7 |
--bind 0.0.0.0:7860 \
|
8 |
--enable-stdio-inheritance \
|