Spaces:

jjyang7
/

bcb_evaluator_testing

Running

jjyang77 committed on 12 days ago

Commit

da384b4

•

1 Parent(s): 0f87dc1

update samples input from file to data list

Files changed (4) hide show

.gitignore CHANGED Viewed

@@ -2,6 +2,8 @@
 **.pyc
 **/__pycache__
 # Testing data
 /data

 **.pyc
 **/__pycache__
+.hypothesis/
 # Testing data
 /data

Dockerfile CHANGED Viewed

@@ -21,7 +21,7 @@ RUN pip install --upgrade pip
 # Pre-install the dataset
 #RUN python3 -c "from bigcodebench.data import get_bigcodebench; get_bigcodebench()"
-RUN pip install fastapi gunicorn uvicorn[standard] httpx #pydantic==2.*
 RUN pip install -I --timeout 2000 -r https://github.com/bigcode-project/bigcodebench-annotation/releases/download/v0.1.0/requirements.txt

 # Pre-install the dataset
 #RUN python3 -c "from bigcodebench.data import get_bigcodebench; get_bigcodebench()"
+RUN pip install fastapi gunicorn uvicorn[standard] httpx pydantic==2.*
 RUN pip install -I --timeout 2000 -r https://github.com/bigcode-project/bigcodebench-annotation/releases/download/v0.1.0/requirements.txt

api/app.py CHANGED Viewed

@@ -7,6 +7,8 @@ from concurrent.futures import ProcessPoolExecutor, as_completed
 from typing import Dict, List, Tuple
 import gc
 from fastapi import FastAPI
 from fastapi.responses import RedirectResponse
@@ -15,6 +17,14 @@ from api.code_execution import untrusted_check
 Result = Tuple[str, List[bool]]
 def create_app() -> FastAPI:
     level = os.environ.get("LOG_LEVEL", default=logging.INFO)
@@ -33,7 +43,8 @@ def create_app() -> FastAPI:
     @app.post("/evaluate/")
     async def evaluate(
-        samples: str,
         parallel: int = -1,
         min_time_limit: float = 1,
         max_as_limit: int = 30 * 1024,
@@ -42,7 +53,7 @@ def create_app() -> FastAPI:
         no_gt: bool = True,
     ) -> dict:
         """
-        Evaluate the correctness of the solutions in the given samples file.
         """
         if parallel < 1:
             n_workers = max(1, multiprocessing.cpu_count() // 2)
@@ -71,7 +82,7 @@ def create_app() -> FastAPI:
                 solution = sample["solution"]
-                if "sanitized-calibrated" in samples:
                     solution = sample["code_prompt"] + "\n    pass\n" + solution
                 remainings.add(sample["_identifier"])
                 args = (

 from typing import Dict, List, Tuple
 import gc
+from pydantic import BaseModel
 from fastapi import FastAPI
 from fastapi.responses import RedirectResponse
 Result = Tuple[str, List[bool]]
+class SampleDate(BaseModel):
+    task_id: str
+    solution: str
+    code_prompt: str
+    test: str
+    entry_point: str
+    res_id: int
 def create_app() -> FastAPI:
     level = os.environ.get("LOG_LEVEL", default=logging.INFO)
     @app.post("/evaluate/")
     async def evaluate(
+        samples: List[SampleDate],
+        calibrate: bool = True,
         parallel: int = -1,
         min_time_limit: float = 1,
         max_as_limit: int = 30 * 1024,
         no_gt: bool = True,
     ) -> dict:
         """
+        Evaluate the correctness of the solutions in the given samples data.
         """
         if parallel < 1:
             n_workers = max(1, multiprocessing.cpu_count() // 2)
                 solution = sample["solution"]
+                if calibrate:
                     solution = sample["code_prompt"] + "\n    pass\n" + solution
                 remainings.add(sample["_identifier"])
                 args = (

api/bigcodebench_data.py CHANGED Viewed

@@ -20,27 +20,11 @@ def stream_jsonl(filename: str) -> Iterable[Dict]:
                     yield json.loads(line)
-def load_solutions(sample_path: os.PathLike) -> Iterable[Dict]:
-    """We accept two formats of inputs.
-    + `sample.jsonl` which is the format from BigCodeBench, i.e., {task_id, completion or solution}.
-    + A folder which contains sub-folders named after the task_id. Each sub-folder
-    contains samples named in `[?].py` where `?` is the solution id starting with 0.
-    Different from `sample.jsonl`, the solutions must be complete (with prompt prefix).
     """
-    # if it is a file
-    if os.path.isfile(sample_path):
-        for i, sample in enumerate(stream_jsonl(sample_path)):
-            assert "task_id" in sample, "No task_id found in sample!"
-            assert "res_id" in sample, "No res_id found in sample!"
-            assert "test" in sample, "No test found in sample!"
-            assert "solution" in sample, "No solution found in sample!"
-            assert isinstance(
-                sample["solution"], str
-            ), "Solution must be a string! If you have multiple solutions, please repeat the task_id."
-            sample["_identifier"] = (
-                sample["task_id"] + f" (line {i+1} in {sample_path})"
-            )
-            yield sample
-    else:
-        raise NotImplementedError("Only jsonl solution output file is supported for now.")

                     yield json.loads(line)
+def load_solutions(samples) -> Iterable[Dict]:
     """
+    """
+    for i, sample in enumerate(samples):
+        sample["_identifier"] = (
+            sample["task_id"] + f" (line {i+1} )"
+        )
+        yield sample