# oe-eval-bcb-lite-evaluator / local_evaluator.py
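"""Evaluate BigCodeBench code-generation results and report pass@1.

Reads a JSONL file of generated samples, submits them to an OE-Eval
BigCodeBench execution API (a local server by default, or a remote one
via --remoteapi), and writes per-instance results to data/eval_results.jsonl.
"""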
import argparse
import asyncio
import json
from typing import List, Optional, Tuple

import httpx

_parser = argparse.ArgumentParser()
_parser.add_argument("--filename", type=str, help="input JSONL file, e.g. data/codgen-...jsonl")
_parser.add_argument("--remoteapi", type=str, help="remote execution API URL; omit to run against the local server")

def load_jsonl(filename):
    """Read a JSONL file into a list of dicts, one per line."""
    with open(filename, "r") as file:
        return [json.loads(line.strip()) for line in file]

def save_jsonl(filename, data):
    """Write a list of dicts to a JSONL file, one per line."""
    with open(filename, "w") as file:
        for d in data:
            file.write(json.dumps(d))
            file.write("\n")
    return filename
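# The input JSONL schema is defined by the execution API (not shown here);
# each record in the API's response is expected to carry "solution", "status",
# and "details" fields, which are renamed in call_oe_eval_bcb_client below.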

async def call_oe_eval_bcb_client(
    samples_data: List[dict],
    calibrate: bool = True,
    parallel: int = -1,
    min_time_limit: float = 1,
    max_as_limit: int = 30 * 1024,
    max_data_limit: int = 30 * 1024,
    max_stack_limit: int = 10,
    no_gt: bool = True,
    execute_api: Optional[str] = None,
) -> Tuple[Optional[List[dict]], Optional[float]]:
    """
    Call the OE-Eval BigCodeBench remote code execution API and return
    (per-instance results, pass@1), or (None, None) if nothing was evaluated.
    """
if execute_api is None:
execute_api = "http://localhost:9000/evaluate/"
async with httpx.AsyncClient() as client:
params = {
"calibrate": calibrate,
"parallel": parallel,
"min_time_limit": min_time_limit,
"max_as_limit": max_as_limit,
"max_data_limit": max_data_limit,
"max_stack_limit": max_stack_limit,
"no_gt": no_gt,
}
        # Even for the full BCB dataset, total execution time should not exceed
        # 5-10 minutes unless many of the generated programs are malformed or
        # slow (the per-instance execution timeout is 30 seconds).
        total_timeout = 900
        response = await client.post(
            execute_api, json=samples_data, params=params, timeout=total_timeout
        )
        response.raise_for_status()
        results = response.json()

    print("Results received from remote API. Processing ...")

    # Flatten per-task replications and rename fields for downstream use.
    check_results = []
    for doc in results["eval"].values():
        for rep in doc:
            rep["tested_completion"] = rep.pop("solution")
            rep["passed"] = rep.pop("status") == "pass"
            rep["exec_result"] = rep.pop("details")
            check_results.append(rep)

    if not check_results:
        return None, None
    pass_at_1 = sum(rep["passed"] for rep in check_results) / len(check_results)
    return check_results, pass_at_1

def evaluate(sample_file, execute_api: Optional[str] = None):
    batched_code_test = load_jsonl(sample_file)
    results, pass_at_1 = asyncio.run(
        call_oe_eval_bcb_client(
            samples_data=batched_code_test,
            calibrate=True,
            parallel=-1,
            min_time_limit=30,
            execute_api=execute_api,
        )
    )
    print("pass@1:", pass_at_1)
    return results
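# Programmatic use (hypothetical sample path; the URL is the local default):
#   results = evaluate("data/my_samples.jsonl", execute_api="http://localhost:9000/evaluate/")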

def main():
    args = _parser.parse_args()
    results = evaluate(args.filename, args.remoteapi)
    if results is None:
        print("No results returned; nothing to save.")
        return
    save_jsonl("data/eval_results.jsonl", results)

if __name__ == "__main__":
    main()