"""Command-line client for the OE-Eval BigCodeBench remote code execution API."""

import argparse
import asyncio
import json
from typing import List, Optional, Tuple

import httpx

_parser = argparse.ArgumentParser()
_parser.add_argument("--filename", type=str, help="filename like data/codgen-...jsonl")
_parser.add_argument(
    "--remoteapi", type=str, help="remote execution API if not running local eval"
)


def load_jsonl(filename):
    """Read a JSON Lines file and return its records as a list.

    Lines that are empty after stripping whitespace are skipped, so a blank
    or trailing empty line does not make ``json.loads`` fail.
    """
    with open(filename, "r", encoding="utf-8") as file:
        return [json.loads(line) for line in map(str.strip, file) if line]


def save_jsonl(filename, data):
    """Write each item of ``data`` as one JSON document per line.

    Returns:
        The ``filename`` that was written, unchanged.
    """
    with open(filename, "w", encoding="utf-8") as file:
        file.writelines(json.dumps(record) + "\n" for record in data)
    return filename


def _normalize_eval_results(results: dict) -> List[dict]:
    """Flatten every report list under ``results["eval"]`` and rename its keys.

    Each report dict is modified in place: ``solution`` becomes
    ``tested_completion``, ``status`` becomes the boolean ``passed`` (true only
    for the value ``"pass"``), and ``details`` becomes ``exec_result``.
    """
    check_results = []
    for reports in results["eval"].values():
        for rep in reports:
            rep["tested_completion"] = rep.pop("solution")
            rep["passed"] = rep.pop("status") == "pass"
            rep["exec_result"] = rep.pop("details")
            check_results.append(rep)
    return check_results


async def call_oe_eval_bcb_client(
    samples_data: List[dict],
    calibrate: bool = True,
    parallel: int = -1,
    min_time_limit: float = 1,
    max_as_limit: int = 30 * 1024,
    max_data_limit: int = 30 * 1024,
    max_stack_limit: int = 10,
    no_gt: bool = True,
    execute_api: Optional[str] = None,
) -> Tuple[Optional[List[dict]], Optional[float]]:
    """
    OE-Eval BigCodeBench remote code execution API

    Posts ``samples_data`` as the JSON body to ``execute_api`` and converts the
    response into a flat list of per-sample reports.

    Args:
        samples_data: Records to evaluate; sent unchanged as the request body.
        calibrate, parallel, min_time_limit, max_as_limit, max_data_limit,
            max_stack_limit, no_gt: Forwarded verbatim as query parameters;
            their meaning is defined by the server.
        execute_api: Endpoint URL. Defaults to
            ``http://localhost:9000/evaluate/`` when ``None``.

    Returns:
        ``(check_results, pass_at_1)`` where ``pass_at_1`` is the fraction of
        reports whose status was ``"pass"``, or ``(None, None)`` when the
        response contains no reports.

    Raises:
        httpx.HTTPStatusError: If the server answers with a 4xx/5xx status.
    """
    if execute_api is None:
        execute_api = "http://localhost:9000/evaluate/"

    params = {
        "calibrate": calibrate,
        "parallel": parallel,
        "min_time_limit": min_time_limit,
        "max_as_limit": max_as_limit,
        "max_data_limit": max_data_limit,
        "max_stack_limit": max_stack_limit,
        "no_gt": no_gt,
    }
    # Even for the Full BCB dataset, total execution time should not exceed 5-10 min unless many instances of
    # generated codes are particularly mal-formed or slow. (per instance exec timeout is 30 sec)
    total_timeout = 900

    async with httpx.AsyncClient() as client:
        response = await client.post(
            execute_api, json=samples_data, params=params, timeout=total_timeout
        )
        # Fail with a clear HTTP error instead of a confusing JSON/KeyError
        # further down when the server rejects or fails the request.
        response.raise_for_status()
        results = response.json()

    print("Results received from remote API. Processing ...")
    check_results = _normalize_eval_results(results)

    if not check_results:
        return None, None
    pass_at_1 = sum(rep["passed"] for rep in check_results) / len(check_results)
    return check_results, pass_at_1


def evaluate(sample_file, execute_api: Optional[str] = None):
    """Evaluate the samples stored in ``sample_file`` and print pass@1.

    Args:
        sample_file: Path to a JSON Lines file of samples to submit.
        execute_api: Endpoint URL passed through to the client; ``None`` uses
            the client's default.

    Returns:
        The list of per-sample reports, or ``None`` when the API returned none.
    """
    batched_code_test = load_jsonl(sample_file)
    results, pass_at_1 = asyncio.run(
        call_oe_eval_bcb_client(
            samples_data=batched_code_test,
            calibrate=True,
            parallel=-1,
            min_time_limit=30,
            execute_api=execute_api,
        )
    )
    print("pass@1:", pass_at_1)
    return results


def main():
    """Parse CLI arguments, run the evaluation and save the reports."""
    args = _parser.parse_args()
    if not args.filename:
        # Without this check a missing path only fails later inside open().
        _parser.error("--filename is required")
    results = evaluate(args.filename, args.remoteapi)
    # evaluate() returns None when there were no reports; write an empty file
    # rather than crashing while iterating None.
    save_jsonl("data/eval_results.jsonl", results or [])


if __name__ == "__main__":
    main()