|
from typing import List, Optional, Union |
|
|
|
from opencompass.openicl.icl_evaluator import BaseEvaluator |
|
from opencompass.registry import LOAD_DATASET |
|
|
|
from .ds1000 import DS1000Dataset |
|
|
|
|
|
@LOAD_DATASET.register_module() |
|
class DS1000Dataset_Interperter(DS1000Dataset): |
|
"""Code interpreter version of DS1000.""" |
|
|
|
def load( |
|
self, |
|
path: str, |
|
libs: Optional[Union[str, list]] = None, |
|
mode: str = 'Insertion', |
|
): |
|
dataset = super().load(path, libs, mode) |
|
|
|
def preprocess(example): |
|
"""Get rid of unnecessary code block in prompt.""" |
|
prompt = example.pop('prompt') |
|
example['prompt'] = prompt[:prompt.find('A:\n')].strip() |
|
return example |
|
|
|
dataset = dataset.map(preprocess) |
|
return dataset |
|
|
|
|
|
class DS1000InterpreterEvaluator(BaseEvaluator): |
|
"""DS1000 interpreter evaluator. |
|
|
|
Args: |
|
action (str): Action for catching internal prediction. |
|
Defaults to `PythonInterpreter`. |
|
""" |
|
|
|
def __init__(self, action: str = 'PythonInterpreter'): |
|
self.action = action |
|
|
|
def get_action(self, step): |
|
for s in step[::-1]: |
|
if s['type'] == self.action: |
|
return s |
|
|
|
def score(self, predictions: List, references: List, steps: List): |
|
"""Calculate accuracy.""" |
|
|
|
action_scope = 0 |
|
follow_scope = 0 |
|
soft_success = 0 |
|
success = 0 |
|
total = len(references) |
|
for step in steps: |
|
s = self.get_action(step) |
|
if s: |
|
action_scope += 1 |
|
if not s['errmsg']: |
|
soft_success += 1 |
|
|
|
|
|
if s['args'] and 'assert' in s['args']['text']: |
|
follow_scope += 1 |
|
|
|
if s['result']: |
|
success += s['result']['text'] == 'True' |
|
|
|
result = dict( |
|
action_pct=100 * action_scope / total, |
|
soft_code_acc=100 * soft_success / total, |
|
follow_acc=100 * follow_scope / total, |
|
code_acc=100 * success / total, |
|
) |
|
return result |
|
|