import json

from datasets import Dataset

from opencompass.openicl.icl_evaluator import BaseEvaluator
from opencompass.registry import LOAD_DATASET

from ..base import BaseDataset
from .evaluation_main import (InputExample, test_instruction_following_loose,
                              test_instruction_following_strict)


@LOAD_DATASET.register_module()
class IFEvalDataset(BaseDataset):

    @staticmethod
    def load(path):
        """Load the IFEval JSONL file at `path` into a `Dataset`.

        Each line holds one example; the full record is kept as
        `reference` so IFEvaluator can recover the instruction
        metadata later.
        """
        datasets = []
        with open(path, 'r', encoding='utf-8') as file:
            for line in file:
                tmp = json.loads(line.strip())
                dataset = dict(prompt=tmp['prompt'], reference=tmp)
                datasets.append(dataset)
        return Dataset.from_list(datasets)
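
# A minimal sketch of the JSONL record `load` expects. The field values
# below are illustrative, not taken from the shipped data file; `key`,
# `prompt`, `instruction_id_list` and `kwargs` are the fields IFEvaluator
# reads back out of `reference`:
#
#   {"key": 1000,
#    "prompt": "Write a 300+ word summary ...",
#    "instruction_id_list": ["length_constraints:number_words"],
#    "kwargs": [{"relation": "at least", "num_words": 300}]}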


class IFEvaluator(BaseEvaluator):

    def score(self, predictions, references):
        """Run IFEval's strict and loose instruction-following checks.

        Strict checking tests the raw response; loose checking retries
        with relaxed variants of the response (e.g. with markdown
        emphasis or the first/last line stripped) before marking an
        instruction as not followed.
        """
        results = dict()
        for metric in ('strict', 'loose'):
            results[metric] = []
        for pred, refer in zip(predictions, references):
            inp = InputExample(
                key=refer['key'],
                instruction_id_list=refer['instruction_id_list'],
                prompt=refer['prompt'],
                kwargs=refer['kwargs'])
            # Drop unset (None) kwargs so each instruction checker
            # falls back to its own defaults.
            for kwarg in inp.kwargs:
                for k in list(kwarg.keys()):
                    if kwarg[k] is None:
                        kwarg.pop(k, None)
            results['strict'].append(
                test_instruction_following_strict(inp, pred))
            results['loose'].append(
                test_instruction_following_loose(inp, pred))

        final_scores = dict()
        for metric in ('strict', 'loose'):
            prompt_total = 0
            prompt_correct = 0
            inst_total = 0
            inst_correct = 0

            for example in results[metric]:
                follow_instruction_list = example.follow_instruction_list
                instruction_id_list = example.instruction_id_list

                # Prompt-level accuracy: a prompt counts as correct only
                # if every instruction in it is followed.
                prompt_total += 1
                if all(follow_instruction_list):
                    prompt_correct += 1

                # Instruction-level accuracy: each instruction is
                # counted individually.
                inst_total += len(instruction_id_list)
                inst_correct += sum(follow_instruction_list)

            prompt_score = f'Prompt-level-{metric}-accuracy'
            inst_score = f'Inst-level-{metric}-accuracy'
            final_scores[prompt_score] = prompt_correct / prompt_total * 100
            final_scores[inst_score] = inst_correct / inst_total * 100
        return final_scores
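
# A minimal usage sketch. The path and prediction string are assumptions
# for illustration; in practice OpenCompass drives this evaluator through
# its config system rather than direct calls:
#
#   data = IFEvalDataset.load('data/ifeval/input_data.jsonl')
#   scores = IFEvaluator().score(
#       predictions=['model response ...'],
#       references=[row['reference'] for row in data])
#   # -> {'Prompt-level-strict-accuracy': ..., 'Inst-level-strict-accuracy': ...,
#   #     'Prompt-level-loose-accuracy': ..., 'Inst-level-loose-accuracy': ...}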