"""Plugin Evaluator."""
import json


class TEvalEvaluator:
    """This module contains the following evaluators for evaluating the
    capabilities of the various dimensions of the LLM.

    Specifically, InstructEvaluator is used to evaluate the
    instruction-following capability of the LLM, i.e. the ability of the
    model to perform tool calls according to a predefined format.
    ReasoningEvaluator is used to evaluate the model's ability to reason
    about the next execution step based on historical observations.
    PlanningEvaluator is used to evaluate the model's ability to plan a
    solution or program based on a given task. APIRetrievalEvaluator is used
    to evaluate the model's ability to retrieve a subset of tools relevant to
    the given task from a large number of tools. ReviewEvaluator is used to
    evaluate the model's ability to review whether a task was successfully
    completed.
    """

    def __init__(self, subset) -> None:
        # Imported lazily so opencompass is only required when the evaluator
        # is actually instantiated.
        from opencompass.datasets.teval.evaluators import (
            InstructEvaluator, PlanningEvaluator,
            ReasonRetrieveUnderstandEvaluator, ReviewEvaluator)

        super().__init__()
        self.subset = subset
        # English subsets.
        if subset == 'instruct':
            self.evaluator = InstructEvaluator('')
        elif subset == 'plan':
            self.evaluator = PlanningEvaluator('')
        elif subset == 'review':
            self.evaluator = ReviewEvaluator('')
        elif subset == 'reason_retrieve_understand':
            self.evaluator = ReasonRetrieveUnderstandEvaluator('')
        elif subset == 'reason':
            self.evaluator = ReasonRetrieveUnderstandEvaluator(
                '', default_prompt_type='str', eval_type='reason')
        elif subset == 'retrieve':
            self.evaluator = ReasonRetrieveUnderstandEvaluator(
                '', default_prompt_type='str', eval_type='retrieve')
        elif subset == 'understand':
            self.evaluator = ReasonRetrieveUnderstandEvaluator(
                '', default_prompt_type='str', eval_type='understand')
        # Chinese (zh) subsets.
        elif subset == 'instruct_zh':
            self.evaluator = InstructEvaluator('')
        elif subset == 'plan_zh':
            self.evaluator = PlanningEvaluator(
                '', bert_score_model='thenlper/gte-large-zh')
        elif subset == 'review_zh':
            self.evaluator = ReviewEvaluator('')
        elif subset == 'reason_retrieve_understand_zh':
            self.evaluator = ReasonRetrieveUnderstandEvaluator(
                '', bert_score_model='thenlper/gte-large-zh')
        elif subset == 'reason_zh':
            self.evaluator = ReasonRetrieveUnderstandEvaluator(
                '',
                default_prompt_type='str',
                eval_type='reason',
                bert_score_model='thenlper/gte-large-zh')
        elif subset == 'retrieve_zh':
            self.evaluator = ReasonRetrieveUnderstandEvaluator(
                '', default_prompt_type='str', eval_type='retrieve')
        elif subset == 'understand_zh':
            self.evaluator = ReasonRetrieveUnderstandEvaluator(
                '',
                default_prompt_type='str',
                eval_type='understand',
                bert_score_model='thenlper/gte-large-zh')
        else:
            raise NotImplementedError(f'Unsupported subset: {subset}')

    def score(self, predictions, references):
        if len(predictions) != len(references):
            return {
                'error': 'predictions and references have different length'
            }

        results_list = []
        for prediction, reference in zip(predictions, references):
            # Each reference is a JSON-encoded ground-truth record; attach
            # the model prediction before handing it to the sub-evaluator.
            datum = json.loads(reference)
            datum['prediction'] = prediction

            data_sample = self.evaluator._process_response(datum)
            if isinstance(data_sample, tuple):
                data_sample = data_sample[0]
            metrics_result = self.evaluator._evaluate(data_sample)
            results_list.append(metrics_result)
        results_dict = self.evaluator._post_process(results_list)
        # Scale metrics to percentages.
        results_dict = {k: v * 100 for k, v in results_dict.items()}
        return results_dict
|
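
# Usage sketch (illustrative, not part of the original module). It assumes
# the `opencompass` teval evaluators are installed and that each reference
# is a JSON-encoded ground-truth record matching the chosen subset's schema;
# the subset name and sample values below are hypothetical placeholders.
#
#     evaluator = TEvalEvaluator('plan')
#     predictions = [model_output]              # raw model responses
#     references = [json.dumps(ground_truth)]   # serialized ground truth
#     scores = evaluator.score(predictions, references)
#     print(scores)  # metric values are scaled to the 0-100 range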