# Copyright 2020 The HuggingFace Evaluate Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""The SuperGLUE benchmark metric."""

import datasets
from sklearn.metrics import f1_score, matthews_corrcoef

import evaluate

from .record_evaluation import evaluate as evaluate_record


_CITATION = """\
@article{wang2019superglue,
  title={SuperGLUE: A Stickier Benchmark for General-Purpose Language Understanding Systems},
  author={Wang, Alex and Pruksachatkun, Yada and Nangia, Nikita and Singh, Amanpreet and Michael, Julian and Hill, Felix and Levy, Omer and Bowman, Samuel R},
  journal={arXiv preprint arXiv:1905.00537},
  year={2019}
}
"""

_DESCRIPTION = """\
SuperGLUE (https://super.gluebenchmark.com/) is a new benchmark styled after
GLUE with a new set of more difficult language understanding tasks, improved
resources, and a new public leaderboard.
"""

_KWARGS_DESCRIPTION = """
Compute the SuperGLUE evaluation metric associated with each SuperGLUE dataset.
Args:
    predictions: list of predictions to score. Depending on the SuperGLUE subset:
        - for 'record': list of question-answer dictionaries with the following keys:
            - 'idx': index of the question as specified by the dataset
            - 'prediction_text': the predicted answer text
        - for 'multirc': list of question-answer dictionaries with the following keys:
            - 'idx': index of the question-answer pair as specified by the dataset
            - 'prediction': the predicted answer label
        - otherwise: list of predicted labels
    references: list of reference labels. Depending on the SuperGLUE subset:
        - for 'record': list of question-answer dictionaries with the following keys:
            - 'idx': index of the question as specified by the dataset
            - 'answers': list of possible answers
        - otherwise: list of reference labels
Returns: depending on the SuperGLUE subset:
    - for 'record':
        - 'exact_match': Exact match between answer and gold answer
        - 'f1': F1 score
    - for 'multirc':
        - 'exact_match': Exact match between answer and gold answer
        - 'f1_m': Per-question macro-F1 score
        - 'f1_a': Average F1 score over all answers
    - for 'axb':
        - 'matthews_correlation': Matthews correlation coefficient
    - for 'cb':
        - 'accuracy': Accuracy
        - 'f1': F1 score
    - for all others:
        - 'accuracy': Accuracy
Examples:

    >>> super_glue_metric = evaluate.load('super_glue', 'copa')  # any of ["copa", "rte", "wic", "wsc", "wsc.fixed", "boolq", "axg"]
    >>> predictions = [0, 1]
    >>> references = [0, 1]
    >>> results = super_glue_metric.compute(predictions=predictions, references=references)
    >>> print(results)
    {'accuracy': 1.0}

    >>> super_glue_metric = evaluate.load('super_glue', 'cb')
    >>> predictions = [0, 1]
    >>> references = [0, 1]
    >>> results = super_glue_metric.compute(predictions=predictions, references=references)
    >>> print(results)
    {'accuracy': 1.0, 'f1': 1.0}

    >>> super_glue_metric = evaluate.load('super_glue', 'record')
    >>> predictions = [{'idx': {'passage': 0, 'query': 0}, 'prediction_text': 'answer'}]
    >>> references = [{'idx': {'passage': 0, 'query': 0}, 'answers': ['answer', 'another_answer']}]
    >>> results = super_glue_metric.compute(predictions=predictions, references=references)
    >>> print(results)
    {'exact_match': 1.0, 'f1': 1.0}

    >>> super_glue_metric = evaluate.load('super_glue', 'multirc')
    >>> predictions = [{'idx': {'answer': 0, 'paragraph': 0, 'question': 0}, 'prediction': 0}, {'idx': {'answer': 1, 'paragraph': 2, 'question': 3}, 'prediction': 1}]
    >>> references = [0, 1]
    >>> results = super_glue_metric.compute(predictions=predictions, references=references)
    >>> print(results)
    {'exact_match': 1.0, 'f1_m': 1.0, 'f1_a': 1.0}

    >>> super_glue_metric = evaluate.load('super_glue', 'axb')
    >>> references = [0, 1]
    >>> predictions = [0, 1]
    >>> results = super_glue_metric.compute(predictions=predictions, references=references)
    >>> print(results)
    {'matthews_correlation': 1.0}
"""


def simple_accuracy(preds, labels):
    return float((preds == labels).mean())


def acc_and_f1(preds, labels, f1_avg="binary"):
    acc = simple_accuracy(preds, labels)
    f1 = float(f1_score(y_true=labels, y_pred=preds, average=f1_avg))
    return {
        "accuracy": acc,
        "f1": f1,
    }


def evaluate_multirc(ids_preds, labels):
    """
    Computes F1 score and Exact Match for MultiRC predictions.
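
    Answer-level predictions are grouped by their (paragraph, question) pair, so a
    question counts as an exact match only when every one of its answers is labeled
    correctly. For example (illustrative values):

        >>> ids_preds = [
        ...     {"idx": {"answer": 0, "paragraph": 0, "question": 0}, "prediction": 0},
        ...     {"idx": {"answer": 1, "paragraph": 0, "question": 0}, "prediction": 1},
        ... ]
        >>> evaluate_multirc(ids_preds, [0, 1])
        {'exact_match': 1.0, 'f1_m': 1.0, 'f1_a': 1.0}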
""" question_map = {} for id_pred, label in zip(ids_preds, labels): question_id = f'{id_pred["idx"]["paragraph"]}-{id_pred["idx"]["question"]}' pred = id_pred["prediction"] if question_id in question_map: question_map[question_id].append((pred, label)) else: question_map[question_id] = [(pred, label)] f1s, ems = [], [] for question, preds_labels in question_map.items(): question_preds, question_labels = zip(*preds_labels) f1 = f1_score(y_true=question_labels, y_pred=question_preds, average="macro") f1s.append(f1) em = int(sum(p == l for p, l in preds_labels) == len(preds_labels)) ems.append(em) f1_m = float(sum(f1s) / len(f1s)) em = sum(ems) / len(ems) f1_a = float(f1_score(y_true=labels, y_pred=[id_pred["prediction"] for id_pred in ids_preds])) return {"exact_match": em, "f1_m": f1_m, "f1_a": f1_a} @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION) class SuperGlue(evaluate.Metric): def _info(self): if self.config_name not in [ "boolq", "cb", "copa", "multirc", "record", "rte", "wic", "wsc", "wsc.fixed", "axb", "axg", ]: raise KeyError( "You should supply a configuration name selected in " '["boolq", "cb", "copa", "multirc", "record", "rte", "wic", "wsc", "wsc.fixed", "axb", "axg",]' ) return evaluate.MetricInfo( description=_DESCRIPTION, citation=_CITATION, inputs_description=_KWARGS_DESCRIPTION, features=datasets.Features(self._get_feature_types()), codebase_urls=[], reference_urls=[], format="numpy" if not self.config_name == "record" and not self.config_name == "multirc" else None, ) def _get_feature_types(self): if self.config_name == "record": return { "predictions": { "idx": { "passage": datasets.Value("int64"), "query": datasets.Value("int64"), }, "prediction_text": datasets.Value("string"), }, "references": { "idx": { "passage": datasets.Value("int64"), "query": datasets.Value("int64"), }, "answers": datasets.Sequence(datasets.Value("string")), }, } elif self.config_name == "multirc": return { "predictions": { "idx": { "answer": datasets.Value("int64"), "paragraph": datasets.Value("int64"), "question": datasets.Value("int64"), }, "prediction": datasets.Value("int64"), }, "references": datasets.Value("int64"), } else: return { "predictions": datasets.Value("int64"), "references": datasets.Value("int64"), } def _compute(self, predictions, references): if self.config_name == "axb": return {"matthews_correlation": matthews_corrcoef(references, predictions)} elif self.config_name == "cb": return acc_and_f1(predictions, references, f1_avg="macro") elif self.config_name == "record": dataset = [ { "qas": [ {"id": ref["idx"]["query"], "answers": [{"text": ans} for ans in ref["answers"]]} for ref in references ] } ] predictions = {pred["idx"]["query"]: pred["prediction_text"] for pred in predictions} return evaluate_record(dataset, predictions)[0] elif self.config_name == "multirc": return evaluate_multirc(predictions, references) elif self.config_name in ["copa", "rte", "wic", "wsc", "wsc.fixed", "boolq", "axg"]: return {"accuracy": simple_accuracy(predictions, references)} else: raise KeyError( "You should supply a configuration name selected in " '["boolq", "cb", "copa", "multirc", "record", "rte", "wic", "wsc", "wsc.fixed", "axb", "axg",]' )