# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Segmentation scores evaluation metrics"""

import evaluate
import datasets

# TODO: Add BibTeX citation
_CITATION = """\
@InProceedings{huggingface:module,
title = {A great new module},
authors={huggingface, Inc.},
year={2020}
}
"""

_DESCRIPTION = """\
This module computes segmentation scores for a list of predicted segmentations and gold segmentations.
"""

_KWARGS_DESCRIPTION = """
Calculates the quality of predicted segmentations using boundary, token and type scores.
Args:
    predictions: list of segmented utterances to score. Each prediction should be a string of
        phonemes separated by spaces, with predicted word boundaries denoted by the token ';eword'.
    references: list of gold segmented utterances. Each reference should be a string of
        phonemes separated by spaces, with gold word boundaries denoted by the token ';eword'.
Returns:
    type_fscore: lexicon f1 score
    type_precision: lexicon precision
    type_recall: lexicon recall
    token_fscore: token f1 score
    token_precision: token precision
    token_recall: token recall
    boundary_all_fscore: boundary f1 score, including utterance boundaries
    boundary_all_precision: boundary precision, including utterance boundaries
    boundary_all_recall: boundary recall, including utterance boundaries
    boundary_noedge_fscore: boundary f1 score, excluding utterance boundaries
    boundary_noedge_precision: boundary precision, excluding utterance boundaries
    boundary_noedge_recall: boundary recall, excluding utterance boundaries
Examples:
    >>> segmentation_scores = evaluate.load("transformersegmentation/segmentation_scores")
    >>> results = segmentation_scores.compute(references=["w ɛ ɹ ;eword ɪ z ;eword ð ɪ s ;eword", "l ɪ ɾ əl ;eword aɪ z ;eword"], predictions=["w ɛ ɹ ;eword ɪ z ;eword ð ɪ s ;eword", "l ɪ ɾ əl ;eword aɪ z ;eword"])
    >>> print(results)
    {'type_fscore': 1.0, 'type_precision': 1.0, 'type_recall': 1.0, 'token_fscore': 1.0, 'token_precision': 1.0, 'token_recall': 1.0, 'boundary_all_fscore': 1.0, 'boundary_all_precision': 1.0, 'boundary_all_recall': 1.0, 'boundary_noedge_fscore': 1.0, 'boundary_noedge_precision': 1.0, 'boundary_noedge_recall': 1.0}
"""


class TokenEvaluation(object):
    """Evaluation of token f-score, precision and recall"""

    def __init__(self):
        self.test = 0
        self.gold = 0
        self.correct = 0
        self.n = 0
        self.n_exactmatch = 0

    def precision(self):
        return float(self.correct) / self.test if self.test != 0 else None

    def recall(self):
        return float(self.correct) / self.gold if self.gold != 0 else None

    def fscore(self):
        total = self.test + self.gold
        return float(2 * self.correct) / total if total != 0 else None

    def exact_match(self):
        return float(self.n_exactmatch) / self.n if self.n else None

    def update(self, test_set, gold_set):
        self.n += 1

        if test_set == gold_set:
            self.n_exactmatch += 1

        # omit empty items for type scoring (should not affect token
        # scoring). Type lists are prepared with '_' where there is no
        # match, to keep list lengths the same
        self.test += len([x for x in test_set if x != "_"])
        self.gold += len([x for x in gold_set if x != "_"])
        self.correct += len(test_set & gold_set)

    def update_lists(self, test_sets, gold_sets):
        if len(test_sets) != len(gold_sets):
            raise ValueError(
                "#words different in test and gold: {} != {}".format(
                    len(test_sets), len(gold_sets)
                )
            )

        for t, g in zip(test_sets, gold_sets):
            self.update(t, g)
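# Illustrative sketch (not part of the metric): token scoring compares sets of
# (start, stop) word spans per utterance. If the gold spans are
# {(0, 3), (3, 5)} and the predicted spans are {(0, 3), (3, 4), (4, 5)}, only
# (0, 3) matches, so correct=1, test=3, gold=2, giving precision = 1/3,
# recall = 1/2 and fscore = 2*1 / (3+2) = 0.4:
#
#     token_eval = TokenEvaluation()
#     token_eval.update_lists([{(0, 3), (3, 4), (4, 5)}], [{(0, 3), (3, 5)}])
#     assert token_eval.precision() == 1 / 3
#     assert token_eval.recall() == 1 / 2
#     assert token_eval.fscore() == 0.4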
class TypeEvaluation(TokenEvaluation):
    """Evaluation of type f-score, precision and recall"""

    @staticmethod
    def lexicon_check(textlex, goldlex):
        """Compare hypothesis and gold lexicons"""
        textlist = []
        goldlist = []
        for w in textlex:
            if w in goldlex:
                # set up matching lists for the true positives
                textlist.append(w)
                goldlist.append(w)
            else:
                # false positives
                textlist.append(w)
                # ensure matching null element in gold list
                goldlist.append("_")

        for w in goldlex:
            if w not in goldlist:
                # now for the false negatives
                goldlist.append(w)
                # ensure matching null element in text list
                textlist.append("_")

        textset = [{w} for w in textlist]
        goldset = [{w} for w in goldlist]
        return textset, goldset

    def update_lists(self, text, gold):
        lt, lg = self.lexicon_check(text, gold)
        super(TypeEvaluation, self).update_lists(lt, lg)


class BoundaryEvaluation(TokenEvaluation):
    @staticmethod
    def get_boundary_positions(stringpos):
        return [{idx for pair in line for idx in pair} for line in stringpos]

    def update_lists(self, text, gold):
        lt = self.get_boundary_positions(text)
        lg = self.get_boundary_positions(gold)
        super(BoundaryEvaluation, self).update_lists(lt, lg)


class BoundaryNoEdgeEvaluation(BoundaryEvaluation):
    @staticmethod
    def get_boundary_positions(stringpos):
        return [{left for left, _ in line if left > 0} for line in stringpos]


class _StringPos(object):
    """Compute start and stop index of words in an utterance"""

    def __init__(self):
        self.idx = 0

    def __call__(self, n):
        """Return the position of the current word given its length `n`"""
        start = self.idx
        self.idx += n
        return start, self.idx
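# Illustrative sketch (not part of the metric): _StringPos turns word lengths
# into cumulative (start, stop) phone spans. For the utterance
# "w ɛ ɹ ;eword ɪ z ;eword" the words are "wɛɹ" and "ɪz", so:
#
#     idx = _StringPos()
#     idx(3)  # -> (0, 3), span of "wɛɹ"
#     idx(2)  # -> (3, 5), span of "ɪz"
#
# BoundaryEvaluation then scores the full set of span edges {0, 3, 5}, while
# BoundaryNoEdgeEvaluation keeps only the word-internal boundary {3},
# discarding the utterance edges.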
@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class segmentation_scores(evaluate.Metric):
    """Computes type, token and boundary scores for predicted word segmentations."""

    def _info(self):
        return evaluate.MetricInfo(
            # This is the description that will appear on the modules page.
            module_type="metric",
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            # This defines the format of each prediction and reference
            features=datasets.Features(
                {
                    "predictions": datasets.Value("string"),
                    "references": datasets.Value("string"),
                }
            ),
            # Homepage of the module for documentation
            homepage="https://huggingface.co/spaces/transformersegmentation/segmentation_scores",
            # Additional links to the codebase or references
            codebase_urls=["http://github.com/codebyzeb/transformersegmentation"],
        )

    def _download_and_prepare(self, dl_manager):
        """Optional: download external resources useful to compute the scores"""
        pass

    def _process_data(self, text):
        """Load text data for evaluation

        Parameters
        ----------
        text : list of str
            The list of utterances to read for the evaluation.

        Returns
        -------
        (words, positions, lexicon) : three lists where `words` are the
            input utterances with word separators removed, `positions`
            stores the start/stop index of each word for each utterance,
            and `lexicon` is the sorted list of words.

        """
        words = []
        positions = []
        lexicon = {}

        # ignore empty lines
        for utt in (utt for utt in text if utt.strip()):
            # list of phones in the utterance with word separators removed
            phone_in_utterance = [
                phone for phone in utt.split(" ") if phone != ";eword"
            ]
            words_in_utterance = (
                "".join(
                    " " if phone == ";eword" else phone for phone in utt.split(" ")
                )
                .strip()
                .split(" ")
            )

            words.append(phone_in_utterance)
            for word in words_in_utterance:
                lexicon[word] = 1
            idx = _StringPos()
            positions.append({idx(len(word)) for word in words_in_utterance})

        # return the words lexicon as a sorted list
        lexicon = sorted(lexicon.keys())
        return words, positions, lexicon

    def _compute(self, predictions, references):
        """Scores a segmented text against its gold version

        Parameters
        ----------
        predictions : sequence of str
            The predicted utterances, each a string of phonemes using
            ';eword' as the word separator.
        references : sequence of str
            The gold utterances, each a string of phonemes using
            ';eword' as the word separator.

        Returns
        -------
        scores : dict
            A dictionary with the following entries:

            * 'type_fscore'
            * 'type_precision'
            * 'type_recall'
            * 'token_fscore'
            * 'token_precision'
            * 'token_recall'
            * 'boundary_all_fscore'
            * 'boundary_all_precision'
            * 'boundary_all_recall'
            * 'boundary_noedge_fscore'
            * 'boundary_noedge_precision'
            * 'boundary_noedge_recall'

        Raises
        ------
        ValueError
            If `predictions` and `references` have different sizes or differ
            in their phonemes.

        """
        text_words, text_stringpos, text_lex = self._process_data(predictions)
        gold_words, gold_stringpos, gold_lex = self._process_data(references)

        if len(gold_words) != len(text_words):
            raise ValueError(
                "gold and predicted have different sizes: len(gold)={}, len(predicted)={}".format(
                    len(gold_words), len(text_words)
                )
            )

        for i, (g, t) in enumerate(zip(gold_words, text_words)):
            if g != t:
                raise ValueError(
                    'gold and predicted differ at line {}: gold="{}", predicted="{}"'.format(
                        i + 1, g, t
                    )
                )

        # token evaluation
        token_eval = TokenEvaluation()
        token_eval.update_lists(text_stringpos, gold_stringpos)

        # type evaluation
        type_eval = TypeEvaluation()
        type_eval.update_lists(text_lex, gold_lex)

        # boundary evaluation (with edges)
        boundary_eval = BoundaryEvaluation()
        boundary_eval.update_lists(text_stringpos, gold_stringpos)

        # boundary evaluation (no edges)
        boundary_noedge_eval = BoundaryNoEdgeEvaluation()
        boundary_noedge_eval.update_lists(text_stringpos, gold_stringpos)

        return {
            "token_precision": token_eval.precision(),
            "token_recall": token_eval.recall(),
            "token_fscore": token_eval.fscore(),
            "type_precision": type_eval.precision(),
            "type_recall": type_eval.recall(),
            "type_fscore": type_eval.fscore(),
            "boundary_all_precision": boundary_eval.precision(),
            "boundary_all_recall": boundary_eval.recall(),
            "boundary_all_fscore": boundary_eval.fscore(),
            "boundary_noedge_precision": boundary_noedge_eval.precision(),
            "boundary_noedge_recall": boundary_noedge_eval.recall(),
            "boundary_noedge_fscore": boundary_noedge_eval.fscore(),
        }
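
if __name__ == "__main__":
    # Minimal smoke test, a sketch for local debugging only: it instantiates
    # the metric class directly and calls the private `_compute` method,
    # bypassing the usual `evaluate.load(...)` / `.compute(...)` entry point.
    metric = segmentation_scores()
    # Prediction splits "ɪ z" into two words while the reference keeps it as
    # one, so token and boundary scores fall below 1.0.
    scores = metric._compute(
        predictions=["w ɛ ɹ ;eword ɪ ;eword z ;eword ð ɪ s ;eword"],
        references=["w ɛ ɹ ;eword ɪ z ;eword ð ɪ s ;eword"],
    )
    print(scores)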