# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Segmentation scores evaluation metrics""" | |
import evaluate | |
import datasets | |

# TODO: Add BibTeX citation
_CITATION = """\
@InProceedings{huggingface:module,
title = {A great new module},
authors={huggingface, Inc.},
year={2020}
}
"""

# TODO: Add description of the module here
_DESCRIPTION = """\
This module computes segmentation scores for a list of predicted segmentations and gold segmentations.
"""

# TODO: Add description of the arguments of the module here
_KWARGS_DESCRIPTION = """
Calculates how good predicted segmentations are, using boundary, token and type scores.
Args:
    predictions: list of segmented utterances to score. Each prediction
        should be a string with phonemes separated by spaces and estimated word boundaries
        denoted by the token ';eword'.
    references: list of segmented utterances to score. Each reference
        should be a string with phonemes separated by spaces and gold word boundaries
        denoted by the token ';eword'.
Returns:
    type_fscore: lexicon f1 score
    type_precision: lexicon precision
    type_recall: lexicon recall
    token_fscore: token f1 score
    token_precision: token precision
    token_recall: token recall
    boundary_all_fscore: boundary f1 score, including utterance boundaries
    boundary_all_precision: boundary precision, including utterance boundaries
    boundary_all_recall: boundary recall, including utterance boundaries
    boundary_noedge_fscore: boundary f1 score, excluding utterance boundaries
    boundary_noedge_precision: boundary precision, excluding utterance boundaries
    boundary_noedge_recall: boundary recall, excluding utterance boundaries
Examples:
    >>> segmentation_scores = evaluate.load("transformersegmentation/segmentation_scores")
    >>> results = segmentation_scores.compute(references=["w ɛ ɹ ;eword ɪ z ;eword ð ɪ s ;eword", "l ɪ ɾ əl ;eword aɪ z ;eword"], predictions=["w ɛ ɹ ;eword ɪ z ;eword ð ɪ s ;eword", "l ɪ ɾ əl ;eword aɪ z ;eword"])
    >>> print(results)
    {'type_fscore': 1.0, 'type_precision': 1.0, 'type_recall': 1.0, 'token_fscore': 1.0, 'token_precision': 1.0, 'token_recall': 1.0, 'boundary_all_fscore': 1.0, 'boundary_all_precision': 1.0, 'boundary_all_recall': 1.0, 'boundary_noedge_fscore': 1.0, 'boundary_noedge_precision': 1.0, 'boundary_noedge_recall': 1.0}
"""


class TokenEvaluation(object):
    """Evaluation of token f-score, precision and recall"""

    def __init__(self):
        self.test = 0
        self.gold = 0
        self.correct = 0
        self.n = 0
        self.n_exactmatch = 0

    def precision(self):
        return float(self.correct) / self.test if self.test != 0 else None

    def recall(self):
        return float(self.correct) / self.gold if self.gold != 0 else None

    def fscore(self):
        total = self.test + self.gold
        return float(2 * self.correct) / total if total != 0 else None

    def exact_match(self):
        return float(self.n_exactmatch) / self.n if self.n else None

    def update(self, test_set, gold_set):
        self.n += 1

        if test_set == gold_set:
            self.n_exactmatch += 1

        # omit empty items for type scoring (should not affect token
        # scoring). Type lists are prepared with '_' where there is no
        # match, to keep list lengths the same
        self.test += len([x for x in test_set if x != "_"])
        self.gold += len([x for x in gold_set if x != "_"])
        self.correct += len(test_set & gold_set)

    def update_lists(self, test_sets, gold_sets):
        if len(test_sets) != len(gold_sets):
            raise ValueError(
                "#words different in test and gold: {} != {}".format(
                    len(test_sets), len(gold_sets)
                )
            )

        for t, g in zip(test_sets, gold_sets):
            self.update(t, g)
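

# Illustrative sketch (not part of the original script): token scoring
# compares per-utterance sets of (start, stop) word spans. For the gold
# segmentation "ab c" against the predicted segmentation "a bc" (same
# phones "a b c"):
#
#   gold spans: {(0, 2), (2, 3)}
#   test spans: {(0, 1), (1, 3)}
#
# No span appears in both sets, so correct = len(test & gold) = 0 and token
# precision, recall and f-score are all 0.0, even though the phones match.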


class TypeEvaluation(TokenEvaluation):
    """Evaluation of type f-score, precision and recall"""

    @staticmethod
    def lexicon_check(textlex, goldlex):
        """Compare hypothesis and gold lexicons"""
        textlist = []
        goldlist = []
        for w in textlex:
            if w in goldlex:
                # set up matching lists for the true positives
                textlist.append(w)
                goldlist.append(w)
            else:
                # false positives
                textlist.append(w)
                # ensure matching null element in text list
                goldlist.append("_")

        for w in goldlex:
            if w not in goldlist:
                # now for the false negatives
                goldlist.append(w)
                # ensure matching null element in text list
                textlist.append("_")

        textset = [{w} for w in textlist]
        goldset = [{w} for w in goldlist]
        return textset, goldset

    def update_lists(self, text, gold):
        lt, lg = self.lexicon_check(text, gold)
        super(TypeEvaluation, self).update_lists(lt, lg)
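

# Illustrative sketch (not part of the original script): lexicon_check pads
# the two lexicons with '_' so they align one-to-one. For example,
# lexicon_check(["a", "b"], ["a", "c"]) returns
#
#   textset = [{"a"}, {"b"}, {"_"}]
#   goldset = [{"a"}, {"_"}, {"c"}]
#
# and the inherited update_lists() then counts one true positive ("a") out
# of two test types and two gold types, giving type precision = recall = 0.5.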


class BoundaryEvaluation(TokenEvaluation):
    @staticmethod
    def get_boundary_positions(stringpos):
        return [{idx for pair in line for idx in pair} for line in stringpos]

    def update_lists(self, text, gold):
        lt = self.get_boundary_positions(text)
        lg = self.get_boundary_positions(gold)
        super(BoundaryEvaluation, self).update_lists(lt, lg)


class BoundaryNoEdgeEvaluation(BoundaryEvaluation):
    @staticmethod
    def get_boundary_positions(stringpos):
        return [{left for left, _ in line if left > 0} for line in stringpos]
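

# Illustrative sketch (not part of the original script): for the word spans
# {(0, 2), (2, 3)}, BoundaryEvaluation collects every span endpoint, giving
# {0, 2, 3}, so utterance edges are always counted as correct boundaries.
# BoundaryNoEdgeEvaluation keeps only span starts greater than zero, giving
# {2}: the word-internal boundaries that the segmenter actually has to find.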


class _StringPos(object):
    """Compute start and stop index of words in an utterance"""

    def __init__(self):
        self.idx = 0

    def __call__(self, n):
        """Return the position of the current word given its length `n`"""
        start = self.idx
        self.idx += n
        return start, self.idx
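

# Illustrative sketch (not part of the original script): each call to a
# _StringPos instance advances a cursor by the word length it is given, so
# successive calls yield contiguous (start, stop) spans:
#
#   idx = _StringPos()
#   idx(2)  # -> (0, 2)
#   idx(3)  # -> (2, 5)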


class segmentation_scores(evaluate.Metric):
    """TODO: Short description of my evaluation module."""

    def _info(self):
        # TODO: Specifies the evaluate.EvaluationModuleInfo object
        return evaluate.MetricInfo(
            # This is the description that will appear on the modules page.
            module_type="metric",
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            # This defines the format of each prediction and reference
            features=datasets.Features({
                'predictions': datasets.Value('string'),
                'references': datasets.Value('string'),
            }),
            # Homepage of the module for documentation
            homepage="https://huggingface.co/spaces/transformersegmentation/segmentation_scores",
            # Additional links to the codebase or references
            codebase_urls=["http://github.com/codebyzeb/transformersegmentation"],
            reference_urls=["http://path.to.reference.url/new_module"]
        )

    def _download_and_prepare(self, dl_manager):
        """Optional: download external resources useful to compute the scores"""
        # TODO: Download external resources if needed
        pass

    def _process_data(self, text):
        """Load text data for evaluation

        Parameters
        ----------
        text : list of str
            The list of utterances to read for the evaluation.

        Returns
        -------
        (words, positions, lexicon) : three lists
            where `words` are the input utterances with word separators
            removed, `positions` stores the start/stop index of each word
            for each utterance, and `lexicon` is the list of words.
        """
        words = []
        positions = []
        lexicon = {}

        # ignore empty lines
        for utt in (utt for utt in text if utt.strip()):
            # list of phones in the utterance with word separators removed
            phone_in_utterance = [
                phone for phone in utt.split(" ") if phone != ";eword"
            ]
            words_in_utterance = (
                "".join(
                    " " if phone == ";eword" else phone for phone in utt.split(" ")
                )
                .strip()
                .split(" ")
            )

            words.append(phone_in_utterance)
            for word in words_in_utterance:
                lexicon[word] = 1
            idx = _StringPos()
            positions.append({idx(len(word)) for word in words_in_utterance})

        # return the word lexicon as a sorted list
        lexicon = sorted([k for k in lexicon.keys()])
        return words, positions, lexicon
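
    # Illustrative sketch (not part of the original script): for the input
    # ["a b ;eword c ;eword"], _process_data returns
    #
    #   words     = [["a", "b", "c"]]     # phones, separators removed
    #   positions = [{(0, 2), (2, 3)}]    # start/stop span of each word
    #   lexicon   = ["ab", "c"]           # sorted list of distinct words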

    def _compute(self, predictions, references):
        """Scores a segmented text against its gold version

        Parameters
        ----------
        predictions : sequence of str
            A suite of word utterances, each string using ';eword' as word separator.
        references : sequence of str
            A suite of word utterances, each string using ';eword' as word separator.

        Returns
        -------
        scores : dict
            A dictionary with the following entries:

            * 'type_fscore'
            * 'type_precision'
            * 'type_recall'
            * 'token_fscore'
            * 'token_precision'
            * 'token_recall'
            * 'boundary_all_fscore'
            * 'boundary_all_precision'
            * 'boundary_all_recall'
            * 'boundary_noedge_fscore'
            * 'boundary_noedge_precision'
            * 'boundary_noedge_recall'

        Raises
        ------
        ValueError
            If `predictions` and `references` have different sizes or differ in tokens
        """
        text_words, text_stringpos, text_lex = self._process_data(predictions)
        gold_words, gold_stringpos, gold_lex = self._process_data(references)

        if len(gold_words) != len(text_words):
            raise ValueError(
                "gold and train have different size: len(gold)={}, len(train)={}".format(
                    len(gold_words), len(text_words)
                )
            )

        for i, (g, t) in enumerate(zip(gold_words, text_words)):
            if g != t:
                raise ValueError(
                    'gold and train differ at line {}: gold="{}", train="{}"'.format(
                        i + 1, g, t
                    )
                )

        # token evaluation
        token_eval = TokenEvaluation()
        token_eval.update_lists(text_stringpos, gold_stringpos)

        # type evaluation
        type_eval = TypeEvaluation()
        type_eval.update_lists(text_lex, gold_lex)

        # boundary evaluation (with edges)
        boundary_eval = BoundaryEvaluation()
        boundary_eval.update_lists(text_stringpos, gold_stringpos)

        # boundary evaluation (no edges)
        boundary_noedge_eval = BoundaryNoEdgeEvaluation()
        boundary_noedge_eval.update_lists(text_stringpos, gold_stringpos)

        return {
            "token_precision": token_eval.precision(),
            "token_recall": token_eval.recall(),
            "token_fscore": token_eval.fscore(),
            "type_precision": type_eval.precision(),
            "type_recall": type_eval.recall(),
            "type_fscore": type_eval.fscore(),
            "boundary_all_precision": boundary_eval.precision(),
            "boundary_all_recall": boundary_eval.recall(),
            "boundary_all_fscore": boundary_eval.fscore(),
            "boundary_noedge_precision": boundary_noedge_eval.precision(),
            "boundary_noedge_recall": boundary_noedge_eval.recall(),
            "boundary_noedge_fscore": boundary_noedge_eval.fscore(),
        }