# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Segmentation scores evaluation metrics"""
import evaluate
import datasets
# TODO: Add BibTeX citation
_CITATION = """\
@InProceedings{huggingface:module,
title = {A great new module},
authors={huggingface, Inc.},
year={2020}
}
"""
_DESCRIPTION = """\
This module computes segmentation scores (boundary, token and type precision, recall
and f-score) for a list of predicted segmentations against gold segmentations.
"""
_KWARGS_DESCRIPTION = """
Calculates how good predicted segmentations are, using boundary, token and type scores.
Args:
    predictions: list of segmented utterances to score. Each prediction
        should be a string with phonemes separated by spaces and estimated word boundaries
        denoted by the token ';eword'.
    references: list of segmented utterances to score. Each reference
        should be a string with phonemes separated by spaces and gold word boundaries
        denoted by the token ';eword'.
Returns:
type_fscore: lexicon f1 score
type_precision: lexicon precision
type_recall: lexicon recall
token_fscore: token f1 score
token_precision: token precision
token_recall: token recall
boundary_all_fscore: boundary f1 score, including utterance boundaries
boundary_all_precision: boundary precision, including utterance boundaries
boundary_all_recall: boundary recall, including utterance boundaries
boundary_noedge_fscore: boundary f1 score, excluding utterance boundaries
boundary_noedge_precision: boundary precision, excluding utterance boundaries
boundary_noedge_recall: boundary recall, excluding utterance boundaries
Examples:
>>> segmentation_scores = evaluate.load("transformersegmentation/segmentation_scores")
>>> results = segmentation_scores.compute(references=["w ɛ ɹ ;eword ɪ z ;eword ð ɪ s ;eword", "l ɪ ɾ əl ;eword aɪ z ;eword"], predictions=["w ɛ ɹ ;eword ɪ z ;eword ð ɪ s ;eword", "l ɪ ɾ əl ;eword aɪ z ;eword"])
>>> print(results)
{'type_fscore': 1.0, 'type_precision': 1.0, 'type_recall': 1.0, 'token_fscore': 1.0, 'token_precision': 1.0, 'token_recall': 1.0, 'boundary_all_fscore': 1.0, 'boundary_all_precision': 1.0, 'boundary_all_recall': 1.0, 'boundary_noedge_fscore': 1.0, 'boundary_noedge_precision': 1.0, 'boundary_noedge_recall': 1.0}
"""
class TokenEvaluation(object):
"""Evaluation of token f-score, precision and recall"""
def __init__(self):
self.test = 0
self.gold = 0
self.correct = 0
self.n = 0
self.n_exactmatch = 0
def precision(self):
return float(self.correct) / self.test if self.test != 0 else None
def recall(self):
return float(self.correct) / self.gold if self.gold != 0 else None
def fscore(self):
total = self.test + self.gold
return float(2 * self.correct) / total if total != 0 else None
def exact_match(self):
return float(self.n_exactmatch) / self.n if self.n else None
def update(self, test_set, gold_set):
self.n += 1
if test_set == gold_set:
self.n_exactmatch += 1
# omit empty items for type scoring (should not affect token
# scoring). Type lists are prepared with '_' where there is no
# match, to keep list lengths the same
self.test += len([x for x in test_set if x != "_"])
self.gold += len([x for x in gold_set if x != "_"])
self.correct += len(test_set & gold_set)
def update_lists(self, test_sets, gold_sets):
if len(test_sets) != len(gold_sets):
raise ValueError(
"#words different in test and gold: {} != {}".format(
len(test_sets), len(gold_sets)
)
)
for t, g in zip(test_sets, gold_sets):
self.update(t, g)
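# Worked example (illustrative, not from the original source):
#   >>> ev = TokenEvaluation()
#   >>> ev.update_lists([{(0, 3), (3, 5)}], [{(0, 5)}])
#   >>> (ev.precision(), ev.recall(), ev.fscore())
#   (0.0, 0.0, 0.0)
# A token only counts as correct when its (start, stop) span exactly matches
# a gold span, so splitting one gold word in two earns no credit.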
class TypeEvaluation(TokenEvaluation):
"""Evaluation of type f-score, precision and recall"""
@staticmethod
def lexicon_check(textlex, goldlex):
"""Compare hypothesis and gold lexicons"""
textlist = []
goldlist = []
for w in textlex:
if w in goldlex:
# set up matching lists for the true positives
textlist.append(w)
goldlist.append(w)
else:
# false positives
textlist.append(w)
# ensure matching null element in text list
goldlist.append("_")
for w in goldlex:
if w not in goldlist:
# now for the false negatives
goldlist.append(w)
# ensure matching null element in text list
textlist.append("_")
textset = [{w} for w in textlist]
goldset = [{w} for w in goldlist]
return textset, goldset
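    # Worked example (illustrative, not from the original source):
    #   >>> TypeEvaluation.lexicon_check(["dog", "cat"], ["dog", "bird"])
    #   ([{'dog'}, {'cat'}, {'_'}], [{'dog'}, {'_'}, {'bird'}])
    # The true positive "dog" is paired with itself; the false positive "cat"
    # and the false negative "bird" are each padded with the null element "_"
    # so both lists stay the same length.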
def update_lists(self, text, gold):
lt, lg = self.lexicon_check(text, gold)
        super().update_lists(lt, lg)
class BoundaryEvaluation(TokenEvaluation):
@staticmethod
def get_boundary_positions(stringpos):
return [{idx for pair in line for idx in pair} for line in stringpos]
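    # Worked example (illustrative, not from the original source):
    #   >>> BoundaryEvaluation.get_boundary_positions([{(0, 3), (3, 5)}])
    #   [{0, 3, 5}]
    # Every start and stop index is kept, including the utterance edges 0 and 5.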
def update_lists(self, text, gold):
lt = self.get_boundary_positions(text)
lg = self.get_boundary_positions(gold)
        super().update_lists(lt, lg)
class BoundaryNoEdgeEvaluation(BoundaryEvaluation):
@staticmethod
def get_boundary_positions(stringpos):
return [{left for left, _ in line if left > 0} for line in stringpos]
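# Worked example (illustrative, not from the original source):
#   >>> BoundaryNoEdgeEvaluation.get_boundary_positions([{(0, 3), (3, 5)}])
#   [{3}]
# Only word-initial boundaries strictly inside the utterance are kept: the
# utterance-initial boundary at 0 is dropped, and the final stop index never
# appears as a left edge.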
class _StringPos(object):
"""Compute start and stop index of words in an utterance"""
def __init__(self):
self.idx = 0
def __call__(self, n):
"""Return the position of the current word given its length `n`"""
start = self.idx
self.idx += n
return start, self.idx
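# Worked example (illustrative, not from the original source):
#   >>> idx = _StringPos()
#   >>> [idx(len(w)) for w in ["wɛɹ", "ɪz"]]
#   [(0, 3), (3, 5)]
# Successive calls accumulate word lengths into (start, stop) phone indices.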
@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class segmentation_scores(evaluate.Metric):
"""TODO: Short description of my evaluation module."""
def _info(self):
return evaluate.MetricInfo(
# This is the description that will appear on the modules page.
module_type="metric",
description=_DESCRIPTION,
citation=_CITATION,
inputs_description=_KWARGS_DESCRIPTION,
# This defines the format of each prediction and reference
features=datasets.Features({
'predictions': datasets.Value('string'),
'references': datasets.Value('string'),
}),
# Homepage of the module for documentation
homepage="https://huggingface.co/spaces/transformersegmentation/segmentation_scores",
            # Additional links to the codebase or references
            codebase_urls=["http://github.com/codebyzeb/transformersegmentation"],
        )
    def _download_and_prepare(self, dl_manager):
        """Optional hook to download external resources; nothing is needed for these scores."""
        pass
def _process_data(self, text):
""" Load text data for evaluation
Parameters
----------
text : list of str
The list of utterances to read for the evaluation.
Returns
-------
(words, positions, lexicon) : three lists
where `words` are the input utterances with word separators
removed, `positions` stores the start/stop index of each word
for each utterance, and `lexicon` is the list of words.
"""
words = []
positions = []
lexicon = {}
# ignore empty lines
for utt in (utt for utt in text if utt.strip()):
            # list of phones in the utterance with word separator removed
phone_in_utterance = [
phone for phone in utt.split(" ") if phone != ";eword"
]
words_in_utterance = (
"".join(
" " if phone == ";eword" else phone for phone in utt.split(" ")
)
.strip()
.split(" ")
)
words.append(phone_in_utterance)
for word in words_in_utterance:
lexicon[word] = 1
idx = _StringPos()
positions.append({idx(len(word)) for word in words_in_utterance})
        # return the word lexicon as a sorted list
        lexicon = sorted(lexicon.keys())
return words, positions, lexicon
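    # Worked example (illustrative, not from the original source): given
    # ["w ɛ ɹ ;eword ɪ z ;eword"], _process_data returns
    # words=[["w", "ɛ", "ɹ", "ɪ", "z"]], positions=[{(0, 3), (3, 5)}] and
    # lexicon=["wɛɹ", "ɪz"] (sorted).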
def _compute(self, predictions, references):
"""Scores a segmented text against its gold version
Parameters
----------
        predictions : sequence of str
            A sequence of segmented utterances, each string using ';eword' as the word separator.
        references : sequence of str
            A sequence of segmented utterances, each string using ';eword' as the word separator.
Returns
-------
scores : dict
A dictionary with the following entries:
* 'type_fscore'
* 'type_precision'
* 'type_recall'
* 'token_fscore'
* 'token_precision'
* 'token_recall'
* 'boundary_all_fscore'
* 'boundary_all_precision'
* 'boundary_all_recall'
* 'boundary_noedge_fscore'
* 'boundary_noedge_precision'
* 'boundary_noedge_recall'
Raises
------
ValueError
            If `predictions` and `references` have different sizes or differ in phones.
"""
text_words, text_stringpos, text_lex = self._process_data(predictions)
gold_words, gold_stringpos, gold_lex = self._process_data(references)
        if len(gold_words) != len(text_words):
            raise ValueError(
                "predictions and references have different sizes: len(references)={}, len(predictions)={}".format(
                    len(gold_words), len(text_words)
                )
            )
        for i, (g, t) in enumerate(zip(gold_words, text_words)):
            if g != t:
                raise ValueError(
                    'predictions and references differ at line {}: reference="{}", prediction="{}"'.format(
                        i + 1, g, t
                    )
                )
# token evaluation
token_eval = TokenEvaluation()
token_eval.update_lists(text_stringpos, gold_stringpos)
# type evaluation
type_eval = TypeEvaluation()
type_eval.update_lists(text_lex, gold_lex)
# boundary evaluation (with edges)
boundary_eval = BoundaryEvaluation()
boundary_eval.update_lists(text_stringpos, gold_stringpos)
# boundary evaluation (no edges)
boundary_noedge_eval = BoundaryNoEdgeEvaluation()
boundary_noedge_eval.update_lists(text_stringpos, gold_stringpos)
return {
"token_precision": token_eval.precision(),
"token_recall": token_eval.recall(),
"token_fscore": token_eval.fscore(),
"type_precision": type_eval.precision(),
"type_recall": type_eval.recall(),
"type_fscore": type_eval.fscore(),
"boundary_all_precision": boundary_eval.precision(),
"boundary_all_recall": boundary_eval.recall(),
"boundary_all_fscore": boundary_eval.fscore(),
"boundary_noedge_precision": boundary_noedge_eval.precision(),
"boundary_noedge_recall": boundary_noedge_eval.recall(),
"boundary_noedge_fscore": boundary_noedge_eval.fscore(),
}
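
# Minimal local smoke test (an illustrative sketch, not part of the original
# module): calling _compute directly bypasses the Arrow-backed compute() path
# that evaluate.load() would normally provide.
if __name__ == "__main__":
    metric = segmentation_scores()
    print(
        metric._compute(
            predictions=["w ɛ ɹ ;eword ɪ z ;eword ð ɪ s ;eword"],
            references=["w ɛ ɹ ɪ z ;eword ð ɪ s ;eword"],
        )
    )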