# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Segmentation scores evaluation metrics"""

import evaluate
import datasets


# TODO: Add BibTeX citation
_CITATION = """\
@InProceedings{huggingface:module,
title = {A great new module},
authors={huggingface, Inc.},
year={2020}
}
"""

# TODO: Add description of the module here
_DESCRIPTION = """\
This module computes segmentation scores for a list of predicted segmentations and gold segmentations.
"""


# TODO: Add description of the arguments of the module here
_KWARGS_DESCRIPTION = """
Calculates how good predicted segmentations are, using boundary, token and type scores.
Args:
    predictions: list of segmented utterances to score. Each prediction
        should be a string with phonemes separated by spaces and estimated word boundaries
        denoted by the token 'WORD_BOUNDARY'.
    references: list of segmented utterances to score. Each reference
        should be a string with phonemes separated by spaces and gold word boundaries
        denoted by the token 'WORD_BOUNDARY'.
Returns:
    type_fscore: lexicon f1 score
    type_precision: lexicon precision
    type_recall: lexicon recall
    token_fscore: token f1 score
    token_precision: token precision
    token_recall: token recall
    boundary_all_fscore: boundary f1 score, including utterance boundaries
    boundary_all_precision: boundary precision, including utterance boundaries
    boundary_all_recall: boundary recall, including utterance boundaries
    boundary_noedge_fscore: boundary f1 score, excluding utterance boundaries
    boundary_noedge_precision: boundary precision, excluding utterance boundaries
    boundary_noedge_recall: boundary recall, excluding utterance boundaries
Examples:
    >>> segmentation_scores = evaluate.load("transformersegmentation/segmentation_scores")
    >>> results = segmentation_scores.compute(references=["w ɛ ɹ WORD_BOUNDARY ɪ z WORD_BOUNDARY ð ɪ s WORD_BOUNDARY", "l ɪ ɾ əl WORD_BOUNDARY aɪ z WORD_BOUNDARY"], predictions=["w ɛ ɹ WORD_BOUNDARY ɪ z WORD_BOUNDARY ð ɪ s WORD_BOUNDARY", "l ɪ ɾ əl WORD_BOUNDARY aɪ z WORD_BOUNDARY"])
    >>> print(results)
    {'type_fscore': 1.0, 'type_precision': 1.0, 'type_recall': 1.0, 'token_fscore': 1.0, 'token_precision': 1.0, 'token_recall': 1.0, 'boundary_all_fscore': 1.0, 'boundary_all_precision': 1.0, 'boundary_all_recall': 1.0, 'boundary_noedge_fscore': 1.0, 'boundary_noedge_precision': 1.0, 'boundary_noedge_recall': 1.0}
"""

class TokenEvaluation(object):
    """Evaluation of token f-score, precision and recall"""

    def __init__(self):
        self.test = 0
        self.gold = 0
        self.correct = 0
        self.n = 0
        self.n_exactmatch = 0

    def precision(self):
        return float(self.correct) / self.test if self.test != 0 else None

    def recall(self):
        return float(self.correct) / self.gold if self.gold != 0 else None

    def fscore(self):
        total = self.test + self.gold
        return float(2 * self.correct) / total if total != 0 else None

    def exact_match(self):
        return float(self.n_exactmatch) / self.n if self.n else None

    def update(self, test_set, gold_set):
        self.n += 1

        if test_set == gold_set:
            self.n_exactmatch += 1

        # omit empty items for type scoring (should not affect token
        # scoring). Type lists are prepared with '_' where there is no
        # match, to keep list lengths the same
        self.test += len([x for x in test_set if x != "_"])
        self.gold += len([x for x in gold_set if x != "_"])
        self.correct += len(test_set & gold_set)
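        # Worked example (illustrative): scoring the utterance "the dog" segmented
        # as one word "thedog" corresponds to update({(0, 6)}, {(0, 3), (3, 6)}):
        # test=1, gold=2, correct=0, so token precision and recall are both 0 here.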

    def update_lists(self, test_sets, gold_sets):
        if len(test_sets) != len(gold_sets):
            raise ValueError(
                "#words different in test and gold: {} != {}".format(
                    len(test_sets), len(gold_sets)
                )
            )

        for t, g in zip(test_sets, gold_sets):
            self.update(t, g)


class TypeEvaluation(TokenEvaluation):
    """Evaluation of type f-score, precision and recall"""

    @staticmethod
    def lexicon_check(textlex, goldlex):
        """Compare hypothesis and gold lexicons"""
        textlist = []
        goldlist = []
        for w in textlex:
            if w in goldlex:
                # set up matching lists for the true positives
                textlist.append(w)
                goldlist.append(w)
            else:
                # false positives
                textlist.append(w)
                # ensure matching null element in text list
                goldlist.append("_")

        for w in goldlex:
            if w not in goldlist:
                # now for the false negatives
                goldlist.append(w)
                # ensure matching null element in text list
                textlist.append("_")

        textset = [{w} for w in textlist]
        goldset = [{w} for w in goldlist]
        return textset, goldset
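        # Example (hypothetical lexicons): lexicon_check(["dog", "dox"], ["dog", "cat"])
        # aligns the lexicons as textlist=["dog", "dox", "_"] and
        # goldlist=["dog", "_", "cat"], giving type precision 1/2 and recall 1/2.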

    def update_lists(self, text, gold):
        lt, lg = self.lexicon_check(text, gold)
        super(TypeEvaluation, self).update_lists(lt, lg)


class BoundaryEvaluation(TokenEvaluation):
    @staticmethod
    def get_boundary_positions(stringpos):
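        # Flatten each utterance's (start, stop) word spans into one set of
        # boundary indices; this keeps the utterance edges (index 0 and the
        # final stop index) in the set.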
        return [{idx for pair in line for idx in pair} for line in stringpos]

    def update_lists(self, text, gold):
        lt = self.get_boundary_positions(text)
        lg = self.get_boundary_positions(gold)
        super(BoundaryEvaluation, self).update_lists(lt, lg)


class BoundaryNoEdgeEvaluation(BoundaryEvaluation):
    @staticmethod
    def get_boundary_positions(stringpos):
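        # Keep only word-start positions strictly inside the utterance (left > 0);
        # the utterance-final edge is never a start index, so both utterance
        # edges are excluded from scoring.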
        return [{left for left, _ in line if left > 0} for line in stringpos]


class _StringPos(object):
    """Compute start and stop index of words in an utterance"""

    def __init__(self):
        self.idx = 0

    def __call__(self, n):
        """Return the position of the current word given its length `n`"""
        start = self.idx
        self.idx += n
        return start, self.idx
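        # Example (illustrative): for a two-word utterance with word lengths 3 and 2,
        # successive calls idx(3), idx(2) return (0, 3) and (3, 5); _process_data
        # collects these pairs into one set of (start, stop) positions per utterance.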


@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class segmentation_scores(evaluate.Metric):
    """TODO: Short description of my evaluation module."""

    def _info(self):
        # TODO: Specifies the evaluate.EvaluationModuleInfo object
        return evaluate.MetricInfo(
            # This is the description that will appear on the modules page.
            module_type="metric",
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            # This defines the format of each prediction and reference
            features=datasets.Features({
                'predictions': datasets.Value('string'),
                'references': datasets.Value('string'),
            }),
            # Homepage of the module for documentation
            homepage="https://huggingface.co/spaces/transformersegmentation/segmentation_scores",
            # Additional links to the codebase or references
            codebase_urls=["http://github.com/codebyzeb/transformersegmentation"],
            reference_urls=["http://path.to.reference.url/new_module"]
        )

    def _download_and_prepare(self, dl_manager):
        """Optional: download external resources useful to compute the scores"""
        # TODO: Download external resources if needed
        pass

    def _process_data(self, text):
        """ Load text data for evaluation
        Parameters
        ----------
        text : list of str
            The list of utterances to read for the evaluation.

        Returns
        -------
        (words, positions, lexicon) : three lists
            where `words` are the input utterances with word separators
            removed, `positions` stores the start/stop index of each word
            for each utterance, and `lexicon` is the list of words.
        """
        words = []
        positions = []
        lexicon = {}

        # ignore empty lines
        for utt in (utt for utt in text if utt.strip()):
            # list of phones in the utterance with word separators removed
            phone_in_utterance = [
                phone for phone in utt.split(" ") if phone != "WORD_BOUNDARY"
            ]
            words_in_utterance = (
                "".join(
                    " " if phone == "WORD_BOUNDARY" else phone for phone in utt.split(" ")
                )
                .strip()
                .split(" ")
            )

            words.append(phone_in_utterance)
            for word in words_in_utterance:
                lexicon[word] = 1
            idx = _StringPos()
            positions.append({idx(len(word)) for word in words_in_utterance})

        # return the words lexicon as a sorted list
        lexicon = sorted([k for k in lexicon.keys()])
        return words, positions, lexicon
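        # Example (hypothetical input): _process_data(["ð ə WORD_BOUNDARY d ɔ g WORD_BOUNDARY"])
        # returns words=[["ð", "ə", "d", "ɔ", "g"]], positions=[{(0, 2), (2, 5)}]
        # and lexicon=["dɔg", "ðə"] (each word's phones joined, sorted).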
    
    def _compute(self, predictions, references):
        """Scores a segmented text against its gold version
        Parameters
        ----------
        predictions : sequence of str
            A suite of word utterances, each string using 'WORD_BOUNDARY' as a word separator.
        references : sequence of str
            A suite of word utterances, each string using 'WORD_BOUNDARY' as a word separator.

        Returns
        -------
        scores : dict
            A dictionary with the following entries:
            * 'type_fscore'
            * 'type_precision'
            * 'type_recall'
            * 'token_fscore'
            * 'token_precision'
            * 'token_recall'
            * 'boundary_all_fscore'
            * 'boundary_all_precision'
            * 'boundary_all_recall'
            * 'boundary_noedge_fscore'
            * 'boundary_noedge_precision'
            * 'boundary_noedge_recall'

        Raises
        ------
        ValueError
            If `predictions` and `references` have different sizes or differ in their phoneme sequences.
        """
        text_words, text_stringpos, text_lex = self._process_data(predictions)
        gold_words, gold_stringpos, gold_lex = self._process_data(references)
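        # Predictions may only differ from references in where the word boundaries
        # fall; the phone sequences themselves must match exactly (checked below).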

        if len(gold_words) != len(text_words):
            raise ValueError(
                "gold and train have different size: len(gold)={}, len(train)={}".format(
                    len(gold_words), len(text_words)
                )
            )

        for i, (g, t) in enumerate(zip(gold_words, text_words)):
            if g != t:
                raise ValueError(
                    'gold and train differ at line {}: gold="{}", train="{}"'.format(
                        i + 1, g, t
                    )
                )

        # token evaluation
        token_eval = TokenEvaluation()
        token_eval.update_lists(text_stringpos, gold_stringpos)

        # type evaluation
        type_eval = TypeEvaluation()
        type_eval.update_lists(text_lex, gold_lex)

        # boundary evaluation (with edges)
        boundary_eval = BoundaryEvaluation()
        boundary_eval.update_lists(text_stringpos, gold_stringpos)

        # boundary evaluation (no edges)
        boundary_noedge_eval = BoundaryNoEdgeEvaluation()
        boundary_noedge_eval.update_lists(text_stringpos, gold_stringpos)

        return {
            "token_precision": token_eval.precision(),
            "token_recall": token_eval.recall(),
            "token_fscore": token_eval.fscore(),
            "type_precision": type_eval.precision(),
            "type_recall": type_eval.recall(),
            "type_fscore": type_eval.fscore(),
            "boundary_all_precision": boundary_eval.precision(),
            "boundary_all_recall": boundary_eval.recall(),
            "boundary_all_fscore": boundary_eval.fscore(),
            "boundary_noedge_precision": boundary_noedge_eval.precision(),
            "boundary_noedge_recall": boundary_noedge_eval.recall(),
            "boundary_noedge_fscore": boundary_noedge_eval.fscore(),
        }
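
# Minimal local usage sketch (illustrative; assumes this script is saved as
# "segmentation_scores.py" in the current working directory):
#
#     import evaluate
#     scores = evaluate.load("segmentation_scores.py")
#     results = scores.compute(
#         predictions=["ð ə WORD_BOUNDARY d ɔ g WORD_BOUNDARY"],
#         references=["ð ə d ɔ g WORD_BOUNDARY"],
#     )
#     print(results["token_fscore"])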