# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Segmentation scores evaluation metrics""" | |
import evaluate | |
import datasets | |

# TODO: Add BibTeX citation
_CITATION = """\
@InProceedings{huggingface:module,
title = {A great new module},
authors={huggingface, Inc.},
year={2020}
}
"""

# TODO: Add description of the module here
_DESCRIPTION = """\
This module computes segmentation scores for a list of predicted segmentations and gold segmentations.
"""

# TODO: Add description of the arguments of the module here
_KWARGS_DESCRIPTION = """
Calculates how good predicted segmentations are, using boundary, token and type scores.
Args:
    predictions: list of segmented utterances to score. Each prediction
        should be a string with phonemes separated by spaces and estimated word boundaries
        denoted by the token ';eword'.
    references: list of segmented utterances to score. Each reference
        should be a string with phonemes separated by spaces and gold word boundaries
        denoted by the token ';eword'.
Returns:
    type_fscore: lexicon f1 score
    type_precision: lexicon precision
    type_recall: lexicon recall
    token_fscore: token f1 score
    token_precision: token precision
    token_recall: token recall
    boundary_all_fscore: boundary f1 score, including utterance boundaries
    boundary_all_precision: boundary precision, including utterance boundaries
    boundary_all_recall: boundary recall, including utterance boundaries
    boundary_noedge_fscore: boundary f1 score, excluding utterance boundaries
    boundary_noedge_precision: boundary precision, excluding utterance boundaries
    boundary_noedge_recall: boundary recall, excluding utterance boundaries
Examples:
    >>> segmentation_scores = evaluate.load("transformersegmentation/segmentation_scores")
    >>> results = segmentation_scores.compute(references=["w ɛ ɹ ;eword ɪ z ;eword ð ɪ s ;eword", "l ɪ ɾ əl ;eword aɪ z ;eword"], predictions=["w ɛ ɹ ;eword ɪ z ;eword ð ɪ s ;eword", "l ɪ ɾ əl ;eword aɪ z ;eword"])
    >>> print(results)
    {'type_fscore': 1.0, 'type_precision': 1.0, 'type_recall': 1.0, 'token_fscore': 1.0, 'token_precision': 1.0, 'token_recall': 1.0, 'boundary_all_fscore': 1.0, 'boundary_all_precision': 1.0, 'boundary_all_recall': 1.0, 'boundary_noedge_fscore': 1.0, 'boundary_noedge_precision': 1.0, 'boundary_noedge_recall': 1.0}
"""


class TokenEvaluation(object):
    """Evaluation of token f-score, precision and recall"""

    def __init__(self):
        self.test = 0
        self.gold = 0
        self.correct = 0
        self.n = 0
        self.n_exactmatch = 0

    def precision(self):
        return float(self.correct) / self.test if self.test != 0 else None

    def recall(self):
        return float(self.correct) / self.gold if self.gold != 0 else None

    def fscore(self):
        total = self.test + self.gold
        return float(2 * self.correct) / total if total != 0 else None

    def exact_match(self):
        return float(self.n_exactmatch) / self.n if self.n else None

    def update(self, test_set, gold_set):
        self.n += 1

        if test_set == gold_set:
            self.n_exactmatch += 1

        # omit empty items for type scoring (should not affect token
        # scoring). Type lists are prepared with '_' where there is no
        # match, to keep list lengths the same
        self.test += len([x for x in test_set if x != "_"])
        self.gold += len([x for x in gold_set if x != "_"])
        self.correct += len(test_set & gold_set)

    def update_lists(self, test_sets, gold_sets):
        if len(test_sets) != len(gold_sets):
            raise ValueError(
                "#words different in test and gold: {} != {}".format(
                    len(test_sets), len(gold_sets)
                )
            )

        for t, g in zip(test_sets, gold_sets):
            self.update(t, g)
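

# Illustrative sketch (not part of the original script): token scoring
# compares per-utterance sets of (start, stop) word spans. For the gold
# segmentation "ab c" against the predicted segmentation "a bc" (same
# phones "a b c"):
#
#   gold spans: {(0, 2), (2, 3)}
#   test spans: {(0, 1), (1, 3)}
#
# No span appears in both sets, so correct = len(test & gold) = 0 and token
# precision, recall and f-score are all 0.0, even though the phones match.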


class TypeEvaluation(TokenEvaluation):
    """Evaluation of type f-score, precision and recall"""

    @staticmethod
    def lexicon_check(textlex, goldlex):
        """Compare hypothesis and gold lexicons"""
        textlist = []
        goldlist = []
        for w in textlex:
            if w in goldlex:
                # set up matching lists for the true positives
                textlist.append(w)
                goldlist.append(w)
            else:
                # false positives
                textlist.append(w)
                # ensure matching null element in text list
                goldlist.append("_")

        for w in goldlex:
            if w not in goldlist:
                # now for the false negatives
                goldlist.append(w)
                # ensure matching null element in text list
                textlist.append("_")

        textset = [{w} for w in textlist]
        goldset = [{w} for w in goldlist]
        return textset, goldset

    def update_lists(self, text, gold):
        lt, lg = self.lexicon_check(text, gold)
        super(TypeEvaluation, self).update_lists(lt, lg)
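

# Illustrative sketch (not part of the original script): lexicon_check pads
# the two lexicons with '_' so they align one-to-one. For example,
# lexicon_check(["a", "b"], ["a", "c"]) returns
#
#   textset = [{"a"}, {"b"}, {"_"}]
#   goldset = [{"a"}, {"_"}, {"c"}]
#
# and the inherited update_lists() then counts one true positive ("a") out
# of two test types and two gold types, giving type precision = recall = 0.5.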


class BoundaryEvaluation(TokenEvaluation):
    @staticmethod
    def get_boundary_positions(stringpos):
        return [{idx for pair in line for idx in pair} for line in stringpos]

    def update_lists(self, text, gold):
        lt = self.get_boundary_positions(text)
        lg = self.get_boundary_positions(gold)
        super(BoundaryEvaluation, self).update_lists(lt, lg)


class BoundaryNoEdgeEvaluation(BoundaryEvaluation):
    @staticmethod
    def get_boundary_positions(stringpos):
        return [{left for left, _ in line if left > 0} for line in stringpos]
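

# Illustrative sketch (not part of the original script): for the word spans
# {(0, 2), (2, 3)}, BoundaryEvaluation collects every span endpoint, giving
# {0, 2, 3}, so utterance edges are always counted as correct boundaries.
# BoundaryNoEdgeEvaluation keeps only span starts greater than zero, giving
# {2}: the word-internal boundaries that the segmenter actually has to find.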


class _StringPos(object):
    """Compute start and stop index of words in an utterance"""

    def __init__(self):
        self.idx = 0

    def __call__(self, n):
        """Return the position of the current word given its length `n`"""
        start = self.idx
        self.idx += n
        return start, self.idx
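

# Illustrative sketch (not part of the original script): each call to a
# _StringPos instance advances a cursor by the word length it is given, so
# successive calls yield contiguous (start, stop) spans:
#
#   idx = _StringPos()
#   idx(2)  # -> (0, 2)
#   idx(3)  # -> (2, 5)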


class segmentation_scores(evaluate.Metric):
    """TODO: Short description of my evaluation module."""

    def _info(self):
        # TODO: Specifies the evaluate.EvaluationModuleInfo object
        return evaluate.MetricInfo(
            # This is the description that will appear on the modules page.
            module_type="metric",
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            # This defines the format of each prediction and reference
            features=datasets.Features({
                'predictions': datasets.Value('string'),
                'references': datasets.Value('string'),
            }),
            # Homepage of the module for documentation
            homepage="https://huggingface.co/spaces/transformersegmentation/segmentation_scores",
            # Additional links to the codebase or references
            codebase_urls=["http://github.com/codebyzeb/transformersegmentation"],
            reference_urls=["http://path.to.reference.url/new_module"]
        )

    def _download_and_prepare(self, dl_manager):
        """Optional: download external resources useful to compute the scores"""
        # TODO: Download external resources if needed
        pass

    def _process_data(self, text):
        """Load text data for evaluation

        Parameters
        ----------
        text : list of str
            The list of utterances to read for the evaluation.

        Returns
        -------
        (words, positions, lexicon) : three lists
            where `words` are the input utterances with word separators
            removed, `positions` stores the start/stop index of each word
            for each utterance, and `lexicon` is the list of words.
        """
        words = []
        positions = []
        lexicon = {}

        # ignore empty lines
        for utt in (utt for utt in text if utt.strip()):
            # list of phones in the utterance with word separators removed
            phone_in_utterance = [
                phone for phone in utt.split(" ") if phone != ";eword"
            ]
            words_in_utterance = (
                "".join(
                    " " if phone == ";eword" else phone for phone in utt.split(" ")
                )
                .strip()
                .split(" ")
            )

            words.append(phone_in_utterance)
            for word in words_in_utterance:
                lexicon[word] = 1
            idx = _StringPos()
            positions.append({idx(len(word)) for word in words_in_utterance})

        # return the word lexicon as a sorted list
        lexicon = sorted([k for k in lexicon.keys()])
        return words, positions, lexicon
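
    # Illustrative sketch (not part of the original script): for the input
    # ["a b ;eword c ;eword"], _process_data returns
    #
    #   words     = [["a", "b", "c"]]     # phones, separators removed
    #   positions = [{(0, 2), (2, 3)}]    # start/stop span of each word
    #   lexicon   = ["ab", "c"]           # sorted list of distinct words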

    def _compute(self, predictions, references):
        """Scores a segmented text against its gold version

        Parameters
        ----------
        predictions : sequence of str
            A suite of word utterances, each string using ';eword' as word separator.
        references : sequence of str
            A suite of word utterances, each string using ';eword' as word separator.

        Returns
        -------
        scores : dict
            A dictionary with the following entries:

            * 'type_fscore'
            * 'type_precision'
            * 'type_recall'
            * 'token_fscore'
            * 'token_precision'
            * 'token_recall'
            * 'boundary_all_fscore'
            * 'boundary_all_precision'
            * 'boundary_all_recall'
            * 'boundary_noedge_fscore'
            * 'boundary_noedge_precision'
            * 'boundary_noedge_recall'

        Raises
        ------
        ValueError
            If `predictions` and `references` have different sizes or differ in tokens
        """
        text_words, text_stringpos, text_lex = self._process_data(predictions)
        gold_words, gold_stringpos, gold_lex = self._process_data(references)

        if len(gold_words) != len(text_words):
            raise ValueError(
                "gold and train have different size: len(gold)={}, len(train)={}".format(
                    len(gold_words), len(text_words)
                )
            )

        for i, (g, t) in enumerate(zip(gold_words, text_words)):
            if g != t:
                raise ValueError(
                    'gold and train differ at line {}: gold="{}", train="{}"'.format(
                        i + 1, g, t
                    )
                )

        # token evaluation
        token_eval = TokenEvaluation()
        token_eval.update_lists(text_stringpos, gold_stringpos)

        # type evaluation
        type_eval = TypeEvaluation()
        type_eval.update_lists(text_lex, gold_lex)

        # boundary evaluation (with edges)
        boundary_eval = BoundaryEvaluation()
        boundary_eval.update_lists(text_stringpos, gold_stringpos)

        # boundary evaluation (no edges)
        boundary_noedge_eval = BoundaryNoEdgeEvaluation()
        boundary_noedge_eval.update_lists(text_stringpos, gold_stringpos)

        return {
            "token_precision": token_eval.precision(),
            "token_recall": token_eval.recall(),
            "token_fscore": token_eval.fscore(),
            "type_precision": type_eval.precision(),
            "type_recall": type_eval.recall(),
            "type_fscore": type_eval.fscore(),
            "boundary_all_precision": boundary_eval.precision(),
            "boundary_all_recall": boundary_eval.recall(),
            "boundary_all_fscore": boundary_eval.fscore(),
            "boundary_noedge_precision": boundary_noedge_eval.precision(),
            "boundary_noedge_recall": boundary_noedge_eval.recall(),
            "boundary_noedge_fscore": boundary_noedge_eval.fscore(),
        }