Spaces:
Running
Running
""" | |
This script applies to the IOB2 or IOBES tagging scheme.
If you are using a different scheme, please convert to IOB2 or IOBES first.

IOB2:
- B = begin,
- I = inside but not the first,
- O = outside

e.g.
John   lives in New   York  City  .
B-PER  O     O  B-LOC I-LOC I-LOC O

IOBES:
- B = begin,
- E = end,
- S = singleton,
- I = inside but not the first or the last,
- O = outside

e.g.
John   lives in New   York  City  .
S-PER  O     O  B-LOC I-LOC E-LOC O

prefix: one of I, O, B, E, S
chunk_type: PER, LOC, etc.
""" | |
from __future__ import division, print_function, unicode_literals | |
import sys | |
from collections import defaultdict | |
def split_tag(chunk_tag):
    """
    Split a chunk tag into its IOBES prefix and chunk type.

    e.g.
    B-PER -> ('B', 'PER')
    O -> ('O', None)

    Only the first '-' separates the prefix from the type, so a chunk type
    may itself contain hyphens (I-LOC-x -> ('I', 'LOC-x')).

    return: a 2-tuple (prefix, chunk_type)
    raise: ValueError if the tag is not 'O' and contains no '-' separator
    """
    if chunk_tag == 'O':
        return ('O', None)
    # partition() always yields three parts, so the result can be validated
    # here instead of failing later with an opaque unpacking error.
    prefix, sep, chunk_type = chunk_tag.partition('-')
    if not sep:
        raise ValueError("conlleval: malformed chunk tag %r" % (chunk_tag,))
    return (prefix, chunk_type)
def is_chunk_end(prev_tag, tag):
    """
    Tell whether the chunk containing the previous word is closed
    before the current word.

    e.g.
    (B-PER, I-PER) -> False
    (B-LOC, O) -> True

    Note: contradicting tags such as (B-PER, I-LOC) are handled as if
    they were (B-PER, B-LOC), i.e. the previous chunk ends.
    """
    prev_prefix, prev_type = split_tag(prev_tag)
    cur_prefix, cur_type = split_tag(tag)

    # Nothing was open, so nothing can end.
    if prev_prefix == 'O':
        return False
    # An open chunk ends when we step outside or the chunk type changes.
    if cur_prefix == 'O' or prev_type != cur_type:
        return True
    # Same type on both sides: only an explicit boundary marker ends it.
    return cur_prefix in ('B', 'S') or prev_prefix in ('E', 'S')
def is_chunk_start(prev_tag, tag):
    """
    Tell whether the current word opens a new chunk, given the tag of
    the previous word.
    """
    prev_prefix, prev_type = split_tag(prev_tag)
    cur_prefix, cur_type = split_tag(tag)

    # An outside word never starts a chunk.
    if cur_prefix == 'O':
        return False
    # Coming from outside, or switching chunk type, always starts one.
    if prev_prefix == 'O' or prev_type != cur_type:
        return True
    # Same type on both sides: only an explicit boundary marker starts one.
    return cur_prefix in ('B', 'S') or prev_prefix in ('E', 'S')
def calc_metrics(tp, p, t, percent=True):
    """
    Compute precision, recall and FB1 from raw counts.

    tp: number of correct (true positive) items
    p: number of predicted items (precision denominator)
    t: number of true items (recall denominator)
    percent: if True, return 100 * the original decimal values

    return: (precision, recall, fb1), always floats; any score whose
        denominator is zero defaults to 0.0
    """
    # Use float defaults so the return type does not depend on the inputs.
    precision = tp / p if p else 0.0
    recall = tp / t if t else 0.0
    total = precision + recall
    fb1 = 2 * precision * recall / total if total else 0.0
    if percent:
        return 100 * precision, 100 * recall, 100 * fb1
    return precision, recall, fb1
def count_chunks(true_seqs, pred_seqs):
    """
    Walk the gold and predicted tag sequences in lockstep and tally
    chunk-level and tag-level statistics.

    true_seqs: a list of true tags
    pred_seqs: a list of predicted tags

    return:
    correct_chunks: a dict (counter),
                    key = chunk types,
                    value = number of correctly identified chunks per type
    true_chunks:    a dict, number of true chunks per type
    pred_chunks:    a dict, number of identified chunks per type
    correct_counts, true_counts, pred_counts: similar to above, but for tags
    """
    correct_chunks = defaultdict(int)
    true_chunks = defaultdict(int)
    pred_chunks = defaultdict(int)

    correct_counts = defaultdict(int)
    true_counts = defaultdict(int)
    pred_counts = defaultdict(int)

    last_gold, last_guess = 'O', 'O'
    # Type of a chunk that started at the same word in both sequences and
    # has agreed so far; None when no such candidate is open.
    open_chunk = None

    for gold, guess in zip(true_seqs, pred_seqs):
        # Tag-level statistics.
        if gold == guess:
            correct_counts[gold] += 1
        true_counts[gold] += 1
        pred_counts[guess] += 1

        _gold_prefix, gold_type = split_tag(gold)
        _guess_prefix, guess_type = split_tag(guess)

        # Resolve the currently open candidate, if any.
        if open_chunk is not None:
            gold_ended = is_chunk_end(last_gold, gold)
            guess_ended = is_chunk_end(last_guess, guess)
            if gold_ended and guess_ended:
                # Both sequences close the chunk at the same place: correct.
                correct_chunks[open_chunk] += 1
            if gold_ended or guess_ended or gold_type != guess_type:
                # Either it was just counted, or the sequences diverged.
                open_chunk = None

        # Register chunks that begin at this word.
        gold_started = is_chunk_start(last_gold, gold)
        guess_started = is_chunk_start(last_guess, guess)
        if gold_started:
            true_chunks[gold_type] += 1
        if guess_started:
            pred_chunks[guess_type] += 1
        if gold_started and guess_started and gold_type == guess_type:
            open_chunk = gold_type

        last_gold, last_guess = gold, guess

    # A candidate still open at the end of the input counts as correct.
    if open_chunk is not None:
        correct_chunks[open_chunk] += 1

    return (correct_chunks, true_chunks, pred_chunks,
            correct_counts, true_counts, pred_counts)
def get_result(correct_chunks, true_chunks, pred_chunks,
               correct_counts, true_counts, pred_counts, verbose=True):
    """
    Compute overall chunk-level precision, recall and FB1 from per-type
    and per-tag counters, optionally printing a conlleval-style report.

    correct_chunks, true_chunks, pred_chunks: dicts mapping chunk type to
        the number of correct / true / predicted chunks
    correct_counts, true_counts, pred_counts: dicts mapping tag to the
        number of correct / true / predicted tokens
        (pred_counts is part of the interface but is not used here)
    verbose: if True, print overall performance, as well as performance
        per chunk type; otherwise, simply return the overall scores

    return: (precision, recall, FB1) over all chunk types, as percentages

    Token accuracies are printed as 0.00 when there are no tokens (or no
    non-O true tokens) to measure them on, rather than raising
    ZeroDivisionError.
    """
    # sum counts
    sum_correct_chunks = sum(correct_chunks.values())
    sum_true_chunks = sum(true_chunks.values())
    sum_pred_chunks = sum(pred_chunks.values())

    # compute overall precision, recall and FB1 (default values are 0.0)
    prec, rec, f1 = calc_metrics(sum_correct_chunks, sum_pred_chunks, sum_true_chunks)
    res = (prec, rec, f1)
    if not verbose:
        return res

    sum_correct_counts = sum(correct_counts.values())
    sum_true_counts = sum(true_counts.values())

    nonO_correct_counts = sum(v for k, v in correct_counts.items() if k != 'O')
    nonO_true_counts = sum(v for k, v in true_counts.items() if k != 'O')

    # Guard both accuracies: empty input or an all-'O' gold sequence would
    # otherwise divide by zero.
    nonO_accuracy = (100 * nonO_correct_counts / nonO_true_counts
                     if nonO_true_counts else 0.0)
    accuracy = (100 * sum_correct_counts / sum_true_counts
                if sum_true_counts else 0.0)

    chunk_types = sorted(set(true_chunks) | set(pred_chunks))

    # print overall performance, and performance per chunk type
    print("processed %i tokens with %i phrases; " % (sum_true_counts, sum_true_chunks), end='')
    print("found: %i phrases; correct: %i.\n" % (sum_pred_chunks, sum_correct_chunks), end='')

    print("accuracy: %6.2f%%; (non-O)" % nonO_accuracy)
    print("accuracy: %6.2f%%; " % accuracy, end='')
    print("precision: %6.2f%%; recall: %6.2f%%; FB1: %6.2f" % (prec, rec, f1))

    # for each chunk type, compute precision, recall and FB1 (default values are 0.0)
    for t in chunk_types:
        # .get() keeps plain dicts working and avoids inserting keys into
        # the caller's defaultdicts.
        n_pred = pred_chunks.get(t, 0)
        prec, rec, f1 = calc_metrics(correct_chunks.get(t, 0), n_pred,
                                     true_chunks.get(t, 0))
        print("%17s: " % t, end='')
        print("precision: %6.2f%%; recall: %6.2f%%; FB1: %6.2f" %
              (prec, rec, f1), end='')
        print(" %d" % n_pred)

    return res
    # you can generate LaTeX output for tables like in
    # http://cnts.uia.ac.be/conll2003/ner/example.tex
    # but I'm not implementing this
def evaluate(true_seqs, pred_seqs, verbose=True):
    """
    Score predicted tags against true tags.

    Feeds the six counters from count_chunks straight into get_result and
    returns whatever get_result returns; verbose is passed through.
    """
    counters = count_chunks(true_seqs, pred_seqs)
    return get_result(*counters, verbose=verbose)
def evaluate_conll_file(fileIterator):
    """
    Evaluate CoNLL-style lines, where the last two whitespace-separated
    columns of each non-empty line are the true tag and the predicted tag.

    fileIterator: an iterable of text lines (e.g. an open file)

    Empty lines are recorded as an 'O' / 'O' tag pair. A non-empty line
    with fewer than 3 columns raises IOError.
    """
    gold_tags = []
    guess_tags = []
    for raw_line in fileIterator:
        fields = raw_line.strip().split()
        # each non-empty line must contain >= 3 columns
        if fields and len(fields) < 3:
            raise IOError("conlleval: too few columns in line %s\n" % raw_line)
        # extract tags from last 2 columns; a blank line becomes O / O
        gold, guess = (fields[-2], fields[-1]) if fields else ('O', 'O')
        gold_tags.append(gold)
        guess_tags.append(guess)
    return evaluate(gold_tags, guess_tags)
if __name__ == '__main__':
    # usage: conlleval < file
    # Reads the tagged file from standard input and prints the report.
    evaluate_conll_file(sys.stdin)