# -*- coding:utf-8 -*- """Collection of statistics functions. """ import numpy as np def percentage_agreement_pov(total_pov: int, total_annotations: int) -> float: """Computes a percentage :param total_pov: total agree/disagree annotations :type total_pov: int :param total_annotations: total annotations in project :type total_annotations: int :rtype: float :return: agreement percentage """ return round((total_pov / total_annotations) * 100, 2) def fleiss_kappa_function(matrix: list) -> float: """Computes Fleiss' kappa for group of annotators. :param matrix: a matrix of shape (:attr:'N', :attr:'k') with 'N' = number of subjects and 'k' = the number of categories. 'M[i, j]' represent the number of raters who assigned the 'i'th subject to the 'j'th category. :type matrix: numpy matrix :rtype: float :return: Fleiss' kappa score """ N, _ = matrix.shape # N is # of items, k is # of categories n_annotators = float(np.sum(matrix[0, :])) # # of annotators tot_annotations = N * n_annotators # the total # of annotations category_sum = np.sum(matrix, axis=0) # the sum of each category over all items # chance agreement p = category_sum / tot_annotations # the distribution of each category over all annotations PbarE = np.sum(p * p) # average chance agreement over all categories # observed agreement P = (np.sum(matrix * matrix, axis=1) - n_annotators) / (n_annotators * (n_annotators - 1)) Pbar = np.sum(P) / N # add all observed agreement # chances per item and divide by amount of items return round((Pbar - PbarE) / (1 - PbarE), 4) def cohen_kappa_function(ann1: list, ann2: list) -> float: """Computes Cohen kappa for pair-wise annotators. :param ann1: annotations provided by first annotator :type ann1: list :param ann2: annotations provided by second annotator :type ann2: list :rtype: float :return: Cohen kappa statistic """ count = 0 for an1, an2 in zip(ann1, ann2): if an1 == an2: count += 1 A = count / len(ann1) # observed agreement A (Po) uniq = set(ann1 + ann2) E = 0 # expected agreement E (Pe) for item in uniq: cnt1 = ann1.count(item) cnt2 = ann2.count(item) count = (cnt1 / len(ann1)) * (cnt2 / len(ann2)) E += count return round((A - E) / (1 - E), 4)